diff --git a/.github/licenserc.yml b/.github/licenserc.yml index eaf7a49eee4..b122b2c9775 100644 --- a/.github/licenserc.yml +++ b/.github/licenserc.yml @@ -25,6 +25,7 @@ header: - '**/LICENSE.TXT' - '**/cipher-file-256' - '**/asan.suppression' + - '**/tsan.suppression' - '**/LICENSE.TXT' - '**/LICENSE' - '**/README'
diff --git a/.github/workflows/license-checker.yml b/.github/workflows/license-checker.yml index e156c1b2b4c..2b23cca7eaa 100644 --- a/.github/workflows/license-checker.yml +++ b/.github/workflows/license-checker.yml @@ -15,7 +15,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Check License Header - uses: apache/skywalking-eyes@main + uses: apache/skywalking-eyes@v0.3.0 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with:
diff --git a/.gitmodules b/.gitmodules index 8472d78404e..335e1dbd9c8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -82,3 +82,6 @@ [submodule "contrib/cpu_features"] path = contrib/cpu_features url = https://github.com/google/cpu_features +[submodule "contrib/arm-optimized-routines"] + path = contrib/arm-optimized-routines + url = https://github.com/ARM-software/optimized-routines
diff --git a/CMakeLists.txt b/CMakeLists.txt index f2ec9f3316b..2e33a127807 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -239,10 +239,8 @@ else () set (CMAKE_CXX_STANDARD_REQUIRED ON) endif () -if (NOT ARCH_ARM) - set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3") - set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3") -endif () +set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3") +set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3") option (DEBUG_WITHOUT_DEBUG_INFO "Set to ON to build dev target without debug info (remove flag `-g` in order to accelerate compiling speed and reduce target binary size)" OFF) if (DEBUG_WITHOUT_DEBUG_INFO)
diff --git a/README.md b/README.md index 02af727105b..ab996b6f3d6 100644 --- a/README.md +++ b/README.md @@ -242,7 +242,30 @@ LSAN_OPTIONS=suppressions=$WORKSPACE/tiflash/test/sanitize/asan.suppression ## Run Integration Tests -TBD. +1. Build your own tiflash binary in $BUILD with `-DCMAKE_BUILD_TYPE=DEBUG`. +``` +cd $BUILD +cmake $WORKSPACE/tiflash -GNinja -DCMAKE_BUILD_TYPE=DEBUG +ninja tiflash +``` +2. Run a TiDB cluster locally using tiup playground or other tools. +``` +tiup playground nightly --tiflash.binpath $BUILD/dbms/src/Server/tiflash +``` +3. Check $WORKSPACE/tests/_env.sh to make sure the port and the build directory are set correctly. +4. Run your integration tests using commands like "./run-test.sh fullstack-test2/ddl" under the $WORKSPACE/tests directory. + +## Run MicroBenchmark Tests + +To run microbenchmark tests, you need to build with -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_TESTS=ON: + +```shell +cd $BUILD +cmake $WORKSPACE/tiflash -GNinja -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_TESTS=ON +ninja bench_dbms +``` + +The microbenchmark executable is at `$BUILD/dbms/bench_dbms`; you can run it with `./bench_dbms` or `./bench_dbms --benchmark_filter=xxx`. For more usage, check `./bench_dbms --help`.
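For example, a filtered run might look like the sketch below; the benchmark name used here is hypothetical (use the list flag to see what is registered in your build), while `--benchmark_list_tests` and `--benchmark_filter` are standard Google Benchmark flags:

```shell
cd $BUILD/dbms
# List the registered benchmarks without running them.
./bench_dbms --benchmark_list_tests=true
# Run only the benchmarks whose names match a regex (name below is hypothetical).
./bench_dbms --benchmark_filter='WindowFunctionBench.*'
```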
## Generate LLVM Coverage Report
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 71f81ae3ee5..4520d1cb176 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -165,3 +165,7 @@ add_subdirectory(benchmark) set (BUILD_TESTING OFF CACHE BOOL "Disable cpu-features testing" FORCE) add_subdirectory(cpu_features) + +if (ARCH_AARCH64 AND ARCH_LINUX) + add_subdirectory(arm-optimized-routines-cmake) +endif ()
diff --git a/contrib/arm-optimized-routines b/contrib/arm-optimized-routines new file mode 160000 index 00000000000..e373f659523 --- /dev/null +++ b/contrib/arm-optimized-routines @@ -0,0 +1 @@ +Subproject commit e373f6595230087a8ddea449bfb14b47150b4059
diff --git a/contrib/arm-optimized-routines-cmake/CMakeLists.txt b/contrib/arm-optimized-routines-cmake/CMakeLists.txt new file mode 100644 index 00000000000..89baa7222f3 --- /dev/null +++ b/contrib/arm-optimized-routines-cmake/CMakeLists.txt @@ -0,0 +1,45 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This library overrides performance-critical routines for aarch64 targets. +# The implementations are imported from the official ARM repo. +# To reduce dispatching cost, the indirect function (ifunc) technique is utilized. Therefore, +# this library should only be enabled for ELF targets.

# Considerations: +# - As of June 2022, most enterprise OSs (CentOS 7, CentOS Stream 8 and RHEL 8) still +# use a relatively old glibc on ARM64, where ASIMD, MTE, DC ZVA and SVE are not +# fully utilized. However, it is becoming increasingly common to use ARM64 instances +# in cloud-native situations. +# - The `optimized-routines` repo is actively maintained by ARM. Therefore, its quality +# is assured, and using it also enables us to keep in sync with the latest +# acceleration techniques.
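To make the dispatch mechanism concrete before the build script and src/aor.c below, here is a minimal, self-contained sketch of the ELF indirect-function (ifunc) technique, assuming a Linux/aarch64 toolchain; the function names are hypothetical, and only the hwcap probing mirrors the real code:

```c
#include <stddef.h>
#include <sys/auxv.h>

#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22) /* fallback for older kernel headers */
#endif

/* Generic implementation, always safe on Armv8. */
static size_t my_strlen_generic(const char *s)
{
    size_t n = 0;
    while (s[n] != '\0')
        ++n;
    return n;
}

/* Stand-in for an SVE-optimized variant (identical here for brevity). */
static size_t my_strlen_sve(const char *s)
{
    return my_strlen_generic(s);
}

/* The resolver runs once, when the dynamic linker binds the symbol, so
 * ordinary calls pay no per-call dispatch cost afterwards. */
static size_t (*resolve_my_strlen(void))(const char *)
{
    return (getauxval(AT_HWCAP) & HWCAP_SVE) ? my_strlen_sve : my_strlen_generic;
}

/* `my_strlen` is bound to whichever implementation the resolver returns. */
size_t my_strlen(const char *s) __attribute__((ifunc("resolve_my_strlen")));
```

The DISPATCH macro in src/aor.c expands to this same pattern for each overridden routine, plus a hidden alias so that only calls from within the TiFlash binary are redirected.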
+ +set(CMAKE_C_FLAGS "") +ENABLE_LANGUAGE(C) +ENABLE_LANGUAGE(ASM) +set(TIFLASH_AOR_DIR ../arm-optimized-routines) + +file(GLOB TIFLASH_AARCH64_STRING_FILES ${TIFLASH_AOR_DIR}/string/aarch64/*.S) +add_library(tiflash-aarch64-string STATIC ${TIFLASH_AARCH64_STRING_FILES} src/aor.c) +target_compile_options(tiflash-aarch64-string PRIVATE -march=armv8-a+sve) +target_include_directories(tiflash-aarch64-string PRIVATE ${TIFLASH_AOR_DIR}/string/include) + +file(GLOB TIFLASH_AARCH64_MATH_FILES ${TIFLASH_AOR_DIR}/math/*.c) +add_library(tiflash-aarch64-math STATIC ${TIFLASH_AARCH64_MATH_FILES}) +target_include_directories(tiflash-aarch64-math PRIVATE ${TIFLASH_AOR_DIR}/math/include) + +# It is reasonable to keep these libraries optimized. +target_compile_options(tiflash-aarch64-string PRIVATE -O3 -g3 -fno-omit-frame-pointer -ffunction-sections -fdata-sections) +target_compile_options(tiflash-aarch64-math PRIVATE -O3 -g3 -fno-omit-frame-pointer -ffunction-sections -fdata-sections)
diff --git a/contrib/arm-optimized-routines-cmake/src/aor.c b/contrib/arm-optimized-routines-cmake/src/aor.c new file mode 100644 index 00000000000..daff1df3c4b --- /dev/null +++ b/contrib/arm-optimized-routines-cmake/src/aor.c @@ -0,0 +1,115 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <asm/hwcap.h> +#include <stdbool.h> +#include <stddef.h> +#include <string.h> +#include <stringlib.h> +#include <sys/auxv.h> + +// Provide default macro definitions in case they are not defined on the current Linux distro. +// For example, TiFlash compiled on an older Linux kernel may also be used on newer ones. +// These values should be stable for Linux: only false negatives are expected when running on +// older kernels, which is acceptable, as `google/cpu_features` does the same. +#ifndef HWCAP2_MTE +#define HWCAP2_MTE (1 << 18) +#endif + +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#ifndef AT_HWCAP +#define AT_HWCAP 16 +#endif + +/// check if MTE is supported in the current environment +static inline bool mte_supported(void) +{ + return (getauxval(AT_HWCAP2) & HWCAP2_MTE) != 0; +} + +/// check if SVE is supported in the current environment +static inline bool sve_supported(void) +{ + return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0; +} + +#define STRINGIFY_IMPL(X) #X +#define STRINGIFY(X) STRINGIFY_IMPL(X) +/** + * \brief + * Symbols are defined with hidden visibility. Therefore, the implementations here only override routines within the + * TiFlash binary itself. This is because dependencies like `ld.so`, `libgcc_s.so`, etc. need essential routines like + * `memcpy` to finish the early loading procedure, so declaring such symbols as visible indirect functions would create + * cyclic dependencies. It shall be good enough to override symbols within TiFlash, as most heavy computation work + * happens in the main binary.
+ * \param NAME: exported symbol name + * \param SVE: preferred implementation when SVE is available + * \param MTE: preferred implementation when MTE is available + * \param ASIMD: preferred implementation for generic aarch64 targets (ASIMD is required by default for Armv8 and above) + */ +#define DISPATCH(NAME, SVE, MTE, ASIMD) \ + extern typeof(ASIMD) __tiflash_##NAME __attribute__((ifunc(STRINGIFY(__tiflash_##NAME##_resolver)))); \ + extern typeof(ASIMD) NAME __attribute__((visibility("hidden"), alias(STRINGIFY(__tiflash_##NAME)))); \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wunused-function\"") static typeof(ASIMD) * __tiflash_##NAME##_resolver(void) \ + { \ + if (sve_supported()) \ + { \ + return SVE; \ + } \ + if (mte_supported()) \ + { \ + return MTE; \ + } \ + return ASIMD; \ + } \ + _Pragma("GCC diagnostic pop") +#undef memcpy +#undef memmove +#undef memset +#undef memchr +#undef memrchr +#undef memcmp +#undef strcpy +#undef stpcpy +#undef strcmp +#undef strchr +#undef strrchr +#undef strchrnul +#undef strlen +#undef strnlen +#undef strncmp + +DISPATCH(memcpy, __memcpy_aarch64_sve, __memcpy_aarch64_simd, __memcpy_aarch64_simd) +DISPATCH(memmove, __memmove_aarch64_sve, __memmove_aarch64_simd, __memmove_aarch64_simd) +DISPATCH(memset, __memset_aarch64, __memset_aarch64, __memset_aarch64) +DISPATCH(memchr, __memchr_aarch64_sve, __memchr_aarch64_mte, __memchr_aarch64) +DISPATCH(memrchr, __memrchr_aarch64, __memrchr_aarch64, __memrchr_aarch64) +DISPATCH(memcmp, __memcmp_aarch64_sve, __memcmp_aarch64, __memcmp_aarch64) +DISPATCH(strcpy, __strcpy_aarch64_sve, __strcpy_aarch64, __strcpy_aarch64) +DISPATCH(stpcpy, __stpcpy_aarch64_sve, __stpcpy_aarch64, __stpcpy_aarch64) +DISPATCH(strcmp, __strcmp_aarch64_sve, __strcmp_aarch64, __strcmp_aarch64) +DISPATCH(strchr, __strchr_aarch64_sve, __strchr_aarch64_mte, __strchr_aarch64) +DISPATCH(strrchr, __strrchr_aarch64_sve, __strrchr_aarch64_mte, __strrchr_aarch64) +DISPATCH(strchrnul, __strchrnul_aarch64_sve, __strchrnul_aarch64_mte, __strchrnul_aarch64) +DISPATCH(strlen, __strlen_aarch64_sve, __strlen_aarch64_mte, __strlen_aarch64) +DISPATCH(strnlen, __strnlen_aarch64_sve, __strnlen_aarch64, __strnlen_aarch64) +DISPATCH(strncmp, __strncmp_aarch64_sve, __strncmp_aarch64, __strncmp_aarch64) \ No newline at end of file diff --git a/contrib/client-c b/contrib/client-c index 36e05cb0f24..034d1e782cb 160000 --- a/contrib/client-c +++ b/contrib/client-c @@ -1 +1 @@ -Subproject commit 36e05cb0f24c085785abf367176dac2a45bfd67b +Subproject commit 034d1e782cb4697f99b09b679c00dade00f19dd5 diff --git a/contrib/jemalloc b/contrib/jemalloc index ea6b3e973b4..54eaed1d8b5 160000 --- a/contrib/jemalloc +++ b/contrib/jemalloc @@ -1 +1 @@ -Subproject commit ea6b3e973b477b8061e0076bb257dbd7f3faa756 +Subproject commit 54eaed1d8b56b1aa528be3bdd1877e59c56fa90c diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index ef02fbabc81..91b17eb8ec7 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -4,65 +4,136 @@ endif() set(JEMALLOC_SOURCE_DIR ${TiFlash_SOURCE_DIR}/contrib/jemalloc) -set(SRCS -${JEMALLOC_SOURCE_DIR}/src/arena.c -${JEMALLOC_SOURCE_DIR}/src/background_thread.c -${JEMALLOC_SOURCE_DIR}/src/base.c -${JEMALLOC_SOURCE_DIR}/src/bin.c -${JEMALLOC_SOURCE_DIR}/src/bitmap.c -${JEMALLOC_SOURCE_DIR}/src/ckh.c -${JEMALLOC_SOURCE_DIR}/src/ctl.c -${JEMALLOC_SOURCE_DIR}/src/div.c -${JEMALLOC_SOURCE_DIR}/src/extent.c -${JEMALLOC_SOURCE_DIR}/src/extent_dss.c 
-${JEMALLOC_SOURCE_DIR}/src/extent_mmap.c -${JEMALLOC_SOURCE_DIR}/src/hash.c -${JEMALLOC_SOURCE_DIR}/src/hook.c -${JEMALLOC_SOURCE_DIR}/src/jemalloc.c -${JEMALLOC_SOURCE_DIR}/src/jemalloc_cpp.cpp -${JEMALLOC_SOURCE_DIR}/src/large.c -${JEMALLOC_SOURCE_DIR}/src/log.c -${JEMALLOC_SOURCE_DIR}/src/malloc_io.c -${JEMALLOC_SOURCE_DIR}/src/mutex.c -${JEMALLOC_SOURCE_DIR}/src/mutex_pool.c -${JEMALLOC_SOURCE_DIR}/src/nstime.c -${JEMALLOC_SOURCE_DIR}/src/pages.c -${JEMALLOC_SOURCE_DIR}/src/prng.c -${JEMALLOC_SOURCE_DIR}/src/prof.c -${JEMALLOC_SOURCE_DIR}/src/rtree.c -${JEMALLOC_SOURCE_DIR}/src/sc.c -${JEMALLOC_SOURCE_DIR}/src/stats.c -${JEMALLOC_SOURCE_DIR}/src/sz.c -${JEMALLOC_SOURCE_DIR}/src/tcache.c -${JEMALLOC_SOURCE_DIR}/src/test_hooks.c -${JEMALLOC_SOURCE_DIR}/src/ticker.c -${JEMALLOC_SOURCE_DIR}/src/tsd.c -${JEMALLOC_SOURCE_DIR}/src/witness.c -${JEMALLOC_SOURCE_DIR}/src/safety_check.c +set (SRCS + "${JEMALLOC_SOURCE_DIR}/src/arena.c" + "${JEMALLOC_SOURCE_DIR}/src/background_thread.c" + "${JEMALLOC_SOURCE_DIR}/src/base.c" + "${JEMALLOC_SOURCE_DIR}/src/bin.c" + "${JEMALLOC_SOURCE_DIR}/src/bin_info.c" + "${JEMALLOC_SOURCE_DIR}/src/bitmap.c" + "${JEMALLOC_SOURCE_DIR}/src/buf_writer.c" + "${JEMALLOC_SOURCE_DIR}/src/cache_bin.c" + "${JEMALLOC_SOURCE_DIR}/src/ckh.c" + "${JEMALLOC_SOURCE_DIR}/src/counter.c" + "${JEMALLOC_SOURCE_DIR}/src/ctl.c" + "${JEMALLOC_SOURCE_DIR}/src/decay.c" + "${JEMALLOC_SOURCE_DIR}/src/div.c" + "${JEMALLOC_SOURCE_DIR}/src/ecache.c" + "${JEMALLOC_SOURCE_DIR}/src/edata.c" + "${JEMALLOC_SOURCE_DIR}/src/edata_cache.c" + "${JEMALLOC_SOURCE_DIR}/src/ehooks.c" + "${JEMALLOC_SOURCE_DIR}/src/emap.c" + "${JEMALLOC_SOURCE_DIR}/src/eset.c" + "${JEMALLOC_SOURCE_DIR}/src/exp_grow.c" + "${JEMALLOC_SOURCE_DIR}/src/extent.c" + "${JEMALLOC_SOURCE_DIR}/src/extent_dss.c" + "${JEMALLOC_SOURCE_DIR}/src/extent_mmap.c" + "${JEMALLOC_SOURCE_DIR}/src/fxp.c" + "${JEMALLOC_SOURCE_DIR}/src/hook.c" + "${JEMALLOC_SOURCE_DIR}/src/hpa.c" + "${JEMALLOC_SOURCE_DIR}/src/hpa_hooks.c" + "${JEMALLOC_SOURCE_DIR}/src/hpdata.c" + "${JEMALLOC_SOURCE_DIR}/src/inspect.c" + "${JEMALLOC_SOURCE_DIR}/src/jemalloc.c" + "${JEMALLOC_SOURCE_DIR}/src/large.c" + "${JEMALLOC_SOURCE_DIR}/src/log.c" + "${JEMALLOC_SOURCE_DIR}/src/malloc_io.c" + "${JEMALLOC_SOURCE_DIR}/src/mutex.c" + "${JEMALLOC_SOURCE_DIR}/src/nstime.c" + "${JEMALLOC_SOURCE_DIR}/src/pa.c" + "${JEMALLOC_SOURCE_DIR}/src/pac.c" + "${JEMALLOC_SOURCE_DIR}/src/pa_extra.c" + "${JEMALLOC_SOURCE_DIR}/src/pages.c" + "${JEMALLOC_SOURCE_DIR}/src/pai.c" + "${JEMALLOC_SOURCE_DIR}/src/peak_event.c" + "${JEMALLOC_SOURCE_DIR}/src/prof.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_data.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_log.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_recent.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_stats.c" + "${JEMALLOC_SOURCE_DIR}/src/prof_sys.c" + "${JEMALLOC_SOURCE_DIR}/src/psset.c" + "${JEMALLOC_SOURCE_DIR}/src/rtree.c" + "${JEMALLOC_SOURCE_DIR}/src/safety_check.c" + "${JEMALLOC_SOURCE_DIR}/src/san_bump.c" + "${JEMALLOC_SOURCE_DIR}/src/san.c" + "${JEMALLOC_SOURCE_DIR}/src/sc.c" + "${JEMALLOC_SOURCE_DIR}/src/sec.c" + "${JEMALLOC_SOURCE_DIR}/src/stats.c" + "${JEMALLOC_SOURCE_DIR}/src/sz.c" + "${JEMALLOC_SOURCE_DIR}/src/tcache.c" + "${JEMALLOC_SOURCE_DIR}/src/test_hooks.c" + "${JEMALLOC_SOURCE_DIR}/src/thread_event.c" + "${JEMALLOC_SOURCE_DIR}/src/ticker.c" + "${JEMALLOC_SOURCE_DIR}/src/tsd.c" + "${JEMALLOC_SOURCE_DIR}/src/witness.c" ) if(CMAKE_SYSTEM_NAME MATCHES "Darwin") list(APPEND SRCS ${JEMALLOC_SOURCE_DIR}/src/zone.c) endif() +if (ARCH_LINUX) + # The ThreadPool selects jobs randomly, and there can be some threads that performed + # a memory-heavy task before and will be inactive for some time, + # but until they become active again, the memory will not be freed, since by + # default each thread has its own arena, and there should be no more than + # 4*CPU arenas (see the opt.narenas description). + # + # By enabling percpu_arena, the number of arenas is limited to the number of CPUs, and hence + # this problem should go away. + # + # muzzy_decay_ms -- use MADV_FREE when available on newer Linuxes, to + # avoid spurious latencies and additional work associated with + # MADV_DONTNEED. See + # https://github.com/ClickHouse/ClickHouse/issues/11121 for motivation. + set (JEMALLOC_CONFIG_MALLOC_CONF "percpu_arena:percpu,oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000") +else() + set (JEMALLOC_CONFIG_MALLOC_CONF "oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000") +endif() + +message (STATUS "jemalloc malloc_conf: ${JEMALLOC_CONFIG_MALLOC_CONF}") + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w") endif () add_library(jemalloc STATIC ${SRCS}) +set (JEMALLOC_INCLUDE_PREFIX) + +if (ARCH_LINUX) + set (JEMALLOC_INCLUDE_PREFIX "include_linux") + target_compile_definitions(jemalloc PRIVATE JEMALLOC_MADV_FREE=8) +elseif (ARCH_FREEBSD) + set (JEMALLOC_INCLUDE_PREFIX "include_freebsd") +elseif (APPLE) + set (JEMALLOC_INCLUDE_PREFIX "include_darwin") +else () + message (FATAL_ERROR "internal jemalloc: This OS is not supported") +endif () -if (ARCH_ARM) - target_include_directories(jemalloc PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/include_linux_aarch64) +if (ARCH_AMD64) + if (USE_MUSL) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_x86_64_musl") + else() + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_x86_64") + endif() +elseif (ARCH_AARCH64) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_aarch64") +elseif (ARCH_PPC64LE) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_ppc64le") +elseif (ARCH_RISCV64) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_riscv64") else () - target_include_directories(jemalloc PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/include_linux_x86_64) + message (FATAL_ERROR "internal jemalloc: This arch is not supported") endif () -target_include_directories(jemalloc PRIVATE - ${JEMALLOC_SOURCE_DIR}/include) +configure_file(${JEMALLOC_INCLUDE_PREFIX}/jemalloc/internal/jemalloc_internal_defs.h.in + ${JEMALLOC_INCLUDE_PREFIX}/jemalloc/internal/jemalloc_internal_defs.h) +target_include_directories(jemalloc SYSTEM PRIVATE + "${CMAKE_CURRENT_BINARY_DIR}/${JEMALLOC_INCLUDE_PREFIX}/jemalloc/internal") + +target_include_directories(jemalloc PUBLIC ${JEMALLOC_SOURCE_DIR}/include ${TiFlash_SOURCE_DIR}/contrib/jemalloc-cmake/include) target_compile_definitions(jemalloc PRIVATE -DJEMALLOC_NO_PRIVATE_NAMESPACE) @@ -80,3 +151,5 @@ if (ENABLE_JEMALLOC_PROF) target_link_libraries (jemalloc PRIVATE ${UNWIND_LIBRARY}) endif () endif () + +target_compile_options(jemalloc PRIVATE -D_GNU_SOURCE)
diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_preamble.h b/contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_preamble.h similarity index 69% rename from contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_preamble.h rename to contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_preamble.h index d79551e1f25..45f43a6cd02 100644 ---
a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_preamble.h +++ b/contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_preamble.h @@ -1,27 +1,33 @@ #ifndef JEMALLOC_PREAMBLE_H #define JEMALLOC_PREAMBLE_H -#include "jemalloc_internal_defs.h" #include "jemalloc/internal/jemalloc_internal_decls.h" +#include "jemalloc_internal_defs.h" -#ifdef JEMALLOC_UTRACE +#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL) #include <sys/ktrace.h> +#if defined(JEMALLOC_UTRACE) +#define UTRACE_CALL(p, l) utrace(p, l) +#else +#define UTRACE_CALL(p, l) utrace("jemalloc_process", p, l) +#define JEMALLOC_UTRACE +#endif #endif #define JEMALLOC_NO_DEMANGLE #ifdef JEMALLOC_JET -# undef JEMALLOC_IS_MALLOC -# define JEMALLOC_N(n) jet_##n -# include "jemalloc/internal/public_namespace.h" -# define JEMALLOC_NO_RENAME -# include "jemalloc/jemalloc.h" -# undef JEMALLOC_NO_RENAME +#undef JEMALLOC_IS_MALLOC +#define JEMALLOC_N(n) jet_##n +#include "jemalloc/internal/public_namespace.h" +#define JEMALLOC_NO_RENAME +#include "jemalloc/jemalloc.h" +#undef JEMALLOC_NO_RENAME #else -# define JEMALLOC_N(n) je_##n -# include "jemalloc/jemalloc.h" +#define JEMALLOC_N(n) je_##n +#include "jemalloc/jemalloc.h" #endif -#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN)) +#if defined(JEMALLOC_OSATOMIC) #include <libkern/OSAtomic.h> #endif @@ -39,16 +45,16 @@ * possible. */ #ifndef JEMALLOC_NO_PRIVATE_NAMESPACE -# ifndef JEMALLOC_JET -# include "jemalloc/internal/private_namespace.h" -# else -# include "jemalloc/internal/private_namespace_jet.h" -# endif +#ifndef JEMALLOC_JET +#include "jemalloc/internal/private_namespace.h" +#else +#include "jemalloc/internal/private_namespace_jet.h" +#endif #endif #include "jemalloc/internal/test_hooks.h" #ifdef JEMALLOC_DEFINE_MADVISE_FREE -# define JEMALLOC_MADV_FREE 8 +#define JEMALLOC_MADV_FREE 8 #endif static const bool config_debug = @@ -161,7 +167,55 @@ static const bool config_log = false #endif ; -#ifdef JEMALLOC_HAVE_SCHED_GETCPU +/* + * Are extra safety checks enabled; things like checking the size of sized + * deallocations, double-frees, etc. + */ +static const bool config_opt_safety_checks = +#ifdef JEMALLOC_OPT_SAFETY_CHECKS + true +#elif defined(JEMALLOC_DEBUG) + /* + * This lets us only guard safety checks by one flag instead of two; fast + * checks can guard solely by config_opt_safety_checks and run in debug mode + * too. + */ + true +#else + false +#endif + ; + +/* + * Extra debugging of sized deallocations too onerous to be included in the + * general safety checks. + */ +static const bool config_opt_size_checks = +#if defined(JEMALLOC_OPT_SIZE_CHECKS) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +static const bool config_uaf_detection = +#if defined(JEMALLOC_UAF_DETECTION) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +/* Whether or not the C++ extensions are enabled. */ +static const bool config_enable_cxx = +#ifdef JEMALLOC_ENABLE_CXX + true +#else + false +#endif + ; + +#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU) /* Currently percpu_arena depends on sched_getcpu. */ #define JEMALLOC_PERCPU_ARENA #endif @@ -190,23 +244,16 @@ static const bool have_background_thread = false #endif ; - -#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS 1 -#define JEMALLOC_GCC_U8_SYNC_ATOMICS 1 - -/* - * Are extra safety checks enabled; things like checking the size of sized - * deallocations, double-frees, etc.
- */ -static const bool config_opt_safety_checks = -#ifdef JEMALLOC_OPT_SAFETY_CHECKS +static const bool config_high_res_timer = +#ifdef JEMALLOC_HAVE_CLOCK_REALTIME true -#elif defined(JEMALLOC_DEBUG) - /* - * This lets us only guard safety checks by one flag instead of two; fast - * checks can guard solely by config_opt_safety_checks and run in debug mode - * too. - */ +#else + false +#endif + ; + +static const bool have_memcntl = +#ifdef JEMALLOC_HAVE_MEMCNTL true #else false
diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc.h index d06243c5239..e90fa892100 100644 --- a/contrib/jemalloc-cmake/include/jemalloc/jemalloc.h +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc.h @@ -4,13 +4,21 @@ extern "C" { #endif +#if !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wredundant-decls" +#endif + #include -#include #include #include +#include #include +#if !defined(__clang__) +#pragma GCC diagnostic pop +#endif + #ifdef __cplusplus } #endif -
diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_defs.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_defs.h similarity index 67% rename from contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_defs.h rename to contrib/jemalloc-cmake/include/jemalloc/jemalloc_defs.h index d1389237a77..1fc77be57cf 100644 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_defs.h +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_defs.h @@ -5,15 +5,29 @@ /* Defined if alloc_size attribute is supported. */ #define JEMALLOC_HAVE_ATTR_ALLOC_SIZE +/* Defined if format_arg(...) attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FORMAT_ARG + +/* Defined if format(gnu_printf, ...) attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF */ + /* Defined if format(printf, ...) attribute is supported. */ #define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF +/* Defined if fallthrough attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FALLTHROUGH + +/* Defined if cold attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_COLD + /* * Define overrides for non-standard allocator-related functions if they are * present on the system. */ +#if !defined(USE_MUSL) #define JEMALLOC_OVERRIDE_MEMALIGN #define JEMALLOC_OVERRIDE_VALLOC +#endif /* * At least Linux omits the "const" in: @@ -32,11 +46,11 @@ #define JEMALLOC_USE_CXX_THROW #ifdef _MSC_VER -# ifdef _WIN64 -# define LG_SIZEOF_PTR_WIN 3 -# else -# define LG_SIZEOF_PTR_WIN 2 -# endif +#ifdef _WIN64 +#define LG_SIZEOF_PTR_WIN 3 +#else +#define LG_SIZEOF_PTR_WIN 2 +#endif #endif /* sizeof(void *) == 2^LG_SIZEOF_PTR. */
diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_macros.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_macros.h new file mode 100644 index 00000000000..ccb22470e64 --- /dev/null +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_macros.h @@ -0,0 +1,148 @@ +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <limits.h> +#include <strings.h> + +#define JEMALLOC_VERSION "5.3-RC" +#define JEMALLOC_VERSION_MAJOR 5 +#define JEMALLOC_VERSION_MINOR 3 +#define JEMALLOC_VERSION_BUGFIX 0 +#define JEMALLOC_VERSION_NREV 0 +#define JEMALLOC_VERSION_GID "ca709c3139f77f4c00a903cdee46d71e9028f6c6" +#define JEMALLOC_VERSION_GID_IDENT ca709c3139f77f4c00a903cdee46d71e9028f6c6 + +#define MALLOCX_LG_ALIGN(la) ((int)(la)) +#if LG_SIZEOF_PTR == 2 +#define MALLOCX_ALIGN(a) ((int)(ffs((int)(a)) - 1)) +#else +#define MALLOCX_ALIGN(a) \ + ((int)(((size_t)(a) < (size_t)INT_MAX) ?
ffs((int)(a)) - 1 : ffs((int)(((size_t)(a)) >> 32)) + 31)) +#endif +#define MALLOCX_ZERO ((int)0x40) +/* + * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 + * encodes MALLOCX_TCACHE_NONE. + */ +#define MALLOCX_TCACHE(tc) ((int)(((tc) + 2) << 8)) +#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) +/* + * Bias arena index bits so that 0 encodes "use an automatically chosen arena". + */ +#define MALLOCX_ARENA(a) ((((int)(a)) + 1) << 20) + +/* + * Use as arena index in "arena..{purge,decay,dss}" and + * "stats.arenas..*" mallctl interfaces to select all arenas. This + * definition is intentionally specified in raw decimal format to support + * cpp-based string concatenation, e.g. + * + * #define STRINGIFY_HELPER(x) #x + * #define STRINGIFY(x) STRINGIFY_HELPER(x) + * + * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, + * 0); + */ +#define MALLCTL_ARENAS_ALL 4096 +/* + * Use as arena index in "stats.arenas..*" mallctl interfaces to select + * destroyed arenas. + */ +#define MALLCTL_ARENAS_DESTROYED 4097 + +#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) +#define JEMALLOC_CXX_THROW throw() +#else +#define JEMALLOC_CXX_THROW +#endif + +#if defined(_MSC_VER) +#define JEMALLOC_ATTR(s) +#define JEMALLOC_ALIGNED(s) __declspec(align(s)) +#define JEMALLOC_ALLOC_SIZE(s) +#define JEMALLOC_ALLOC_SIZE2(s1, s2) +#ifndef JEMALLOC_EXPORT +#ifdef DLLEXPORT +#define JEMALLOC_EXPORT __declspec(dllexport) +#else +#define JEMALLOC_EXPORT __declspec(dllimport) +#endif +#endif +#define JEMALLOC_FORMAT_ARG(i) +#define JEMALLOC_FORMAT_PRINTF(s, i) +#define JEMALLOC_FALLTHROUGH +#define JEMALLOC_NOINLINE __declspec(noinline) +#ifdef __cplusplus +#define JEMALLOC_NOTHROW __declspec(nothrow) +#else +#define JEMALLOC_NOTHROW +#endif +#define JEMALLOC_SECTION(s) __declspec(allocate(s)) +#define JEMALLOC_RESTRICT_RETURN __declspec(restrict) +#if _MSC_VER >= 1900 && !defined(__EDG__) +#define JEMALLOC_ALLOCATOR __declspec(allocator) +#else +#define JEMALLOC_ALLOCATOR +#endif +#define JEMALLOC_COLD +#elif defined(JEMALLOC_HAVE_ATTR) +#define JEMALLOC_ATTR(s) __attribute__((s)) +#define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) +#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +#define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) +#else +#define JEMALLOC_ALLOC_SIZE(s) +#define JEMALLOC_ALLOC_SIZE2(s1, s2) +#endif +#ifndef JEMALLOC_EXPORT +#define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +#endif +#ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG +#define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) +#else +#define JEMALLOC_FORMAT_ARG(i) +#endif +#ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF +#define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) +#elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) +#define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) +#else +#define JEMALLOC_FORMAT_PRINTF(s, i) +#endif +#ifdef JEMALLOC_HAVE_ATTR_FALLTHROUGH +#define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough) +#else +#define JEMALLOC_FALLTHROUGH +#endif +#define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) +#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +#define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) +#define JEMALLOC_RESTRICT_RETURN +#define JEMALLOC_ALLOCATOR +#ifdef JEMALLOC_HAVE_ATTR_COLD +#define JEMALLOC_COLD JEMALLOC_ATTR(__cold__) +#else +#define JEMALLOC_COLD +#endif +#else +#define JEMALLOC_ATTR(s) +#define JEMALLOC_ALIGNED(s) +#define 
JEMALLOC_ALLOC_SIZE(s) +#define JEMALLOC_ALLOC_SIZE2(s1, s2) +#define JEMALLOC_EXPORT +#define JEMALLOC_FORMAT_PRINTF(s, i) +#define JEMALLOC_FALLTHROUGH +#define JEMALLOC_NOINLINE +#define JEMALLOC_NOTHROW +#define JEMALLOC_SECTION(s) +#define JEMALLOC_RESTRICT_RETURN +#define JEMALLOC_ALLOCATOR +#define JEMALLOC_COLD +#endif + +#if (defined(__APPLE__) || defined(__FreeBSD__)) && !defined(JEMALLOC_NO_RENAME) +#define JEMALLOC_SYS_NOTHROW +#else +#define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW +#endif diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos.h new file mode 100644 index 00000000000..31f72d3a2af --- /dev/null +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos.h @@ -0,0 +1,86 @@ +// OSX does not have this for system alloc functions, so you will get +// "exception specification in declaration" error. +#if defined(__APPLE__) || defined(__FreeBSD__) || defined(USE_MUSL) +#undef JEMALLOC_NOTHROW +#define JEMALLOC_NOTHROW + +#undef JEMALLOC_SYS_NOTHROW +#define JEMALLOC_SYS_NOTHROW + +#undef JEMALLOC_CXX_THROW +#define JEMALLOC_CXX_THROW +#endif + +#include "jemalloc_rename.h" + +/* + * The je_ prefix on the following public symbol declarations is an artifact + * of namespace management, and should be omitted in application code unless + * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h). + */ +extern JEMALLOC_EXPORT const char * je_malloc_conf; +extern JEMALLOC_EXPORT void (*je_malloc_message)(void * cbopaque, + const char * s); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_malloc(size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_calloc(size_t num, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); +JEMALLOC_EXPORT int JEMALLOC_SYS_NOTHROW je_posix_memalign( + void ** memptr, + size_t alignment, + size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(nonnull(1)); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_aligned_alloc(size_t alignment, + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) + JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_realloc(void * ptr, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void JEMALLOC_SYS_NOTHROW je_free(void * ptr) + JEMALLOC_CXX_THROW; + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * je_mallocx(size_t size, int flags) + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * je_rallocx(void * ptr, size_t size, int flags) JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void * ptr, size_t size, size_t extra, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_sallocx(const void * ptr, + int flags) JEMALLOC_ATTR(pure); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void * ptr, int flags); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void * ptr, size_t size, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char * name, + void * oldp, + size_t * oldlenp, + void * newp, + size_t newlen); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char * 
name, + size_t * mibp, + size_t * miblenp); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t * mib, + size_t miblen, + void * oldp, + size_t * oldlenp, + void * newp, + size_t newlen); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print( + void (*write_cb)(void *, const char *), + void * je_cbopaque, + const char * opts); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( + JEMALLOC_USABLE_SIZE_CONST void * ptr) JEMALLOC_CXX_THROW; +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_size( + const void * ptr); +#endif + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_memalign(size_t alignment, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * je_valloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos_jet.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos_jet.h new file mode 100644 index 00000000000..195d57e2997 --- /dev/null +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_protos_jet.h @@ -0,0 +1,71 @@ +/* + * The jet_ prefix on the following public symbol declarations is an artifact + * of namespace management, and should be omitted in application code unless + * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle@install_suffix@.h). + */ +extern JEMALLOC_EXPORT const char * jet_malloc_conf; +extern JEMALLOC_EXPORT void (*jet_malloc_message)(void * cbopaque, + const char * s); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_malloc(size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_calloc(size_t num, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); +JEMALLOC_EXPORT int JEMALLOC_SYS_NOTHROW jet_posix_memalign( + void ** memptr, + size_t alignment, + size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(nonnull(1)); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_aligned_alloc(size_t alignment, + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) + JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_realloc(void * ptr, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void JEMALLOC_SYS_NOTHROW jet_free(void * ptr) + JEMALLOC_CXX_THROW; + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * jet_mallocx(size_t size, int flags) + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * jet_rallocx(void * ptr, size_t size, int flags) JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_xallocx(void * ptr, size_t size, size_t extra, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_sallocx(const void * ptr, + int flags) JEMALLOC_ATTR(pure); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_dallocx(void * ptr, int flags); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_sdallocx(void * ptr, size_t size, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW jet_mallctl(const 
char * name, + void * oldp, + size_t * oldlenp, + void * newp, + size_t newlen); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW jet_mallctlnametomib(const char * name, + size_t * mibp, + size_t * miblenp); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW jet_mallctlbymib(const size_t * mib, + size_t miblen, + void * oldp, + size_t * oldlenp, + void * newp, + size_t newlen); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_malloc_stats_print( + void (*write_cb)(void *, const char *), + void * jet_cbopaque, + const char * opts); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_malloc_usable_size( + JEMALLOC_USABLE_SIZE_CONST void * ptr) JEMALLOC_CXX_THROW; +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_malloc_size( + const void * ptr); +#endif + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_memalign(size_t alignment, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_SYS_NOTHROW * jet_valloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif diff --git a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_rename.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_rename.h index a2ea2dd3533..d032d46752d 100644 --- a/contrib/jemalloc-cmake/include/jemalloc/jemalloc_rename.h +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_rename.h @@ -4,26 +4,28 @@ * these macro definitions. */ #ifndef JEMALLOC_NO_RENAME -# define je_aligned_alloc aligned_alloc -# define je_calloc calloc -# define je_dallocx dallocx -# define je_free free -# define je_mallctl mallctl -# define je_mallctlbymib mallctlbymib -# define je_mallctlnametomib mallctlnametomib -# define je_malloc malloc -# define je_malloc_conf malloc_conf -# define je_malloc_message malloc_message -# define je_malloc_stats_print malloc_stats_print -# define je_malloc_usable_size malloc_usable_size -# define je_mallocx mallocx -# define je_nallocx nallocx -# define je_posix_memalign posix_memalign -# define je_rallocx rallocx -# define je_realloc realloc -# define je_sallocx sallocx -# define je_sdallocx sdallocx -# define je_xallocx xallocx -# define je_memalign memalign -# define je_valloc valloc +#define je_aligned_alloc aligned_alloc +#define je_calloc calloc +#define je_dallocx dallocx +#define je_free free +#define je_mallctl mallctl +#define je_mallctlbymib mallctlbymib +#define je_mallctlnametomib mallctlnametomib +#define je_malloc malloc +#define je_malloc_conf malloc_conf +#define je_malloc_conf_2_conf_harder malloc_conf_2_conf_harder +#define je_malloc_message malloc_message +#define je_malloc_stats_print malloc_stats_print +#define je_malloc_usable_size malloc_usable_size +#define je_mallocx mallocx +#define je_smallocx_ca709c3139f77f4c00a903cdee46d71e9028f6c6 smallocx_ca709c3139f77f4c00a903cdee46d71e9028f6c6 +#define je_nallocx nallocx +#define je_posix_memalign posix_memalign +#define je_rallocx rallocx +#define je_realloc realloc +#define je_sallocx sallocx +#define je_sdallocx sdallocx +#define je_xallocx xallocx +#define je_memalign memalign +#define je_valloc valloc #endif diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_typedefs.h b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_typedefs.h similarity index 57% rename from contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_typedefs.h rename to contrib/jemalloc-cmake/include/jemalloc/jemalloc_typedefs.h index 
1a58874306e..eeaf7a6760e 100644 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_typedefs.h +++ b/contrib/jemalloc-cmake/include/jemalloc/jemalloc_typedefs.h @@ -5,73 +5,66 @@ typedef struct extent_hooks_s extent_hooks_t; * extent_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, * size_t alignment, bool *zero, bool *commit, unsigned arena_ind); */ -typedef void *(extent_alloc_t)(extent_hooks_t *, void *, size_t, size_t, bool *, - bool *, unsigned); +typedef void *(extent_alloc_t)(extent_hooks_t *, void *, size_t, size_t, bool *, bool *, unsigned); /* * bool * extent_dalloc(extent_hooks_t *extent_hooks, void *addr, size_t size, * bool committed, unsigned arena_ind); */ -typedef bool (extent_dalloc_t)(extent_hooks_t *, void *, size_t, bool, - unsigned); +typedef bool(extent_dalloc_t)(extent_hooks_t *, void *, size_t, bool, unsigned); /* * void * extent_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, * bool committed, unsigned arena_ind); */ -typedef void (extent_destroy_t)(extent_hooks_t *, void *, size_t, bool, - unsigned); +typedef void(extent_destroy_t)(extent_hooks_t *, void *, size_t, bool, unsigned); /* * bool * extent_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, * size_t offset, size_t length, unsigned arena_ind); */ -typedef bool (extent_commit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - unsigned); +typedef bool(extent_commit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, unsigned); /* * bool * extent_decommit(extent_hooks_t *extent_hooks, void *addr, size_t size, * size_t offset, size_t length, unsigned arena_ind); */ -typedef bool (extent_decommit_t)(extent_hooks_t *, void *, size_t, size_t, - size_t, unsigned); +typedef bool(extent_decommit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, unsigned); /* * bool * extent_purge(extent_hooks_t *extent_hooks, void *addr, size_t size, * size_t offset, size_t length, unsigned arena_ind); */ -typedef bool (extent_purge_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - unsigned); +typedef bool(extent_purge_t)(extent_hooks_t *, void *, size_t, size_t, size_t, unsigned); /* * bool * extent_split(extent_hooks_t *extent_hooks, void *addr, size_t size, * size_t size_a, size_t size_b, bool committed, unsigned arena_ind); */ -typedef bool (extent_split_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - bool, unsigned); +typedef bool(extent_split_t)(extent_hooks_t *, void *, size_t, size_t, size_t, bool, unsigned); /* * bool * extent_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, * void *addr_b, size_t size_b, bool committed, unsigned arena_ind); */ -typedef bool (extent_merge_t)(extent_hooks_t *, void *, size_t, void *, size_t, - bool, unsigned); +typedef bool(extent_merge_t)(extent_hooks_t *, void *, size_t, void *, size_t, bool, unsigned); -struct extent_hooks_s { - extent_alloc_t *alloc; - extent_dalloc_t *dalloc; - extent_destroy_t *destroy; - extent_commit_t *commit; - extent_decommit_t *decommit; - extent_purge_t *purge_lazy; - extent_purge_t *purge_forced; - extent_split_t *split; - extent_merge_t *merge; +struct extent_hooks_s +{ + extent_alloc_t * alloc; + extent_dalloc_t * dalloc; + extent_destroy_t * destroy; + extent_commit_t * commit; + extent_decommit_t * decommit; + extent_purge_t * purge_lazy; + extent_purge_t * purge_forced; + extent_split_t * split; + extent_merge_t * merge; }; diff --git a/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in 
b/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..8ad95c51560 --- /dev/null +++ b/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,425 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +#define JEMALLOC_PREFIX "je_" +#define JEMALLOC_CPREFIX "JE_" + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 64 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +#define JEMALLOC_OS_UNFAIR_LOCK + +/* Defined if syscall(2) is usable. */ +/* #undef JEMALLOC_USE_SYSCALL */ + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +#define JEMALLOC_HAVE_ISSETUGID + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */ + +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC */ + +/* + * Defined if mach_absolute_time() is available. 
+ */ +#define JEMALLOC_HAVE_MACH_ABSOLUTE_TIME + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +/* #undef JEMALLOC_THREADED_INIT */ + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +/* #undef JEMALLOC_DSS */ + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 14 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. 
This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +/* #undef JEMALLOC_RETAIN */ + +/* TLS is used to map arenas and magazine caches to threads. */ +/* #undef JEMALLOC_TLS */ + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +#define JEMALLOC_ZONE + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */ + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +/* #undef JEMALLOC_HAVE_MADVISE_HUGE */ + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */ + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_DONTDUMP */ + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. 
+ * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +#define JEMALLOC_HAVE_MALLOC_SIZE + +/* Define if operating system has alloca.h header. */ +/* #undef JEMALLOC_HAS_ALLOCA_H */ + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */ + +/* glibc memalign hook. */ +/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */ + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +/* #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP */ + +/* GNU specific sched_getcpu support */ +/* #undef JEMALLOC_HAVE_SCHED_GETCPU */ + +/* GNU specific sched_setaffinity support */ +/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */ + +/* + * If defined, all the features necessary for background threads are present. + */ +/* #undef JEMALLOC_BACKGROUND_THREAD */ + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +/* #undef JEMALLOC_IS_MALLOC */ + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +#define JEMALLOC_HAVE_VM_MAKE_TAG + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_darwin_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_darwin_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..8671da5db69 --- /dev/null +++ b/contrib/jemalloc-cmake/include_darwin_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,425 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +#define JEMALLOC_PREFIX "je_" +#define JEMALLOC_CPREFIX "JE_" + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. 
+ */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT __asm__ volatile("pause") +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 1 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +#define JEMALLOC_OS_UNFAIR_LOCK + +/* Defined if syscall(2) is usable. */ +/* #undef JEMALLOC_USE_SYSCALL */ + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +#define JEMALLOC_HAVE_ISSETUGID + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */ + +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC */ + +/* + * Defined if mach_absolute_time() is available. + */ +#define JEMALLOC_HAVE_MACH_ABSOLUTE_TIME + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. 
+ */ +/* #undef JEMALLOC_THREADED_INIT */ + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +/* #undef JEMALLOC_DSS */ + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 12 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +/* #undef JEMALLOC_RETAIN */ + +/* TLS is used to map arenas and magazine caches to threads. */ +/* #undef JEMALLOC_TLS */ + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. 
+ */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +#define JEMALLOC_ZONE + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */ + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +/* #undef JEMALLOC_HAVE_MADVISE_HUGE */ + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */ + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_DONTDUMP */ + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +#define JEMALLOC_HAVE_MALLOC_SIZE + +/* Define if operating system has alloca.h header. */ +/* #undef JEMALLOC_HAS_ALLOCA_H */ + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. 
*/ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */ + +/* glibc memalign hook. */ +/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */ + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +/* #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP */ + +/* GNU specific sched_getcpu support */ +/* #undef JEMALLOC_HAVE_SCHED_GETCPU */ + +/* GNU specific sched_setaffinity support */ +/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */ + +/* + * If defined, all the features necessary for background threads are present. + */ +/* #undef JEMALLOC_BACKGROUND_THREAD */ + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +/* #undef JEMALLOC_IS_MALLOC */ + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +#define JEMALLOC_HAVE_VM_MAKE_TAG + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_freebsd_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_freebsd_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..0f61417d65f --- /dev/null +++ b/contrib/jemalloc-cmake/include_freebsd_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,427 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +#define JEMALLOC_OVERRIDE___POSIX_MEMALIGN + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. 
*/ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +#define JEMALLOC_HAVE_ISSETUGID + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Only since 12.1-STABLE */ +/* Defined if pthread_setname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */ + +/* Only since 12.1-STABLE */ +/* Defined if pthread_getname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP */ + +/* Defined if pthread_get_name_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GET_NAME_NP + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +#define JEMALLOC_MALLOC_THREAD_CLEANUP + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +/* #undef JEMALLOC_THREADED_INIT */ + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +#define JEMALLOC_MUTEX_INIT_CB + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. 
*/ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +#define JEMALLOC_LAZY_LOCK + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 16 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 29 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +/* #undef JEMALLOC_RETAIN */ + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. 
+ * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +#define JEMALLOC_SYSCTL_VM_OVERCOMMIT +/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */ + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +/* #undef JEMALLOC_HAVE_MADVISE_HUGE */ + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */ + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_DONTDUMP */ + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_NOCORE + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +/* #undef JEMALLOC_HAS_ALLOCA_H */ + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */ + +/* glibc memalign hook. */ +/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */ + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +/* #undef JEMALLOC_HAVE_SCHED_GETCPU */ + +/* GNU specific sched_setaffinity support */ +/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */ + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). 
+ */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_freebsd_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_freebsd_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..32cad025f5f --- /dev/null +++ b/contrib/jemalloc-cmake/include_freebsd_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,427 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +#define JEMALLOC_OVERRIDE___POSIX_MEMALIGN + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT __asm__ volatile("pause") +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 1 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. 
+ */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +#define JEMALLOC_HAVE_ISSETUGID + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Only since 12.1-STABLE */ +/* Defined if pthread_setname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */ + +/* Only since 12.1-STABLE */ +/* Defined if pthread_getname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP */ + +/* Defined if pthread_get_name_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GET_NAME_NP + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +#define JEMALLOC_MALLOC_THREAD_CLEANUP + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +/* #undef JEMALLOC_THREADED_INIT */ + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +#define JEMALLOC_MUTEX_INIT_CB + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. 
*/ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +#define JEMALLOC_LAZY_LOCK + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 12 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +/* #undef JEMALLOC_RETAIN */ + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +#define JEMALLOC_SYSCTL_VM_OVERCOMMIT +/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */ + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +/* #undef JEMALLOC_HAVE_MADVISE_HUGE */ + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. 
+ * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */ + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_DONTDUMP */ + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_NOCORE + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +/* #undef JEMALLOC_HAS_ALLOCA_H */ + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */ + +/* glibc memalign hook. */ +/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */ + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +/* #undef JEMALLOC_HAVE_SCHED_GETCPU */ + +/* GNU specific sched_setaffinity support */ +/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */ + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. 
*/ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/README b/contrib/jemalloc-cmake/include_linux_aarch64/README deleted file mode 100644 index 2ab582803a2..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/README +++ /dev/null @@ -1,7 +0,0 @@ -Here are pre-generated files from jemalloc on Linux aarch64. -You can obtain these files by running ./autogen.sh inside jemalloc source directory. - -Added #define GNU_SOURCE -Added JEMALLOC_OVERRIDE___POSIX_MEMALIGN because why not. -Removed JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF because it's non standard. -Removed JEMALLOC_PURGE_MADVISE_FREE because it's available only from Linux 4.5. diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in similarity index 80% rename from contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h rename to contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in index 5e598348e72..ad535e6d773 100644 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h +++ b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -1,12 +1,6 @@ /* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ #ifndef JEMALLOC_INTERNAL_DEFS_H_ #define JEMALLOC_INTERNAL_DEFS_H_ - - -#ifndef _GNU_SOURCE - #define _GNU_SOURCE -#endif - /* * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all * public APIs to be prefixed. This makes it possible, with some care, to use @@ -19,13 +13,15 @@ * Define overrides for non-standard allocator-related functions if they are * present on the system. */ -#define JEMALLOC_OVERRIDE___LIBC_CALLOC -#define JEMALLOC_OVERRIDE___LIBC_FREE -#define JEMALLOC_OVERRIDE___LIBC_MALLOC -#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN -#define JEMALLOC_OVERRIDE___LIBC_REALLOC -#define JEMALLOC_OVERRIDE___LIBC_VALLOC -#define JEMALLOC_OVERRIDE___POSIX_MEMALIGN +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ /* * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. @@ -51,29 +47,17 @@ #define LG_VADDR 48 /* Defined if C11 atomics are available. */ -#define JEMALLOC_C11_ATOMICS 1 +#define JEMALLOC_C11_ATOMICS /* Defined if GCC __atomic atomics are available. */ -#define JEMALLOC_GCC_ATOMIC_ATOMICS 1 +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS /* Defined if GCC __sync atomics are available. */ -#define JEMALLOC_GCC_SYNC_ATOMICS 1 - -/* - * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and - * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the - * functions are defined in libgcc instead of being inlines). 
- */ -/* #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4 */ - -/* - * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and - * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the - * functions are defined in libgcc instead of being inlines). - */ -/* #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8 */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS /* * Defined if __builtin_clz() and __builtin_clzl() are available. @@ -85,19 +69,13 @@ */ /* #undef JEMALLOC_OS_UNFAIR_LOCK */ -/* - * Defined if OSSpin*() functions are available, as provided by Darwin, and - * documented in the spinlock(3) manual page. - */ -/* #undef JEMALLOC_OSSPIN */ - /* Defined if syscall(2) is usable. */ #define JEMALLOC_USE_SYSCALL /* * Defined if secure_getenv(3) is available. */ -#define JEMALLOC_HAVE_SECURE_GETENV +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ /* * Defined if issetugid(2) is available. @@ -110,21 +88,32 @@ /* Defined if pthread_setname_np(3) is available. */ #define JEMALLOC_HAVE_PTHREAD_SETNAME_NP +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + /* * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. */ -#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1 +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE /* * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. */ -#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1 +#define JEMALLOC_HAVE_CLOCK_MONOTONIC /* * Defined if mach_absolute_time() is available. */ /* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + /* * Defined if _malloc_thread_cleanup() exists. At least in the case of * FreeBSD, pthread_key_create() allocates, which if used during malloc @@ -187,6 +176,9 @@ /* Support utrace(2)-based tracing. */ /* #undef JEMALLOC_UTRACE */ +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + /* Support optional abort() on OOM. */ /* #undef JEMALLOC_XMALLOC */ @@ -202,6 +194,9 @@ /* One page is 2^LG_PAGE bytes. */ #define LG_PAGE 16 +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + /* * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the * system does not explicitly support huge pages; system calls that require @@ -243,6 +238,12 @@ #define JEMALLOC_INTERNAL_FFSL __builtin_ffsl #define JEMALLOC_INTERNAL_FFS __builtin_ffs +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + /* * If defined, explicitly attempt to more uniformly distribute large allocation * pointer alignments across all cache indices. @@ -297,7 +298,7 @@ * MADV_FREE, though typically with higher * system overhead. */ -// #define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_FREE #define JEMALLOC_PURGE_MADVISE_DONTNEED #define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS @@ -309,17 +310,46 @@ */ #define JEMALLOC_MADVISE_DONTDUMP +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. 
*/ +#define JEMALLOC_HAVE_MPROTECT + /* * Defined if transparent huge pages (THPs) are supported via the * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. */ /* #undef JEMALLOC_THP */ +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + /* Define if operating system has alloca.h header. */ -#define JEMALLOC_HAS_ALLOCA_H 1 +#define JEMALLOC_HAS_ALLOCA_H /* C99 restrict keyword supported. */ -#define JEMALLOC_HAS_RESTRICT 1 +#define JEMALLOC_HAS_RESTRICT /* For use by hash code. */ /* #undef JEMALLOC_BIG_ENDIAN */ @@ -360,7 +390,7 @@ /* * If defined, all the features necessary for background threads are present. */ -#define JEMALLOC_BACKGROUND_THREAD 1 +#define JEMALLOC_BACKGROUND_THREAD /* * If defined, jemalloc symbols are not exported (doesn't work when @@ -369,20 +399,29 @@ /* #undef JEMALLOC_EXPORT */ /* config.malloc_conf options string. */ -#define JEMALLOC_CONFIG_MALLOC_CONF "" +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" /* If defined, jemalloc takes the malloc/free/etc. symbol names. */ -#define JEMALLOC_IS_MALLOC 1 +#define JEMALLOC_IS_MALLOC /* * Defined if strerror_r returns char * if _GNU_SOURCE is defined. */ #define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE -/* - * popcount*() functions to use for bitmapping. - */ -#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl -#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_defs.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_defs.h deleted file mode 100644 index d1389237a77..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_defs.h +++ /dev/null @@ -1,43 +0,0 @@ -/* include/jemalloc/jemalloc_defs.h. Generated from jemalloc_defs.h.in by configure. */ -/* Defined if __attribute__((...)) syntax is supported. */ -#define JEMALLOC_HAVE_ATTR - -/* Defined if alloc_size attribute is supported. */ -#define JEMALLOC_HAVE_ATTR_ALLOC_SIZE - -/* Defined if format(printf, ...) attribute is supported. */ -#define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF - -/* - * Define overrides for non-standard allocator-related functions if they are - * present on the system. - */ -#define JEMALLOC_OVERRIDE_MEMALIGN -#define JEMALLOC_OVERRIDE_VALLOC - -/* - * At least Linux omits the "const" in: - * - * size_t malloc_usable_size(const void *ptr); - * - * Match the operating system's prototype. - */ -#define JEMALLOC_USABLE_SIZE_CONST - -/* - * If defined, specify throw() for the public function prototypes when compiling - * with C++. 
The only justification for this is to match the prototypes that - * glibc defines. - */ -#define JEMALLOC_USE_CXX_THROW - -#ifdef _MSC_VER -# ifdef _WIN64 -# define LG_SIZEOF_PTR_WIN 3 -# else -# define LG_SIZEOF_PTR_WIN 2 -# endif -#endif - -/* sizeof(void *) == 2^LG_SIZEOF_PTR. */ -#define LG_SIZEOF_PTR 3 diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_macros.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_macros.h deleted file mode 100644 index 34235894285..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_macros.h +++ /dev/null @@ -1,129 +0,0 @@ -#include -#include -#include -#include -#include - -#define JEMALLOC_VERSION "5.2.1-0-gea6b3e973b477b8061e0076bb257dbd7f3faa756" -#define JEMALLOC_VERSION_MAJOR 5 -#define JEMALLOC_VERSION_MINOR 2 -#define JEMALLOC_VERSION_BUGFIX 1 -#define JEMALLOC_VERSION_NREV 0 -#define JEMALLOC_VERSION_GID "ea6b3e973b477b8061e0076bb257dbd7f3faa756" -#define JEMALLOC_VERSION_GID_IDENT ea6b3e973b477b8061e0076bb257dbd7f3faa756 - -#define MALLOCX_LG_ALIGN(la) ((int)(la)) -#if LG_SIZEOF_PTR == 2 -# define MALLOCX_ALIGN(a) ((int)(ffs((int)(a))-1)) -#else -# define MALLOCX_ALIGN(a) \ - ((int)(((size_t)(a) < (size_t)INT_MAX) ? ffs((int)(a))-1 : \ - ffs((int)(((size_t)(a))>>32))+31)) -#endif -#define MALLOCX_ZERO ((int)0x40) -/* - * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 - * encodes MALLOCX_TCACHE_NONE. - */ -#define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) -#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) -/* - * Bias arena index bits so that 0 encodes "use an automatically chosen arena". - */ -#define MALLOCX_ARENA(a) ((((int)(a))+1) << 20) - -/* - * Use as arena index in "arena..{purge,decay,dss}" and - * "stats.arenas..*" mallctl interfaces to select all arenas. This - * definition is intentionally specified in raw decimal format to support - * cpp-based string concatenation, e.g. - * - * #define STRINGIFY_HELPER(x) #x - * #define STRINGIFY(x) STRINGIFY_HELPER(x) - * - * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, - * 0); - */ -#define MALLCTL_ARENAS_ALL 4096 -/* - * Use as arena index in "stats.arenas..*" mallctl interfaces to select - * destroyed arenas. 
- */ -#define MALLCTL_ARENAS_DESTROYED 4097 - -#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) -# define JEMALLOC_CXX_THROW throw() -#else -# define JEMALLOC_CXX_THROW -#endif - -#if defined(_MSC_VER) -# define JEMALLOC_ATTR(s) -# define JEMALLOC_ALIGNED(s) __declspec(align(s)) -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# ifndef JEMALLOC_EXPORT -# ifdef DLLEXPORT -# define JEMALLOC_EXPORT __declspec(dllexport) -# else -# define JEMALLOC_EXPORT __declspec(dllimport) -# endif -# endif -# define JEMALLOC_FORMAT_ARG(i) -# define JEMALLOC_FORMAT_PRINTF(s, i) -# define JEMALLOC_NOINLINE __declspec(noinline) -# ifdef __cplusplus -# define JEMALLOC_NOTHROW __declspec(nothrow) -# else -# define JEMALLOC_NOTHROW -# endif -# define JEMALLOC_SECTION(s) __declspec(allocate(s)) -# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) -# if _MSC_VER >= 1900 && !defined(__EDG__) -# define JEMALLOC_ALLOCATOR __declspec(allocator) -# else -# define JEMALLOC_ALLOCATOR -# endif -#elif defined(JEMALLOC_HAVE_ATTR) -# define JEMALLOC_ATTR(s) __attribute__((s)) -# define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) -# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE -# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) -# else -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# endif -# ifndef JEMALLOC_EXPORT -# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) -# endif -# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG -# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) -# else -# define JEMALLOC_FORMAT_ARG(i) -# endif -# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF -# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) -# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) -# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) -# else -# define JEMALLOC_FORMAT_PRINTF(s, i) -# endif -# define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) -# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) -# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) -# define JEMALLOC_RESTRICT_RETURN -# define JEMALLOC_ALLOCATOR -#else -# define JEMALLOC_ATTR(s) -# define JEMALLOC_ALIGNED(s) -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# define JEMALLOC_EXPORT -# define JEMALLOC_FORMAT_PRINTF(s, i) -# define JEMALLOC_NOINLINE -# define JEMALLOC_NOTHROW -# define JEMALLOC_SECTION(s) -# define JEMALLOC_RESTRICT_RETURN -# define JEMALLOC_ALLOCATOR -#endif diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_protos.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_protos.h deleted file mode 100644 index ff025e30fa7..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_protos.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * The je_ prefix on the following public symbol declarations is an artifact - * of namespace management, and should be omitted in application code unless - * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h). 
- */ -extern JEMALLOC_EXPORT const char *je_malloc_conf; -extern JEMALLOC_EXPORT void (*je_malloc_message)(void *cbopaque, - const char *s); - -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_malloc(size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_calloc(size_t num, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_posix_memalign(void **memptr, - size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1)); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_aligned_alloc(size_t alignment, - size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) - JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_realloc(void *ptr, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free(void *ptr) - JEMALLOC_CXX_THROW; - -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_mallocx(size_t size, int flags) - JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_rallocx(void *ptr, size_t size, - int flags) JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, - size_t extra, int flags); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_sallocx(const void *ptr, - int flags) JEMALLOC_ATTR(pure); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, - int flags); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) - JEMALLOC_ATTR(pure); - -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char *name, - void *oldp, size_t *oldlenp, void *newp, size_t newlen); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, - size_t *mibp, size_t *miblenp); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, - size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print( - void (*write_cb)(void *, const char *), void *je_cbopaque, - const char *opts); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( - JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; - -#ifdef JEMALLOC_OVERRIDE_MEMALIGN -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_memalign(size_t alignment, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); -#endif - -#ifdef JEMALLOC_OVERRIDE_VALLOC -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_valloc(size_t size) JEMALLOC_CXX_THROW - JEMALLOC_ATTR(malloc); -#endif diff --git a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_typedefs.h b/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_typedefs.h deleted file mode 100644 index 1a58874306e..00000000000 --- a/contrib/jemalloc-cmake/include_linux_aarch64/jemalloc/jemalloc_typedefs.h +++ /dev/null @@ -1,77 +0,0 @@ -typedef struct extent_hooks_s extent_hooks_t; - -/* - * void * - * extent_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, - * size_t alignment, bool *zero, bool *commit, unsigned arena_ind); - */ -typedef void 
*(extent_alloc_t)(extent_hooks_t *, void *, size_t, size_t, bool *, - bool *, unsigned); - -/* - * bool - * extent_dalloc(extent_hooks_t *extent_hooks, void *addr, size_t size, - * bool committed, unsigned arena_ind); - */ -typedef bool (extent_dalloc_t)(extent_hooks_t *, void *, size_t, bool, - unsigned); - -/* - * void - * extent_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, - * bool committed, unsigned arena_ind); - */ -typedef void (extent_destroy_t)(extent_hooks_t *, void *, size_t, bool, - unsigned); - -/* - * bool - * extent_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, - * size_t offset, size_t length, unsigned arena_ind); - */ -typedef bool (extent_commit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - unsigned); - -/* - * bool - * extent_decommit(extent_hooks_t *extent_hooks, void *addr, size_t size, - * size_t offset, size_t length, unsigned arena_ind); - */ -typedef bool (extent_decommit_t)(extent_hooks_t *, void *, size_t, size_t, - size_t, unsigned); - -/* - * bool - * extent_purge(extent_hooks_t *extent_hooks, void *addr, size_t size, - * size_t offset, size_t length, unsigned arena_ind); - */ -typedef bool (extent_purge_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - unsigned); - -/* - * bool - * extent_split(extent_hooks_t *extent_hooks, void *addr, size_t size, - * size_t size_a, size_t size_b, bool committed, unsigned arena_ind); - */ -typedef bool (extent_split_t)(extent_hooks_t *, void *, size_t, size_t, size_t, - bool, unsigned); - -/* - * bool - * extent_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, - * void *addr_b, size_t size_b, bool committed, unsigned arena_ind); - */ -typedef bool (extent_merge_t)(extent_hooks_t *, void *, size_t, void *, size_t, - bool, unsigned); - -struct extent_hooks_s { - extent_alloc_t *alloc; - extent_dalloc_t *dalloc; - extent_destroy_t *destroy; - extent_commit_t *commit; - extent_decommit_t *decommit; - extent_purge_t *purge_lazy; - extent_purge_t *purge_forced; - extent_split_t *split; - extent_merge_t *merge; -}; diff --git a/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..12890f80ef1 --- /dev/null +++ b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,427 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. 
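
The extent_hooks_s table defined in the typedefs header above is installed per arena through the mallctl interface. A minimal sketch that wraps only the alloc hook, assuming the standard public API; my_extent_alloc and arena index 0 are illustrative:

#include <stdbool.h>
#include <stddef.h>
#include <jemalloc/jemalloc.h>

static extent_hooks_t *orig_hooks;
static extent_hooks_t  my_hooks;

static void *
my_extent_alloc(extent_hooks_t *hooks, void *new_addr, size_t size,
                size_t alignment, bool *zero, bool *commit, unsigned arena_ind) {
    /* Custom bookkeeping could go here; delegate to the saved defaults. */
    return orig_hooks->alloc(orig_hooks, new_addr, size, alignment, zero,
                             commit, arena_ind);
}

int install_hooks(void) {
    extent_hooks_t *new_hooks = &my_hooks;
    size_t sz = sizeof(orig_hooks);
    /* Read the current table first so the wrapper can fall back to it. */
    if (mallctl("arena.0.extent_hooks", &orig_hooks, &sz, NULL, 0) != 0)
        return 1;
    my_hooks = *orig_hooks;
    my_hooks.alloc = my_extent_alloc;
    return mallctl("arena.0.extent_hooks", NULL, NULL, &new_hooks,
                   sizeof(new_hooks));
}
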
+ * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 64 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_ATFORK */ + +/* Defined if pthread_setname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#define JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. 
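
JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE above selects Linux's cheap, tick-granularity clock for the allocator's internal timing. A small sketch of the Linux-specific call it enables:

#include <stdio.h>
#include <time.h>

int main(void) {
    struct timespec ts;
    /* CLOCK_MONOTONIC_COARSE trades resolution for a much cheaper read. */
    if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ts) == 0)
        printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
    return 0;
}
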
*/ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 16 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#define JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. 
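
The LG_* constants above are base-2 logarithms, so the values in this ppc64le configuration work out as follows (a worked sketch):

#include <stdio.h>

int main(void) {
    printf("page size: %zu bytes\n", (size_t)1 << 16); /* LG_PAGE 16 -> 64 KiB */
    printf("huge page: %zu bytes\n", (size_t)1 << 21); /* LG_HUGEPAGE 21 -> 2 MiB */
    return 0;
}
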
We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#define JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +#define JEMALLOC_DEFINE_MADVISE_FREE + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +#define JEMALLOC_HAS_ALLOCA_H + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#define JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. 
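
A sketch of the two purge paths named above, assuming a Linux target (MADV_FREE needs kernel 4.5+; the wrapper names are illustrative):

#include <stddef.h>
#include <sys/mman.h>

/* Lazy purge: the mapping survives and pages are reclaimed under pressure. */
static int purge_lazy(void *addr, size_t len) {
    return madvise(addr, len, MADV_FREE);
}

/* Forced purge: pages are dropped now and demand-zeroed on the next touch. */
static int purge_forced(void *addr, size_t len) {
    return madvise(addr, len, MADV_DONTNEED);
}
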
*/ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_riscv64/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_riscv64/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..ad535e6d773 --- /dev/null +++ b/contrib/jemalloc-cmake/include_linux_riscv64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,427 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. 
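
The "@JEMALLOC_CONFIG_MALLOC_CONF@" placeholder above is substituted by the build system; applications can layer their own defaults through the documented malloc_conf symbol. A sketch, with illustrative option values:

#include <jemalloc/jemalloc.h>

/* Read once by jemalloc during bootstrap. */
const char *malloc_conf = "background_thread:true,narenas:4";
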
*/ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#define JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). 
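
JEMALLOC_HAVE_BUILTIN_CLZ above lets size-class math run in constant time. A worked example of the round-up-to-power-of-two idiom it supports:

#include <stdio.h>

static unsigned pow2_ceil(unsigned x) {
    /* For x > 1, 32 - clz(x - 1) is the exponent of the next power of two. */
    return (x <= 1) ? 1 : 1u << (32 - __builtin_clz(x - 1));
}

int main(void) {
    printf("%u\n", pow2_ceil(3000)); /* -> 4096 */
    return 0;
}
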
*/ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 16 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 29 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#define JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#define JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. 
+ * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +#define JEMALLOC_HAS_ALLOCA_H + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#define JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? 
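
The LG_SIZEOF_* entries above encode type sizes as base-2 logs; a sketch verifying them on an LP64 target such as this riscv64 configuration, using C11 static_assert:

#include <assert.h>
#include <stdint.h>

static_assert(sizeof(int)       == (1 << 2), "LG_SIZEOF_INT 2");
static_assert(sizeof(long)      == (1 << 3), "LG_SIZEOF_LONG 3");
static_assert(sizeof(long long) == (1 << 3), "LG_SIZEOF_LONG_LONG 3");
static_assert(sizeof(intmax_t)  == (1 << 3), "LG_SIZEOF_INTMAX_T 3");
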
*/ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/README b/contrib/jemalloc-cmake/include_linux_x86_64/README deleted file mode 100644 index bf7663bda8d..00000000000 --- a/contrib/jemalloc-cmake/include_linux_x86_64/README +++ /dev/null @@ -1,7 +0,0 @@ -Here are pre-generated files from jemalloc on Linux x86_64. -You can obtain these files by running ./autogen.sh inside jemalloc source directory. - -Added #define GNU_SOURCE -Added JEMALLOC_OVERRIDE___POSIX_MEMALIGN because why not. -Removed JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF because it's non standard. -Removed JEMALLOC_PURGE_MADVISE_FREE because it's available only from Linux 4.5. diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in similarity index 78% rename from contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h rename to contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in index 7c21fa79397..99ab2d53ca9 100644 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h +++ b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -1,11 +1,6 @@ /* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ #ifndef JEMALLOC_INTERNAL_DEFS_H_ #define JEMALLOC_INTERNAL_DEFS_H_ - -#ifndef _GNU_SOURCE - #define _GNU_SOURCE -#endif - /* * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all * public APIs to be prefixed. This makes it possible, with some care, to use @@ -18,13 +13,15 @@ * Define overrides for non-standard allocator-related functions if they are * present on the system. */ -#define JEMALLOC_OVERRIDE___LIBC_CALLOC -#define JEMALLOC_OVERRIDE___LIBC_FREE -#define JEMALLOC_OVERRIDE___LIBC_MALLOC -#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN -#define JEMALLOC_OVERRIDE___LIBC_REALLOC -#define JEMALLOC_OVERRIDE___LIBC_VALLOC -#define JEMALLOC_OVERRIDE___POSIX_MEMALIGN +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ /* * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. @@ -50,29 +47,17 @@ #define LG_VADDR 48 /* Defined if C11 atomics are available. */ -#define JEMALLOC_C11_ATOMICS 1 +#define JEMALLOC_C11_ATOMICS /* Defined if GCC __atomic atomics are available. */ -#define JEMALLOC_GCC_ATOMIC_ATOMICS 1 +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS /* Defined if GCC __sync atomics are available. 
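
The JEMALLOC_GCC_*_ATOMICS probes in this hunk pick between the modern __atomic builtins and the legacy __sync family. For comparison, both spellings of the same fetch-add:

#include <stdint.h>

uint32_t bump_atomic(uint32_t *p) {
    return __atomic_fetch_add(p, 1, __ATOMIC_RELAXED);
}

uint32_t bump_sync(uint32_t *p) {
    return __sync_fetch_and_add(p, 1);
}
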
*/ -#define JEMALLOC_GCC_SYNC_ATOMICS 1 - -/* - * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and - * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the - * functions are defined in libgcc instead of being inlines). - */ -/* #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4 */ - -/* - * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and - * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the - * functions are defined in libgcc instead of being inlines). - */ -/* #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8 */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS /* * Defined if __builtin_clz() and __builtin_clzl() are available. @@ -84,20 +69,13 @@ */ /* #undef JEMALLOC_OS_UNFAIR_LOCK */ -/* - * Defined if OSSpin*() functions are available, as provided by Darwin, and - * documented in the spinlock(3) manual page. - */ -/* #undef JEMALLOC_OSSPIN */ - /* Defined if syscall(2) is usable. */ #define JEMALLOC_USE_SYSCALL /* * Defined if secure_getenv(3) is available. */ -// Don't want dependency on newer GLIBC -//#define JEMALLOC_HAVE_SECURE_GETENV +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ /* * Defined if issetugid(2) is available. @@ -110,21 +88,32 @@ /* Defined if pthread_setname_np(3) is available. */ #define JEMALLOC_HAVE_PTHREAD_SETNAME_NP +/* Defined if pthread_getname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + /* * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. */ -#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1 +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE /* * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. */ -#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1 +#define JEMALLOC_HAVE_CLOCK_MONOTONIC /* * Defined if mach_absolute_time() is available. */ /* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + /* * Defined if _malloc_thread_cleanup() exists. At least in the case of * FreeBSD, pthread_key_create() allocates, which if used during malloc @@ -160,6 +149,9 @@ /* JEMALLOC_STATS enables statistics calculation. */ #define JEMALLOC_STATS +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + /* JEMALLOC_PROF enables allocation profiling. */ /* #undef JEMALLOC_PROF */ @@ -184,6 +176,9 @@ /* Support utrace(2)-based tracing. */ /* #undef JEMALLOC_UTRACE */ +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + /* Support optional abort() on OOM. */ /* #undef JEMALLOC_XMALLOC */ @@ -199,6 +194,9 @@ /* One page is 2^LG_PAGE bytes. */ #define LG_PAGE 12 +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + /* * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the * system does not explicitly support huge pages; system calls that require @@ -240,6 +238,12 @@ #define JEMALLOC_INTERNAL_FFSL __builtin_ffsl #define JEMALLOC_INTERNAL_FFS __builtin_ffs +/* + * popcount*() functions to use for bitmapping. 
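
The ffs/popcount builtins wired up above drive jemalloc's bitmaps. A small worked example, with inputs chosen arbitrarily:

#include <stdio.h>

int main(void) {
    /* __builtin_ffs: 1-based index of the least significant set bit (0 if none). */
    printf("%d\n", __builtin_ffs(0x18));         /* 0b11000 -> 4 */
    /* __builtin_popcountl: number of set bits. */
    printf("%d\n", __builtin_popcountl(0xF0UL)); /* -> 4 */
    return 0;
}
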
+ */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + /* * If defined, explicitly attempt to more uniformly distribute large allocation * pointer alignments across all cache indices. @@ -252,6 +256,12 @@ */ /* #undef JEMALLOC_LOG */ +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + /* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. */ @@ -288,7 +298,7 @@ * MADV_FREE, though typically with higher * system overhead. */ -//#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_FREE #define JEMALLOC_PURGE_MADVISE_DONTNEED #define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS @@ -300,17 +310,46 @@ */ #define JEMALLOC_MADVISE_DONTDUMP +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + /* * Defined if transparent huge pages (THPs) are supported via the * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. */ /* #undef JEMALLOC_THP */ +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + /* Define if operating system has alloca.h header. */ -#define JEMALLOC_HAS_ALLOCA_H 1 +#define JEMALLOC_HAS_ALLOCA_H /* C99 restrict keyword supported. */ -#define JEMALLOC_HAS_RESTRICT 1 +#define JEMALLOC_HAS_RESTRICT /* For use by hash code. */ /* #undef JEMALLOC_BIG_ENDIAN */ @@ -351,7 +390,7 @@ /* * If defined, all the features necessary for background threads are present. */ -#define JEMALLOC_BACKGROUND_THREAD 1 +#define JEMALLOC_BACKGROUND_THREAD /* * If defined, jemalloc symbols are not exported (doesn't work when @@ -360,20 +399,29 @@ /* #undef JEMALLOC_EXPORT */ /* config.malloc_conf options string. */ -#define JEMALLOC_CONFIG_MALLOC_CONF "" +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" /* If defined, jemalloc takes the malloc/free/etc. symbol names. */ -#define JEMALLOC_IS_MALLOC 1 +#define JEMALLOC_IS_MALLOC /* * Defined if strerror_r returns char * if _GNU_SOURCE is defined. */ #define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE -/* - * popcount*() functions to use for bitmapping. - */ -#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl -#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. 
*/ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_preamble.h b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_preamble.h deleted file mode 100644 index d79551e1f25..00000000000 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/internal/jemalloc_preamble.h +++ /dev/null @@ -1,216 +0,0 @@ -#ifndef JEMALLOC_PREAMBLE_H -#define JEMALLOC_PREAMBLE_H - -#include "jemalloc_internal_defs.h" -#include "jemalloc/internal/jemalloc_internal_decls.h" - -#ifdef JEMALLOC_UTRACE -#include -#endif - -#define JEMALLOC_NO_DEMANGLE -#ifdef JEMALLOC_JET -# undef JEMALLOC_IS_MALLOC -# define JEMALLOC_N(n) jet_##n -# include "jemalloc/internal/public_namespace.h" -# define JEMALLOC_NO_RENAME -# include "jemalloc/jemalloc.h" -# undef JEMALLOC_NO_RENAME -#else -# define JEMALLOC_N(n) je_##n -# include "jemalloc/jemalloc.h" -#endif - -#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN)) -#include -#endif - -#ifdef JEMALLOC_ZONE -#include -#include -#include -#endif - -#include "jemalloc/internal/jemalloc_internal_macros.h" - -/* - * Note that the ordering matters here; the hook itself is name-mangled. We - * want the inclusion of hooks to happen early, so that we hook as much as - * possible. - */ -#ifndef JEMALLOC_NO_PRIVATE_NAMESPACE -# ifndef JEMALLOC_JET -# include "jemalloc/internal/private_namespace.h" -# else -# include "jemalloc/internal/private_namespace_jet.h" -# endif -#endif -#include "jemalloc/internal/test_hooks.h" - -#ifdef JEMALLOC_DEFINE_MADVISE_FREE -# define JEMALLOC_MADV_FREE 8 -#endif - -static const bool config_debug = -#ifdef JEMALLOC_DEBUG - true -#else - false -#endif - ; -static const bool have_dss = -#ifdef JEMALLOC_DSS - true -#else - false -#endif - ; -static const bool have_madvise_huge = -#ifdef JEMALLOC_HAVE_MADVISE_HUGE - true -#else - false -#endif - ; -static const bool config_fill = -#ifdef JEMALLOC_FILL - true -#else - false -#endif - ; -static const bool config_lazy_lock = -#ifdef JEMALLOC_LAZY_LOCK - true -#else - false -#endif - ; -static const char * const config_malloc_conf = JEMALLOC_CONFIG_MALLOC_CONF; -static const bool config_prof = -#ifdef JEMALLOC_PROF - true -#else - false -#endif - ; -static const bool config_prof_libgcc = -#ifdef JEMALLOC_PROF_LIBGCC - true -#else - false -#endif - ; -static const bool config_prof_libunwind = -#ifdef JEMALLOC_PROF_LIBUNWIND - true -#else - false -#endif - ; -static const bool maps_coalesce = -#ifdef JEMALLOC_MAPS_COALESCE - true -#else - false -#endif - ; -static const bool config_stats = -#ifdef JEMALLOC_STATS - true -#else - false -#endif - ; -static const bool config_tls = -#ifdef JEMALLOC_TLS - true -#else - false -#endif - ; -static const bool config_utrace = -#ifdef JEMALLOC_UTRACE - true -#else - false -#endif - ; -static const bool config_xmalloc = -#ifdef JEMALLOC_XMALLOC - true -#else - false -#endif - ; -static const bool config_cache_oblivious = -#ifdef JEMALLOC_CACHE_OBLIVIOUS - true -#else - false -#endif - ; -/* - * Undocumented, for jemalloc development use only at the moment. See the note - * in jemalloc/internal/log.h. - */ -static const bool config_log = -#ifdef JEMALLOC_LOG - true -#else - false -#endif - ; -#ifdef JEMALLOC_HAVE_SCHED_GETCPU -/* Currently percpu_arena depends on sched_getcpu. 
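
Every config_* constant in the preamble above surfaces through the read-only "config.*" mallctl namespace, so a build can be inspected at run time. A minimal sketch, assuming the standard public API:

#include <stdbool.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

int main(void) {
    bool stats;
    size_t sz = sizeof(stats);
    if (mallctl("config.stats", &stats, &sz, NULL, 0) == 0)
        printf("built with JEMALLOC_STATS: %s\n", stats ? "yes" : "no");
    return 0;
}
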
*/ -#define JEMALLOC_PERCPU_ARENA -#endif -static const bool have_percpu_arena = -#ifdef JEMALLOC_PERCPU_ARENA - true -#else - false -#endif - ; -/* - * Undocumented, and not recommended; the application should take full - * responsibility for tracking provenance. - */ -static const bool force_ivsalloc = -#ifdef JEMALLOC_FORCE_IVSALLOC - true -#else - false -#endif - ; -static const bool have_background_thread = -#ifdef JEMALLOC_BACKGROUND_THREAD - true -#else - false -#endif - ; - -#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS 1 -#define JEMALLOC_GCC_U8_SYNC_ATOMICS 1 - -/* - * Are extra safety checks enabled; things like checking the size of sized - * deallocations, double-frees, etc. - */ -static const bool config_opt_safety_checks = -#ifdef JEMALLOC_OPT_SAFETY_CHECKS - true -#elif defined(JEMALLOC_DEBUG) - /* - * This lets us only guard safety checks by one flag instead of two; fast - * checks can guard solely by config_opt_safety_checks and run in debug mode - * too. - */ - true -#else - false -#endif - ; - -#endif /* JEMALLOC_PREAMBLE_H */ diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_macros.h b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_macros.h deleted file mode 100644 index 34235894285..00000000000 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_macros.h +++ /dev/null @@ -1,129 +0,0 @@ -#include -#include -#include -#include -#include - -#define JEMALLOC_VERSION "5.2.1-0-gea6b3e973b477b8061e0076bb257dbd7f3faa756" -#define JEMALLOC_VERSION_MAJOR 5 -#define JEMALLOC_VERSION_MINOR 2 -#define JEMALLOC_VERSION_BUGFIX 1 -#define JEMALLOC_VERSION_NREV 0 -#define JEMALLOC_VERSION_GID "ea6b3e973b477b8061e0076bb257dbd7f3faa756" -#define JEMALLOC_VERSION_GID_IDENT ea6b3e973b477b8061e0076bb257dbd7f3faa756 - -#define MALLOCX_LG_ALIGN(la) ((int)(la)) -#if LG_SIZEOF_PTR == 2 -# define MALLOCX_ALIGN(a) ((int)(ffs((int)(a))-1)) -#else -# define MALLOCX_ALIGN(a) \ - ((int)(((size_t)(a) < (size_t)INT_MAX) ? ffs((int)(a))-1 : \ - ffs((int)(((size_t)(a))>>32))+31)) -#endif -#define MALLOCX_ZERO ((int)0x40) -/* - * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 - * encodes MALLOCX_TCACHE_NONE. - */ -#define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) -#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) -/* - * Bias arena index bits so that 0 encodes "use an automatically chosen arena". - */ -#define MALLOCX_ARENA(a) ((((int)(a))+1) << 20) - -/* - * Use as arena index in "arena..{purge,decay,dss}" and - * "stats.arenas..*" mallctl interfaces to select all arenas. This - * definition is intentionally specified in raw decimal format to support - * cpp-based string concatenation, e.g. - * - * #define STRINGIFY_HELPER(x) #x - * #define STRINGIFY(x) STRINGIFY_HELPER(x) - * - * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, - * 0); - */ -#define MALLCTL_ARENAS_ALL 4096 -/* - * Use as arena index in "stats.arenas..*" mallctl interfaces to select - * destroyed arenas. 
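
A sketch of composing the MALLOCX_* flags defined above (the size and alignment are illustrative):

#include <jemalloc/jemalloc.h>

void *aligned_zeroed_4k(void) {
    /* 4 KiB, 64-byte aligned, zero-filled; default tcache and arena. */
    return mallocx(4096, MALLOCX_ALIGN(64) | MALLOCX_ZERO);
}
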
- */ -#define MALLCTL_ARENAS_DESTROYED 4097 - -#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) -# define JEMALLOC_CXX_THROW throw() -#else -# define JEMALLOC_CXX_THROW -#endif - -#if defined(_MSC_VER) -# define JEMALLOC_ATTR(s) -# define JEMALLOC_ALIGNED(s) __declspec(align(s)) -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# ifndef JEMALLOC_EXPORT -# ifdef DLLEXPORT -# define JEMALLOC_EXPORT __declspec(dllexport) -# else -# define JEMALLOC_EXPORT __declspec(dllimport) -# endif -# endif -# define JEMALLOC_FORMAT_ARG(i) -# define JEMALLOC_FORMAT_PRINTF(s, i) -# define JEMALLOC_NOINLINE __declspec(noinline) -# ifdef __cplusplus -# define JEMALLOC_NOTHROW __declspec(nothrow) -# else -# define JEMALLOC_NOTHROW -# endif -# define JEMALLOC_SECTION(s) __declspec(allocate(s)) -# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) -# if _MSC_VER >= 1900 && !defined(__EDG__) -# define JEMALLOC_ALLOCATOR __declspec(allocator) -# else -# define JEMALLOC_ALLOCATOR -# endif -#elif defined(JEMALLOC_HAVE_ATTR) -# define JEMALLOC_ATTR(s) __attribute__((s)) -# define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) -# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE -# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) -# else -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# endif -# ifndef JEMALLOC_EXPORT -# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) -# endif -# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG -# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) -# else -# define JEMALLOC_FORMAT_ARG(i) -# endif -# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF -# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) -# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) -# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) -# else -# define JEMALLOC_FORMAT_PRINTF(s, i) -# endif -# define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) -# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) -# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) -# define JEMALLOC_RESTRICT_RETURN -# define JEMALLOC_ALLOCATOR -#else -# define JEMALLOC_ATTR(s) -# define JEMALLOC_ALIGNED(s) -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) -# define JEMALLOC_EXPORT -# define JEMALLOC_FORMAT_PRINTF(s, i) -# define JEMALLOC_NOINLINE -# define JEMALLOC_NOTHROW -# define JEMALLOC_SECTION(s) -# define JEMALLOC_RESTRICT_RETURN -# define JEMALLOC_ALLOCATOR -#endif diff --git a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_protos.h b/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_protos.h deleted file mode 100644 index ff025e30fa7..00000000000 --- a/contrib/jemalloc-cmake/include_linux_x86_64/jemalloc/jemalloc_protos.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * The je_ prefix on the following public symbol declarations is an artifact - * of namespace management, and should be omitted in application code unless - * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h). 
- */ -extern JEMALLOC_EXPORT const char *je_malloc_conf; -extern JEMALLOC_EXPORT void (*je_malloc_message)(void *cbopaque, - const char *s); - -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_malloc(size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_calloc(size_t num, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_posix_memalign(void **memptr, - size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1)); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_aligned_alloc(size_t alignment, - size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) - JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_realloc(void *ptr, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free(void *ptr) - JEMALLOC_CXX_THROW; - -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_mallocx(size_t size, int flags) - JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_rallocx(void *ptr, size_t size, - int flags) JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, - size_t extra, int flags); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_sallocx(const void *ptr, - int flags) JEMALLOC_ATTR(pure); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, - int flags); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) - JEMALLOC_ATTR(pure); - -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char *name, - void *oldp, size_t *oldlenp, void *newp, size_t newlen); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, - size_t *mibp, size_t *miblenp); -JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, - size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); -JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print( - void (*write_cb)(void *, const char *), void *je_cbopaque, - const char *opts); -JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( - JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; - -#ifdef JEMALLOC_OVERRIDE_MEMALIGN -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_memalign(size_t alignment, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); -#endif - -#ifdef JEMALLOC_OVERRIDE_VALLOC -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - void JEMALLOC_NOTHROW *je_valloc(size_t size) JEMALLOC_CXX_THROW - JEMALLOC_ATTR(malloc); -#endif diff --git a/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..684d4debb14 --- /dev/null +++ b/contrib/jemalloc-cmake/include_linux_x86_64_musl/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,428 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. 
*/ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#if !defined(USE_MUSL) + #define JEMALLOC_OVERRIDE___LIBC_CALLOC + #define JEMALLOC_OVERRIDE___LIBC_FREE + #define JEMALLOC_OVERRIDE___LIBC_MALLOC + #define JEMALLOC_OVERRIDE___LIBC_MEMALIGN + #define JEMALLOC_OVERRIDE___LIBC_REALLOC + #define JEMALLOC_OVERRIDE___LIBC_VALLOC +#endif +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT __asm__ volatile("pause") +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 1 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/// musl doesn't support it +/* Defined if pthread_getname_np(3) is available. */ +/* #define JEMALLOC_HAVE_PTHREAD_GETNAME_NP */ + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. 
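
On this x86_64-musl target CPU_SPINWAIT expands to the PAUSE hint. A sketch of the spin-loop pattern it exists for, using C11 atomics (the flag is illustrative):

#include <stdatomic.h>

#define CPU_SPINWAIT __asm__ volatile("pause")

void spin_until_set(atomic_bool *flag) {
    while (!atomic_load_explicit(flag, memory_order_acquire))
        CPU_SPINWAIT; /* yield execution resources to the sibling hyper-thread */
}
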
At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#define JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 12 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#define JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. 
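
The initial-exec model selected by JEMALLOC_TLS_MODEL above makes each thread-local read a fixed offset from the thread pointer, at the cost of not being usable from a dlopen'ed library. Applying it looks like this (the variable name is illustrative; GCC/Clang syntax):

static __thread void *tsd_cache __attribute__((tls_model("initial-exec")));

void *get_tsd_cache(void) {
    return tsd_cache;
}
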
*/ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#define JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. 
+ * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +#define JEMALLOC_HAS_ALLOCA_H + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#define JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. 
*/ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/kvproto b/contrib/kvproto index 12e2f5a9d16..a5d4ffd2ba3 160000 --- a/contrib/kvproto +++ b/contrib/kvproto @@ -1 +1 @@ -Subproject commit 12e2f5a9d167f46602804840857ddc8ff06dc695 +Subproject commit a5d4ffd2ba337dad0bc99e9fb53bf665864a3f3b diff --git a/contrib/prometheus-cpp b/contrib/prometheus-cpp index ca1f3463e74..76470b3ec02 160000 --- a/contrib/prometheus-cpp +++ b/contrib/prometheus-cpp @@ -1 +1 @@ -Subproject commit ca1f3463e74d957d1cccddd4a1a29e3e5d34bd83 +Subproject commit 76470b3ec024c8214e1f4253fb1f4c0b28d3df94 diff --git a/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt b/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt index daebd1b7c5a..993618e16ac 100644 --- a/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt +++ b/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt @@ -12,9 +12,18 @@ if(ENABLE_COMPRESSION) endif() add_library(pull + ${PROMETHEUS_SRC_DIR}/pull/src/basic_auth.cc + ${PROMETHEUS_SRC_DIR}/pull/src/basic_auth.h + ${PROMETHEUS_SRC_DIR}/pull/src/endpoint.cc + ${PROMETHEUS_SRC_DIR}/pull/src/endpoint.h ${PROMETHEUS_SRC_DIR}/pull/src/exposer.cc ${PROMETHEUS_SRC_DIR}/pull/src/handler.cc ${PROMETHEUS_SRC_DIR}/pull/src/handler.h + ${PROMETHEUS_SRC_DIR}/pull/src/metrics_collector.cc + ${PROMETHEUS_SRC_DIR}/pull/src/metrics_collector.h + + ${PROMETHEUS_SRC_DIR}/pull/src/detail/base64.h + $<$:$> ) diff --git a/contrib/prometheus-cpp-cmake/push/CMakeLists.txt b/contrib/prometheus-cpp-cmake/push/CMakeLists.txt index 71dad9fb812..b776d17bdaf 100644 --- a/contrib/prometheus-cpp-cmake/push/CMakeLists.txt +++ b/contrib/prometheus-cpp-cmake/push/CMakeLists.txt @@ -3,6 +3,8 @@ if(NOT CURL_FOUND) endif() add_library(push + ${PROMETHEUS_SRC_DIR}/push/src/curl_wrapper.cc + ${PROMETHEUS_SRC_DIR}/push/src/curl_wrapper.h ${PROMETHEUS_SRC_DIR}/push/src/gateway.cc ) diff --git a/contrib/tiflash-proxy b/contrib/tiflash-proxy index ca2f51f94e5..573efc6d3d1 160000 --- a/contrib/tiflash-proxy +++ b/contrib/tiflash-proxy @@ -1 +1 @@ -Subproject commit ca2f51f94e55bdd23749dcc02ab4afb94eeb5ae5 +Subproject commit 573efc6d3d155a9a01da003e70f111485becf2bc diff --git a/contrib/tiflash-proxy-cmake/CMakeLists.txt b/contrib/tiflash-proxy-cmake/CMakeLists.txt index e243ecba37c..e3e2df379a1 100644 --- a/contrib/tiflash-proxy-cmake/CMakeLists.txt +++ b/contrib/tiflash-proxy-cmake/CMakeLists.txt @@ -4,7 +4,11 @@ file(GLOB_RECURSE _TIFLASH_PROXY_SRCS "${_TIFLASH_PROXY_SOURCE_DIR}/*.rs") list(FILTER _TIFLASH_PROXY_SRCS EXCLUDE REGEX ${_TIFLASH_PROXY_SOURCE_DIR}/target/.*) # use `CFLAGS=-w CXXFLAGS=-w` to inhibit warning messages. 
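The jemalloc configuration header above pins these capabilities at configure time. To make one of them concrete: CPU_SPINWAIT is the instruction a busy-wait loop issues between polls so a hyper-threaded sibling can make progress. A self-contained sketch of that pattern (illustrative only, not jemalloc's internal code):

```cpp
#include <atomic>

#if defined(__x86_64__) || defined(__i386__)
#define CPU_SPINWAIT __asm__ volatile("pause")
#else
#define CPU_SPINWAIT // e.g. "yield"/"isb" on aarch64; empty when unknown
#endif

// Busy-wait until `flag` becomes true, yielding pipeline resources to the
// sibling hyper-thread on every iteration.
inline void spinUntil(const std::atomic<bool> & flag)
{
    while (!flag.load(std::memory_order_acquire))
        CPU_SPINWAIT;
}
```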
-set(TIFLASH_RUST_ENV CMAKE=${CMAKE_COMMAND} CFLAGS=-w CXXFLAGS=-w) +if (TIFLASH_LLVM_TOOLCHAIN) + set(TIFLASH_RUST_ENV CMAKE=${CMAKE_COMMAND} "CFLAGS=-w -fuse-ld=lld" "CXXFLAGS=-w -fuse-ld=lld -stdlib=libc++") +else() + set(TIFLASH_RUST_ENV CMAKE=${CMAKE_COMMAND} CFLAGS=-w CXXFLAGS=-w) +endif() if(TIFLASH_LLVM_TOOLCHAIN AND USE_LIBCXX) set(TIFLASH_RUST_LINKER ${CMAKE_CURRENT_BINARY_DIR}/tiflash-linker) diff --git a/contrib/tipb b/contrib/tipb index bfb5c2c5518..0f4f873beca 160000 --- a/contrib/tipb +++ b/contrib/tipb @@ -1 +1 @@ -Subproject commit bfb5c2c55188c254018d3cf77bfad73b4d4b77ec +Subproject commit 0f4f873beca8d5078dde0a23d15ad5ce3188ed0d diff --git a/dbms/src/Columns/ColumnConst.h b/dbms/src/Columns/ColumnConst.h index 27283c0f24a..da071507a72 100644 --- a/dbms/src/Columns/ColumnConst.h +++ b/dbms/src/Columns/ColumnConst.h @@ -233,7 +233,8 @@ class ColumnConst final : public COWPtrHelper<IColumn, ColumnConst> template <typename T> T getValue() const { - return getField().safeGet<typename NearestFieldType<T>::Type>(); + auto && tmp = getField(); + return std::move(tmp.safeGet<typename NearestFieldType<T>::Type>()); } }; diff --git a/dbms/src/Common/CurrentMetrics.cpp b/dbms/src/Common/CurrentMetrics.cpp index b7ce9fd1e89..8673784c590 100644 --- a/dbms/src/Common/CurrentMetrics.cpp +++ b/dbms/src/Common/CurrentMetrics.cpp @@ -22,6 +22,8 @@ M(OpenFileForReadWrite) \ M(MemoryTracking) \ M(MemoryTrackingInBackgroundProcessingPool) \ + M(LogicalCPUCores) \ + M(MemoryCapacity) \ M(PSMVCCNumSnapshots) \ M(PSMVCCSnapshotsList) \ M(RWLockWaitingReaders) \ diff --git a/dbms/src/Common/FailPoint.cpp b/dbms/src/Common/FailPoint.cpp index c6c3caa44ad..ad5010d7826 100644 --- a/dbms/src/Common/FailPoint.cpp +++ b/dbms/src/Common/FailPoint.cpp @@ -12,7 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License.
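The ColumnConst::getValue() hunk above deserves a note: getField() returns a temporary Field, and safeGet() returns a reference into it, so the old one-liner could only copy the payload out. Binding the temporary to a forwarding reference extends its lifetime and lets the payload be moved instead. A minimal sketch with a hypothetical Field type (not TiFlash's real one):

```cpp
#include <string>
#include <utility>

struct Field
{
    std::string payload;
    std::string & safeGet() { return payload; } // reference into *this, like DB::Field::safeGet
};

Field getField() { return Field{std::string(1024, 'x')}; } // returns a temporary

std::string getValueOld()
{
    return getField().safeGet(); // lvalue reference: the payload is copied
}

std::string getValueNew()
{
    auto && tmp = getField();        // lifetime extension keeps the temporary alive
    return std::move(tmp.safeGet()); // the payload is moved out before tmp dies
}
```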
+#include #include +#include +#include +#include +#include +#include #include #include @@ -21,7 +27,6 @@ namespace DB { std::unordered_map> FailPointHelper::fail_point_wait_channels; - #define APPLY_FOR_FAILPOINTS_ONCE(M) \ M(exception_between_drop_meta_and_data) \ M(exception_between_alter_data_and_meta) \ @@ -85,33 +90,54 @@ std::unordered_map> FailPointHelper::f M(force_remote_read_for_batch_cop) \ M(force_context_path) \ M(force_slow_page_storage_snapshot_release) \ - M(force_change_all_blobs_to_read_only) - -#define APPLY_FOR_FAILPOINTS_ONCE_WITH_CHANNEL(M) \ - M(pause_with_alter_locks_acquired) \ - M(hang_in_execution) \ - M(pause_before_dt_background_delta_merge) \ - M(pause_until_dt_background_delta_merge) \ - M(pause_before_apply_raft_cmd) \ - M(pause_before_apply_raft_snapshot) \ - M(pause_until_apply_raft_snapshot) \ + M(force_change_all_blobs_to_read_only) \ + M(unblock_query_init_after_write) + + +#define APPLY_FOR_PAUSEABLE_FAILPOINTS_ONCE(M) \ + M(pause_with_alter_locks_acquired) \ + M(hang_in_execution) \ + M(pause_before_dt_background_delta_merge) \ + M(pause_until_dt_background_delta_merge) \ + M(pause_before_apply_raft_cmd) \ + M(pause_before_apply_raft_snapshot) \ + M(pause_until_apply_raft_snapshot) \ M(pause_after_copr_streams_acquired_once) -#define APPLY_FOR_FAILPOINTS_WITH_CHANNEL(M) \ - M(pause_when_reading_from_dt_stream) \ - M(pause_when_writing_to_dt_store) \ - M(pause_when_ingesting_to_dt_store) \ - M(pause_when_altering_dt_store) \ - M(pause_after_copr_streams_acquired) \ - M(pause_before_server_merge_one_delta) +#define APPLY_FOR_PAUSEABLE_FAILPOINTS(M) \ + M(pause_when_reading_from_dt_stream) \ + M(pause_when_writing_to_dt_store) \ + M(pause_when_ingesting_to_dt_store) \ + M(pause_when_altering_dt_store) \ + M(pause_after_copr_streams_acquired) \ + M(pause_before_server_merge_one_delta) \ + M(pause_query_init) + + +#define APPLY_FOR_RANDOM_FAILPOINTS(M) \ + M(random_tunnel_wait_timeout_failpoint) \ + M(random_tunnel_init_rpc_failure_failpoint) \ + M(random_receiver_sync_msg_push_failure_failpoint) \ + M(random_receiver_async_msg_push_failure_failpoint) \ + M(random_limit_check_failpoint) \ + M(random_join_build_failpoint) \ + M(random_join_prob_failpoint) \ + M(random_aggregate_create_state_failpoint) \ + M(random_aggregate_merge_failpoint) \ + M(random_sharedquery_failpoint) \ + M(random_interpreter_failpoint) \ + M(random_task_lifecycle_failpoint) \ + M(random_task_manager_find_task_failure_failpoint) \ + M(random_min_tso_scheduler_failpoint) namespace FailPoints { #define M(NAME) extern const char(NAME)[] = #NAME ""; APPLY_FOR_FAILPOINTS_ONCE(M) APPLY_FOR_FAILPOINTS(M) -APPLY_FOR_FAILPOINTS_ONCE_WITH_CHANNEL(M) -APPLY_FOR_FAILPOINTS_WITH_CHANNEL(M) +APPLY_FOR_PAUSEABLE_FAILPOINTS_ONCE(M) +APPLY_FOR_PAUSEABLE_FAILPOINTS(M) +APPLY_FOR_RANDOM_FAILPOINTS(M) #undef M } // namespace FailPoints @@ -167,15 +193,15 @@ void FailPointHelper::enableFailPoint(const String & fail_point_name) } #define M(NAME) SUB_M(NAME, FIU_ONETIME) - APPLY_FOR_FAILPOINTS_ONCE_WITH_CHANNEL(M) + APPLY_FOR_PAUSEABLE_FAILPOINTS_ONCE(M) #undef M #define M(NAME) SUB_M(NAME, 0) - APPLY_FOR_FAILPOINTS_WITH_CHANNEL(M) + APPLY_FOR_PAUSEABLE_FAILPOINTS(M) #undef M #undef SUB_M - throw Exception("Cannot find fail point " + fail_point_name, ErrorCodes::FAIL_POINT_ERROR); + throw Exception(fmt::format("Cannot find fail point {}", fail_point_name), ErrorCodes::FAIL_POINT_ERROR); } void FailPointHelper::disableFailPoint(const String & fail_point_name) @@ -200,6 +226,41 @@ void 
FailPointHelper::wait(const String & fail_point_name) ptr->wait(); } } + +void FailPointHelper::initRandomFailPoints(Poco::Util::LayeredConfiguration & config, Poco::Logger * log) +{ + String random_fail_point_cfg = config.getString("flash.random_fail_points", ""); + if (random_fail_point_cfg.empty()) + return; + + Poco::StringTokenizer string_tokens(random_fail_point_cfg, ","); + for (const auto & string_token : string_tokens) + { + Poco::StringTokenizer pair_tokens(string_token, "-"); + RUNTIME_ASSERT((pair_tokens.count() == 2), log, "RandomFailPoints config should be in FailPointA-RatioA,FailPointB-RatioB,... format"); + double rate = atof(pair_tokens[1].c_str()); //NOLINT(cert-err34-c): check conversion error manually + RUNTIME_ASSERT((0 <= rate && rate <= 1.0), log, "RandomFailPoint trigger rate should be in [0,1], got {}", rate); + enableRandomFailPoint(pair_tokens[0], rate); + } + LOG_FMT_INFO(log, "Enable RandomFailPoints: {}", random_fail_point_cfg); +} + +void FailPointHelper::enableRandomFailPoint(const String & fail_point_name, double rate) +{ +#define SUB_M(NAME) \ + if (fail_point_name == FailPoints::NAME) \ + { \ + fiu_enable_random(FailPoints::NAME, 1, nullptr, 0, rate); \ + return; \ + } + +#define M(NAME) SUB_M(NAME) + APPLY_FOR_RANDOM_FAILPOINTS(M) +#undef M +#undef SUB_M + + throw Exception(fmt::format("Cannot find fail point {}", fail_point_name), ErrorCodes::FAIL_POINT_ERROR); +} #else class FailPointChannel { @@ -210,6 +271,10 @@ void FailPointHelper::enableFailPoint(const String &) {} void FailPointHelper::disableFailPoint(const String &) {} void FailPointHelper::wait(const String &) {} + +void FailPointHelper::initRandomFailPoints(Poco::Util::LayeredConfiguration &, Poco::Logger *) {} + +void FailPointHelper::enableRandomFailPoint(const String &, double) {} #endif } // namespace DB diff --git a/dbms/src/Common/FailPoint.h b/dbms/src/Common/FailPoint.h index 2cf40ad55e4..31df2dbdcd2 100644 --- a/dbms/src/Common/FailPoint.h +++ b/dbms/src/Common/FailPoint.h @@ -21,6 +21,15 @@ #include +namespace Poco +{ +class Logger; +namespace Util +{ +class LayeredConfiguration; +} +} // namespace Poco + namespace DB { namespace ErrorCodes @@ -35,7 +44,6 @@ extern const int FAIL_POINT_ERROR; // When `fail_point` is enabled, wait till it is disabled #define FAIL_POINT_PAUSE(fail_point) fiu_do_on(fail_point, FailPointHelper::wait(fail_point);) - class FailPointChannel; class FailPointHelper { @@ -46,6 +54,16 @@ class FailPointHelper static void wait(const String & fail_point_name); + /* + * For Server RandomFailPoint test usage. When FIU_ENABLE is defined, this function does the following work: + * 1. Return immediately if the TiFlash config has an empty flash.random_fail_points entry + * 2. Parse flash.random_fail_points, which is expected to be in "FailPointA-RatioA,FailPointB-RatioB,..." format + * 3.
Call enableRandomFailPoint with the parsed failpoint name and rate + */ + static void initRandomFailPoints(Poco::Util::LayeredConfiguration & config, Poco::Logger * log); + + static void enableRandomFailPoint(const String & fail_point_name, double rate); + private: static std::unordered_map<String, std::shared_ptr<FailPointChannel>> fail_point_wait_channels; }; diff --git a/dbms/src/Common/MPMCQueue.h b/dbms/src/Common/MPMCQueue.h index f550ecc7ca2..e005c363eae 100644 --- a/dbms/src/Common/MPMCQueue.h +++ b/dbms/src/Common/MPMCQueue.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include @@ -74,56 +75,85 @@ class MPMCQueue destruct(getObj(read_pos)); } - /// Block util: + // Cannot use the copy/move constructors, + // because an MPMCQueue may be shared by multiple threads; + // copying or moving it is dangerous. + DISALLOW_COPY_AND_MOVE(MPMCQueue); + + /// Block until: /// 1. Pop succeeds with a valid T: return true. /// 2. The queue is cancelled or finished: return false. - bool pop(T & obj) + ALWAYS_INLINE bool pop(T & obj) { - return popObj(obj); + return popObj<true>(obj); } - /// Besides all conditions mentioned at `pop`, `tryPop` will return false if `timeout` is exceeded. + /// Besides all conditions mentioned at `pop`, `popTimeout` will return false if `timeout` is exceeded. template <typename Duration> - bool tryPop(T & obj, const Duration & timeout) + ALWAYS_INLINE bool popTimeout(T & obj, const Duration & timeout) { /// std::condition_variable::wait_until will always use system_clock. auto deadline = std::chrono::system_clock::now() + timeout; - return popObj(obj, &deadline); + return popObj<true>(obj, &deadline); + } + + /// Non-blocking function. + /// Return true if the pop succeeds; + /// otherwise return false. + ALWAYS_INLINE bool tryPop(T & obj) + { + return popObj<false>(obj); } - /// Block util: + /// Block until: /// 1. Push succeeds and return true. /// 2. The queue is cancelled and return false. /// 3. The queue has finished and return false. template <typename U> ALWAYS_INLINE bool push(U && u) { - return pushObj(std::forward<U>(u)); + return pushObj<true>(std::forward<U>(u)); } - /// Besides all conditions mentioned at `push`, `tryPush` will return false if `timeout` is exceeded. + /// Besides all conditions mentioned at `push`, `pushTimeout` will return false if `timeout` is exceeded. template <typename U, typename Duration> - ALWAYS_INLINE bool tryPush(U && u, const Duration & timeout) + ALWAYS_INLINE bool pushTimeout(U && u, const Duration & timeout) { /// std::condition_variable::wait_until will always use system_clock. auto deadline = std::chrono::system_clock::now() + timeout; - return pushObj(std::forward<U>(u), &deadline); + return pushObj<true>(std::forward<U>(u), &deadline); + } + + /// Non-blocking function. + /// Return true if the push succeeds; + /// otherwise return false. + template <typename U> + ALWAYS_INLINE bool tryPush(U && u) + { + return pushObj<false>(std::forward<U>(u)); } /// The same as `push` except it will construct the object in place. template <typename... Args> ALWAYS_INLINE bool emplace(Args &&... args) { - return emplaceObj(nullptr, std::forward<Args>(args)...); + return emplaceObj<true>(nullptr, std::forward<Args>(args)...); } - /// The same as `tryPush` except it will construct the object in place. + /// The same as `pushTimeout` except it will construct the object in place. template <typename... Args, typename Duration> - ALWAYS_INLINE bool tryEmplace(Args &&... args, const Duration & timeout) + ALWAYS_INLINE bool emplaceTimeout(Args &&... args, const Duration & timeout) { /// std::condition_variable::wait_until will always use system_clock.
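Taken together, each direction of the queue now has three tiers: blocking (pop/push), deadline-bounded (popTimeout/pushTimeout, the old two-argument tryPop/tryPush), and truly non-blocking (the new tryPop/tryPush). A usage sketch; the wrapper function is hypothetical, while DB::MPMCQueue and its header are the ones this hunk touches:

```cpp
#include <Common/MPMCQueue.h> // TiFlash header
#include <chrono>

void queueTiersSketch(DB::MPMCQueue<int> & queue)
{
    // Blocks until the push succeeds or the queue is cancelled/finished.
    bool pushed = queue.push(1);

    // Additionally gives up once the timeout expires (formerly `tryPush` with a timeout).
    bool pushed_in_time = queue.pushTimeout(2, std::chrono::milliseconds(10));

    // Fails immediately when the queue is full (the new non-blocking `tryPush`).
    bool pushed_now = queue.tryPush(3);

    int v = 0;
    bool popped = queue.pop(v);                                               // blocking
    bool popped_in_time = queue.popTimeout(v, std::chrono::milliseconds(10)); // bounded wait
    bool popped_now = queue.tryPop(v);                                        // non-blocking

    (void)pushed; (void)pushed_in_time; (void)pushed_now;
    (void)popped; (void)popped_in_time; (void)popped_now;
}
```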
auto deadline = std::chrono::system_clock::now() + timeout; - return emplaceObj(&deadline, std::forward<Args>(args)...); + return emplaceObj<true>(&deadline, std::forward<Args>(args)...); + } + + /// The same as `tryPush` except it will construct the object in place. + template <typename... Args> + ALWAYS_INLINE bool tryEmplace(Args &&... args) + { + return emplaceObj<false>(nullptr, std::forward<Args>(args)...); } /// Cancelling a NORMAL queue will wake up all blocking readers and writers. @@ -233,7 +263,8 @@ class MPMCQueue } } - bool popObj(T & res, const TimePoint * deadline = nullptr) + template <bool need_wait> + bool popObj(T & res, [[maybe_unused]] const TimePoint * deadline = nullptr) { #ifdef __APPLE__ WaitingNode node; @@ -241,14 +272,16 @@ class MPMCQueue thread_local WaitingNode node; #endif { - /// read_pos < write_pos means the queue isn't empty - auto pred = [&] { - return read_pos < write_pos || !isNormal(); - }; - std::unique_lock lock(mu); - wait(lock, reader_head, node, pred, deadline); + if constexpr (need_wait) + { + /// read_pos < write_pos means the queue isn't empty + auto pred = [&] { + return read_pos < write_pos || !isNormal(); + }; + wait(lock, reader_head, node, pred, deadline); + } if (!isCancelled() && read_pos < write_pos) { @@ -272,21 +305,23 @@ class MPMCQueue return false; } - template <typename F> - bool assignObj(const TimePoint * deadline, F && assigner) + template <bool need_wait, typename F> + bool assignObj([[maybe_unused]] const TimePoint * deadline, F && assigner) { #ifdef __APPLE__ WaitingNode node; #else thread_local WaitingNode node; #endif - auto pred = [&] { - return write_pos - read_pos < capacity || !isNormal(); - }; - std::unique_lock lock(mu); - wait(lock, writer_head, node, pred, deadline); + if constexpr (need_wait) + { + auto pred = [&] { + return write_pos - read_pos < capacity || !isNormal(); + }; + wait(lock, writer_head, node, pred, deadline); + } /// double check status after potential wait /// check write_pos because a timed-out waiter will also reach here. @@ -305,16 +340,16 @@ class MPMCQueue return false; } - template <typename U> + template <bool need_wait, typename U> ALWAYS_INLINE bool pushObj(U && u, const TimePoint * deadline = nullptr) { - return assignObj(deadline, [&](void * addr) { new (addr) T(std::forward<U>(u)); }); + return assignObj<need_wait>(deadline, [&](void * addr) { new (addr) T(std::forward<U>(u)); }); } - template <typename... Args> + template <bool need_wait, typename... Args> ALWAYS_INLINE bool emplaceObj(const TimePoint * deadline, Args &&... args) { - return assignObj(deadline, [&](void * addr) { new (addr) T(std::forward<Args>(args)...); }); + return assignObj<need_wait>(deadline, [&](void * addr) { new (addr) T(std::forward<Args>(args)...); }); } ALWAYS_INLINE bool isNormal() const diff --git a/dbms/src/Common/MyDuration.cpp b/dbms/src/Common/MyDuration.cpp index 8801ae0de44..513c40b6dbc 100644 --- a/dbms/src/Common/MyDuration.cpp +++ b/dbms/src/Common/MyDuration.cpp @@ -67,4 +67,4 @@ String MyDuration::toString() const auto frac_str = fmt::format("{:06}", microsecond); return fmt::format(fmt_str, sign > 0 ?
"" : "-", hour, minute, second, frac_str); } -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Common/Stopwatch.h b/dbms/src/Common/Stopwatch.h index aced9fced11..d33be52fbd6 100644 --- a/dbms/src/Common/Stopwatch.h +++ b/dbms/src/Common/Stopwatch.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include @@ -23,21 +24,28 @@ #include #endif - -namespace StopWatchDetail -{ -inline UInt64 nanoseconds(clockid_t clock_type) +inline UInt64 clock_gettime_ns(clockid_t clock_type = CLOCK_MONOTONIC) { - struct timespec ts; + struct timespec ts + { + }; clock_gettime(clock_type, &ts); - return ts.tv_sec * 1000000000ULL + ts.tv_nsec; + return UInt64(ts.tv_sec * 1000000000LL + ts.tv_nsec); } -inline UInt64 seconds(clockid_t clock_type) + +/// Sometimes monotonic clock may not be monotonic (due to bug in kernel?). +/// It may cause some operations to fail with "Timeout exceeded: elapsed 18446744073.709553 seconds". +/// Takes previously returned value and returns it again if time stepped back for some reason. +inline UInt64 clock_gettime_ns_adjusted(UInt64 prev_time, clockid_t clock_type = CLOCK_MONOTONIC) { - return nanoseconds(clock_type) / 1000000000ULL; -} -} // namespace StopWatchDetail + UInt64 current_time = clock_gettime_ns(clock_type); + if (likely(prev_time <= current_time)) + return current_time; + /// Something probably went completely wrong if time stepped back for more than 1 second. + assert(prev_time - current_time <= 1000000000ULL); + return prev_time; +} /** Differs from Poco::Stopwatch only by using 'clock_gettime' instead of 'gettimeofday', * returns nanoseconds instead of microseconds, and also by other minor differencies. @@ -104,7 +112,7 @@ class Stopwatch clockid_t clock_type; bool is_running = false; - UInt64 nanoseconds() const { return StopWatchDetail::nanoseconds(clock_type); } + UInt64 nanoseconds() const { return clock_gettime_ns_adjusted(start_ns, clock_type); } }; @@ -112,13 +120,18 @@ class AtomicStopwatch { public: explicit AtomicStopwatch(clockid_t clock_type_ = CLOCK_MONOTONIC) - : clock_type(clock_type_) + : start_ns(0) + , clock_type(clock_type_) { restart(); } - void restart() { start_ns = nanoseconds(); } - UInt64 elapsed() const { return nanoseconds() - start_ns; } + void restart() { start_ns = nanoseconds(0); } + UInt64 elapsed() const + { + UInt64 current_start_ns = start_ns; + return nanoseconds(current_start_ns) - start_ns; + } UInt64 elapsedMilliseconds() const { return elapsed() / 1000000UL; } double elapsedSeconds() const { return static_cast(elapsed()) / 1000000000ULL; } @@ -129,8 +142,8 @@ class AtomicStopwatch bool compareAndRestart(double seconds) { UInt64 threshold = seconds * 1000000000ULL; - UInt64 current_ns = nanoseconds(); UInt64 current_start_ns = start_ns; + UInt64 current_ns = nanoseconds(current_start_ns); while (true) { @@ -175,8 +188,8 @@ class AtomicStopwatch Lock compareAndRestartDeferred(double seconds) { UInt64 threshold = seconds * 1000000000ULL; - UInt64 current_ns = nanoseconds(); UInt64 current_start_ns = start_ns; + UInt64 current_ns = nanoseconds(current_start_ns); while (true) { @@ -197,5 +210,5 @@ class AtomicStopwatch clockid_t clock_type; /// Most significant bit is a lock. When it is set, compareAndRestartDeferred method will return false. 
- UInt64 nanoseconds() const { return StopWatchDetail::nanoseconds(clock_type) & 0x7FFFFFFFFFFFFFFFULL; } + UInt64 nanoseconds(UInt64 prev_time) const { return clock_gettime_ns_adjusted(prev_time, clock_type) & 0x7FFFFFFFFFFFFFFFULL; } }; diff --git a/dbms/src/Common/ThreadMetricUtil.cpp b/dbms/src/Common/ThreadMetricUtil.cpp index aa496b943ab..340417b969b 100644 --- a/dbms/src/Common/ThreadMetricUtil.cpp +++ b/dbms/src/Common/ThreadMetricUtil.cpp @@ -24,7 +24,7 @@ namespace DB { bool tryToResetMaxThreadsMetrics() { - UInt64 now_ts = StopWatchDetail::seconds(CLOCK_MONOTONIC); + UInt64 now_ts = clock_gettime_ns_adjusted(last_max_thds_metric_reset_ts, CLOCK_MONOTONIC); if (now_ts > last_max_thds_metric_reset_ts + max_thds_metric_reset_interval) { last_max_thds_metric_reset_ts = now_ts; diff --git a/dbms/src/Common/tests/gtest_mpmc_queue.cpp b/dbms/src/Common/tests/gtest_mpmc_queue.cpp index 85ad1892067..3f2748b452b 100644 --- a/dbms/src/Common/tests/gtest_mpmc_queue.cpp +++ b/dbms/src/Common/tests/gtest_mpmc_queue.cpp @@ -98,12 +98,14 @@ class MPMCQueueTest : public ::testing::Test void testCannotTryPush(MPMCQueue & queue) { auto old_size = queue.size(); - auto res = queue.tryPush(ValueHelper::make(-1), std::chrono::microseconds(1)); - auto new_size = queue.size(); - if (res) + bool ok1 = queue.tryPush(ValueHelper::make(-1)); + auto new_size1 = queue.size(); + bool ok2 = queue.pushTimeout(ValueHelper::make(-1), std::chrono::microseconds(1)); + auto new_size2 = queue.size(); + if (ok1 || ok2) throw TiFlashTestException("Should push fail"); - if (old_size != new_size) - throw TiFlashTestException(fmt::format("Size changed from {} to {} without push", old_size, new_size)); + if (old_size != new_size1 || old_size != new_size2) + throw TiFlashTestException(fmt::format("Size changed from {} to {} and {} without push", old_size, new_size1, new_size2)); } template @@ -124,12 +126,14 @@ class MPMCQueueTest : public ::testing::Test { auto old_size = queue.size(); T res; - bool ok = queue.tryPop(res, std::chrono::microseconds(1)); - auto new_size = queue.size(); - if (ok) + bool ok1 = queue.tryPop(res); + auto new_size1 = queue.size(); + bool ok2 = queue.popTimeout(res, std::chrono::microseconds(1)); + auto new_size2 = queue.size(); + if (ok1 || ok2) throw TiFlashTestException("Should pop fail"); - if (old_size != new_size) - throw TiFlashTestException(fmt::format("Size changed from {} to {} without pop", old_size, new_size)); + if (old_size != new_size1 || old_size != new_size2) + throw TiFlashTestException(fmt::format("Size changed from {} to {} and {} without pop", old_size, new_size1, new_size2)); } template @@ -474,7 +478,6 @@ class MPMCQueueTest : public ::testing::Test throwOrMove(std::move(rhs)); } - ThrowInjectable & operator=(ThrowInjectable && rhs) { if (this != &rhs) diff --git a/dbms/src/Common/tests/mpmc_queue_perftest.cpp b/dbms/src/Common/tests/mpmc_queue_perftest.cpp index d047b5d498f..ba0d00001a3 100644 --- a/dbms/src/Common/tests/mpmc_queue_perftest.cpp +++ b/dbms/src/Common/tests/mpmc_queue_perftest.cpp @@ -87,7 +87,7 @@ struct Helper> template static void pushOneTo(MPMCQueue & queue, U && data) { - queue.tryPush(std::forward(data), std::chrono::milliseconds(1)); + queue.pushTimeout(std::forward(data), std::chrono::milliseconds(1)); } }; diff --git a/dbms/src/Common/wrapInvocable.h b/dbms/src/Common/wrapInvocable.h index d6cee519835..1c93bb3e782 100644 --- a/dbms/src/Common/wrapInvocable.h +++ b/dbms/src/Common/wrapInvocable.h @@ -35,7 +35,6 @@ inline auto wrapInvocable(bool 
propagate_memory_tracker, Func && func, Args &&.. // run the task with the parameters provided return std::apply(std::move(func), std::move(args)); }; - return capture; } } // namespace DB diff --git a/dbms/src/Core/Block.cpp b/dbms/src/Core/Block.cpp index 28db7af82e1..971e8f36e2a 100644 --- a/dbms/src/Core/Block.cpp +++ b/dbms/src/Core/Block.cpp @@ -238,10 +238,18 @@ void Block::checkNumberOfRows() const if (rows == -1) rows = size; else if (rows != size) - throw Exception("Sizes of columns doesn't match: " - + data.front().name + ": " + toString(rows) - + ", " + elem.name + ": " + toString(size), + { + auto first_col = data.front(); + throw Exception(fmt::format( + "Sizes of columns doesn't match: {}(id={}): {}, {}(id={}): {}", + first_col.name, + first_col.column_id, + rows, + elem.name, + elem.column_id, + size), ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + } } } diff --git a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp index f4f8dfc1338..cd9d6235f52 100644 --- a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp +++ b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp @@ -24,7 +24,7 @@ namespace DB { ParallelAggregatingBlockInputStream::ParallelAggregatingBlockInputStream( const BlockInputStreams & inputs, - const BlockInputStreamPtr & additional_input_at_end, + const BlockInputStreams & additional_inputs_at_end, const Aggregator::Params & params_, const FileProviderPtr & file_provider_, bool final_, @@ -41,11 +41,10 @@ ParallelAggregatingBlockInputStream::ParallelAggregatingBlockInputStream( , keys_size(params.keys_size) , aggregates_size(params.aggregates_size) , handler(*this) - , processor(inputs, additional_input_at_end, max_threads, handler, log) + , processor(inputs, additional_inputs_at_end, max_threads, handler, log) { children = inputs; - if (additional_input_at_end) - children.push_back(additional_input_at_end); + children.insert(children.end(), additional_inputs_at_end.begin(), additional_inputs_at_end.end()); } @@ -198,8 +197,8 @@ void ParallelAggregatingBlockInputStream::Handler::onException(std::exception_pt /// can not cancel parent inputStream or the exception might be lost if (!parent.executed) - /// kill the processor so ExchangeReceiver will be closed - parent.processor.cancel(true); + /// use cancel instead of kill to avoid too many useless error message + parent.processor.cancel(false); } diff --git a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h index 41e61786370..907622c8364 100644 --- a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h +++ b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h @@ -36,7 +36,7 @@ class ParallelAggregatingBlockInputStream : public IProfilingBlockInputStream */ ParallelAggregatingBlockInputStream( const BlockInputStreams & inputs, - const BlockInputStreamPtr & additional_input_at_end, + const BlockInputStreams & additional_inputs_at_end, const Aggregator::Params & params_, const FileProviderPtr & file_provider_, bool final_, diff --git a/dbms/src/DataStreams/ParallelInputsProcessor.h b/dbms/src/DataStreams/ParallelInputsProcessor.h index 34c70a7085e..57ab37e1756 100644 --- a/dbms/src/DataStreams/ParallelInputsProcessor.h +++ b/dbms/src/DataStreams/ParallelInputsProcessor.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -83,9 +84,8 @@ template class ParallelInputsProcessor { public: - /** additional_input_at_end - if 
not nullptr, - * then the blocks from this source will start to be processed only after all other sources are processed. - * This is done in the main thread. + /** additional_inputs_at_end - if not empty, + * then the blocks from the sources will start to be processed only after all other sources are processed. * * Intended for implementation of FULL and RIGHT JOIN * - where you must first make JOIN in parallel, while noting which keys are not found, @@ -93,19 +93,18 @@ class ParallelInputsProcessor */ ParallelInputsProcessor( const BlockInputStreams & inputs_, - const BlockInputStreamPtr & additional_input_at_end_, + const BlockInputStreams & additional_inputs_at_end_, size_t max_threads_, Handler & handler_, const LoggerPtr & log_) : inputs(inputs_) - , additional_input_at_end(additional_input_at_end_) - , max_threads(std::min(inputs_.size(), max_threads_)) + , additional_inputs_at_end(additional_inputs_at_end_) + , max_threads(std::min(std::max(inputs_.size(), additional_inputs_at_end_.size()), max_threads_)) , handler(handler_) + , working_inputs(inputs_) + , working_additional_inputs(additional_inputs_at_end_) , log(log_) - { - for (size_t i = 0; i < inputs_.size(); ++i) - unprepared_inputs.emplace(inputs_[i], i); - } + {} ~ParallelInputsProcessor() { @@ -132,36 +131,21 @@ class ParallelInputsProcessor /// Ask all sources to stop earlier than they run out. void cancel(bool kill) { - finish = true; + working_inputs.available_inputs.cancel(); + working_additional_inputs.available_inputs.cancel(); - for (auto & input : inputs) - { - if (IProfilingBlockInputStream * child = dynamic_cast<IProfilingBlockInputStream *>(&*input)) - { - try - { - child->cancel(kill); - } - catch (...) - { - /** If you can not ask one or more sources to stop. - * (for example, the connection is broken for distributed query processing) - * - then do not care. - */ - LOG_FMT_ERROR(log, "Exception while cancelling {}", child->getName()); - } - } - } + cancelStreams(inputs, kill); + cancelStreams(additional_inputs_at_end, kill); } /// Wait until all threads are finished, before the destructor. void wait() { - if (joined_threads) - return; if (thread_manager) + { thread_manager->wait(); - joined_threads = true; + thread_manager.reset(); + } } size_t getNumActiveThreads() const @@ -181,13 +165,78 @@ class ParallelInputsProcessor BlockInputStreamPtr in; size_t i; /// The source number (for debugging). - InputData() {} + InputData() + : i(0) + {} InputData(const BlockInputStreamPtr & in_, size_t i_) : in(in_) , i(i_) {} }; + struct WorkingInputs + { + explicit WorkingInputs(const BlockInputStreams & inputs_) + : available_inputs(inputs_.size()) + , active_inputs(inputs_.size()) + , unprepared_inputs(inputs_.size()) + { + for (size_t i = 0; i < inputs_.size(); ++i) + unprepared_inputs.emplace(inputs_[i], i); + } + /** A set of available sources that are not currently processed by any thread. + * Each thread takes one source from this set, takes a block out of the source (at this moment the source does the calculations) + * and (if the source has not run out), puts it back into the set of available sources. + * + * The question arises what is better to use: + * - the queue (just processed source will be processed the next time later than the rest) + * - stack (just processed source will be processed as soon as possible). + * + * The stack is better than the queue when you need to do work on reading one source more consecutively, + * and theoretically, this allows you to achieve more consecutive/consistent reads from the disk.
+ * + * But when using the stack, there is a problem with distributed query processing: + * data is read only from a part of the servers, and on the other servers + * a timeout occurs during send, and the request processing ends with an exception. + * + * Therefore, a queue is used. This can be improved in the future. + */ + using AvailableInputs = MPMCQueue; + AvailableInputs available_inputs; + + /// How many active input streams. + std::atomic active_inputs; + + /** For parallel preparing (readPrefix) child streams. + * First, streams are located here. + * After a stream was prepared, it is moved to "available_inputs" for reading. + */ + using UnpreparedInputs = MPMCQueue; + UnpreparedInputs unprepared_inputs; + }; + + void cancelStreams(const BlockInputStreams & streams, bool kill) + { + for (const auto & input : streams) + { + if (auto * p_child = dynamic_cast(&*input)) + { + try + { + p_child->cancel(kill); + } + catch (...) + { + /** If you can not ask one or more sources to stop. + * (for example, the connection is broken for distributed query processing) + * - then do not care. + */ + LOG_FMT_ERROR(log, "Exception while cancelling {}", p_child->getName()); + } + } + } + } + void publishPayload(BlockInputStreamPtr & stream, Block & block, size_t thread_num) { if constexpr (mode == StreamUnionMode::Basic) @@ -201,32 +250,24 @@ class ParallelInputsProcessor void thread(size_t thread_num) { - std::exception_ptr exception; + work(thread_num, working_inputs); + work(thread_num, working_additional_inputs); - try - { - while (!finish) - { - InputData unprepared_input; - { - std::lock_guard lock(unprepared_inputs_mutex); - - if (unprepared_inputs.empty()) - break; - - unprepared_input = unprepared_inputs.front(); - unprepared_inputs.pop(); - } + handler.onFinishThread(thread_num); - unprepared_input.in->readPrefix(); + if (0 == --active_threads) + { + handler.onFinish(); + } + } - { - std::lock_guard lock(available_inputs_mutex); - available_inputs.push(unprepared_input); - } - } + void work(size_t thread_num, WorkingInputs & work) + { + std::exception_ptr exception; - loop(thread_num); + try + { + loop(thread_num, work); } catch (...) { @@ -237,134 +278,63 @@ class ParallelInputsProcessor { handler.onException(exception, thread_num); } - - handler.onFinishThread(thread_num); - - /// The last thread on the output indicates that there is no more data. - if (0 == --active_threads) - { - /// And then it processes an additional source, if there is one. - if (additional_input_at_end) - { - try - { - additional_input_at_end->readPrefix(); - while (Block block = additional_input_at_end->read()) - publishPayload(additional_input_at_end, block, thread_num); - } - catch (...) - { - exception = std::current_exception(); - } - - if (exception) - { - handler.onException(exception, thread_num); - } - } - - handler.onFinish(); /// TODO If in `onFinish` or `onFinishThread` there is an exception, then std::terminate is called. - } } - void loop(size_t thread_num) + /// This function may be called in different threads. + /// If no exception occurs, we can ensure that the work is all done when the function + /// returns in any thread. + void loop(size_t thread_num, WorkingInputs & work) { - while (!finish) /// You may need to stop work earlier than all sources run out. + if (work.active_inputs == 0) { - InputData input; + return; + } - /// Select the next source. - { - std::lock_guard lock(available_inputs_mutex); + InputData input; - /// If there are no free sources, then this thread is no longer needed. 
(But other threads can work with their sources.) - if (available_inputs.empty()) - break; - - input = available_inputs.front(); + while (work.unprepared_inputs.tryPop(input)) + { + input.in->readPrefix(); - /// We remove the source from the queue of available sources. - available_inputs.pop(); - } + work.available_inputs.push(input); + } + // The condition is false when all input streams are exhausted or + // an exception occurred then the queue was cancelled. + while (work.available_inputs.pop(input)) + { /// The main work. Block block = input.in->read(); + if (block) { - if (finish) - break; - - /// If this source is not run out yet, then put the resulting block in the ready queue. + work.available_inputs.push(input); + publishPayload(input.in, block, thread_num); + } + else + { + if (0 == --work.active_inputs) { - std::lock_guard lock(available_inputs_mutex); - - if (block) - { - available_inputs.push(input); - } - else - { - if (available_inputs.empty()) - break; - } - } - - if (finish) + work.available_inputs.finish(); break; - - if (block) - publishPayload(input.in, block, thread_num); + } } } } - BlockInputStreams inputs; - BlockInputStreamPtr additional_input_at_end; + const BlockInputStreams inputs; + const BlockInputStreams additional_inputs_at_end; unsigned max_threads; Handler & handler; std::shared_ptr thread_manager; - /** A set of available sources that are not currently processed by any thread. - * Each thread takes one source from this set, takes a block out of the source (at this moment the source does the calculations) - * and (if the source is not run out), puts it back into the set of available sources. - * - * The question arises what is better to use: - * - the queue (just processed source will be processed the next time later than the rest) - * - stack (just processed source will be processed as soon as possible). - * - * The stack is better than the queue when you need to do work on reading one source more consequentially, - * and theoretically, this allows you to achieve more consequent/consistent reads from the disk. - * - * But when using the stack, there is a problem with distributed query processing: - * data is read only from a part of the servers, and on the other servers - * a timeout occurs during send, and the request processing ends with an exception. - * - * Therefore, a queue is used. This can be improved in the future. - */ - using AvailableInputs = std::queue; - AvailableInputs available_inputs; - - /** For parallel preparing (readPrefix) child streams. - * First, streams are located here. - * After a stream was prepared, it is moved to "available_inputs" for reading. - */ - using UnpreparedInputs = std::queue; - UnpreparedInputs unprepared_inputs; - - /// For operations with available_inputs. - std::mutex available_inputs_mutex; - - /// For operations with unprepared_inputs. - std::mutex unprepared_inputs_mutex; + WorkingInputs working_inputs; + WorkingInputs working_additional_inputs; /// How many sources ran out. std::atomic active_threads{0}; - /// Finish the threads work (before the sources run out). - std::atomic finish{false}; - /// Wait for the completion of all threads. 
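With the members deleted just below, the old mutex-plus-flag bookkeeping is gone entirely: scheduling state now lives in the two WorkingInputs queues and an atomic source counter, and the worker that drains the last source finishes the queue so every blocked peer wakes up and exits. A simplified, self-contained sketch of that loop (the callback parameters are illustrative, not TiFlash's signatures):

```cpp
#include <atomic>

// `Queue` is assumed to be MPMCQueue-like: pop() blocks until an element is
// available and returns false once finish()/cancel() has been called and the
// queue has drained; push() re-enqueues an element.
template <typename Source, typename Queue, typename ReadFn, typename PublishFn>
void workLoop(Queue & available_inputs, std::atomic<int> & active_inputs, ReadFn read, PublishFn publish)
{
    Source input;
    while (available_inputs.pop(input))
    {
        auto block = read(input); // the main work
        if (block)
        {
            available_inputs.push(input); // not exhausted: recycle for any worker
            publish(block);
        }
        else if (--active_inputs == 0)
        {
            available_inputs.finish(); // last source drained: release all blocked workers
        }
    }
}
```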
- std::atomic joined_threads{false}; const LoggerPtr log; }; diff --git a/dbms/src/DataStreams/SharedQueryBlockInputStream.h b/dbms/src/DataStreams/SharedQueryBlockInputStream.h index e7cece67f0b..d7c0707b5aa 100644 --- a/dbms/src/DataStreams/SharedQueryBlockInputStream.h +++ b/dbms/src/DataStreams/SharedQueryBlockInputStream.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,11 @@ namespace DB { +namespace FailPoints +{ +extern const char random_sharedquery_failpoint[]; +} // namespace FailPoints + /** This block input stream is used by SharedQuery. * It enables multiple threads to read from one stream. */ @@ -136,6 +142,7 @@ class SharedQueryBlockInputStream : public IProfilingBlockInputStream in->readPrefix(); while (true) { + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_sharedquery_failpoint); Block block = in->read(); // in is finished or queue is canceled if (!block || !queue.push(block)) diff --git a/dbms/src/DataStreams/SizeLimits.cpp b/dbms/src/DataStreams/SizeLimits.cpp index 7dd5e1524ba..4d1bfaae997 100644 --- a/dbms/src/DataStreams/SizeLimits.cpp +++ b/dbms/src/DataStreams/SizeLimits.cpp @@ -12,22 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include #include -#include +#include +#include +#include +#include namespace DB { +namespace FailPoints +{ +extern const char random_limit_check_failpoint[]; +} // namespace FailPoints bool SizeLimits::check(UInt64 rows, UInt64 bytes, const char * what, int exception_code) const { - if (max_rows && rows > max_rows) + bool rows_exceed_limit = max_rows && rows > max_rows; + fiu_do_on(FailPoints::random_limit_check_failpoint, rows_exceed_limit = true;); + if (rows_exceed_limit) { if (overflow_mode == OverflowMode::THROW) throw Exception("Limit for " + std::string(what) + " exceeded, max rows: " + formatReadableQuantity(max_rows) - + ", current rows: " + formatReadableQuantity(rows), exception_code); + + ", current rows: " + formatReadableQuantity(rows), + exception_code); else return false; } @@ -36,7 +44,8 @@ bool SizeLimits::check(UInt64 rows, UInt64 bytes, const char * what, int excepti { if (overflow_mode == OverflowMode::THROW) throw Exception("Limit for " + std::string(what) + " exceeded, max bytes: " + formatReadableSizeWithBinarySuffix(max_bytes) - + ", current bytes: " + formatReadableSizeWithBinarySuffix(bytes), exception_code); + + ", current bytes: " + formatReadableSizeWithBinarySuffix(bytes), + exception_code); else return false; } @@ -44,4 +53,4 @@ bool SizeLimits::check(UInt64 rows, UInt64 bytes, const char * what, int excepti return true; } -} +} // namespace DB diff --git a/dbms/src/DataStreams/TiRemoteBlockInputStream.h b/dbms/src/DataStreams/TiRemoteBlockInputStream.h index f249bf1a0dc..c1afb1e9f4e 100644 --- a/dbms/src/DataStreams/TiRemoteBlockInputStream.h +++ b/dbms/src/DataStreams/TiRemoteBlockInputStream.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,11 @@ class TiRemoteBlockInputStream : public IProfilingBlockInputStream uint64_t total_rows; + // For fine grained shuffle, the sender will partition data into multiple streams by hashing. + // ExchangeReceiverBlockInputStream only needs to read its own stream, i.e., streams[stream_id]. + // CoprocessorBlockInputStream doesn't take care of this.
+ size_t stream_id; + void initRemoteExecutionSummaries(tipb::SelectResponse & resp, size_t index) { for (const auto & execution_summary : resp.execution_summaries()) @@ -120,7 +126,7 @@ class TiRemoteBlockInputStream : public IProfilingBlockInputStream bool fetchRemoteResult() { - auto result = remote_reader->nextResult(block_queue, sample_block); + auto result = remote_reader->nextResult(block_queue, sample_block, stream_id); if (result.meet_error) { LOG_FMT_WARNING(log, "remote reader meets error: {}", result.error_msg); @@ -168,29 +174,22 @@ class TiRemoteBlockInputStream : public IProfilingBlockInputStream } public: - TiRemoteBlockInputStream(std::shared_ptr remote_reader_, const String & req_id, const String & executor_id) + TiRemoteBlockInputStream(std::shared_ptr remote_reader_, const String & req_id, const String & executor_id, size_t stream_id_) : remote_reader(remote_reader_) , source_num(remote_reader->getSourceNum()) , name(fmt::format("TiRemoteBlockInputStream({})", RemoteReader::name)) , execution_summaries_inited(source_num) , log(Logger::get(name, req_id, executor_id)) , total_rows(0) + , stream_id(stream_id_) { - // generate sample block - ColumnsWithTypeAndName columns; - for (auto & dag_col : remote_reader->getOutputSchema()) - { - auto tp = getDataTypeByColumnInfoForComputingLayer(dag_col.second); - ColumnWithTypeAndName col(tp, dag_col.first); - columns.emplace_back(col); - } - for (size_t i = 0; i < source_num; i++) + for (size_t i = 0; i < source_num; ++i) { execution_summaries_inited[i].store(false); } execution_summaries.resize(source_num); connection_profile_infos.resize(source_num); - sample_block = Block(columns); + sample_block = Block(getColumnWithTypeAndName(toNamesAndTypes(remote_reader->getOutputSchema()))); } Block getHeader() const override { return sample_block; } diff --git a/dbms/src/DataStreams/UnionBlockInputStream.h b/dbms/src/DataStreams/UnionBlockInputStream.h index 251d0663e14..ffcc8d77c10 100644 --- a/dbms/src/DataStreams/UnionBlockInputStream.h +++ b/dbms/src/DataStreams/UnionBlockInputStream.h @@ -94,20 +94,19 @@ class UnionBlockInputStream final : public IProfilingBlockInputStream public: UnionBlockInputStream( BlockInputStreams inputs, - BlockInputStreamPtr additional_input_at_end, + BlockInputStreams additional_inputs_at_end, size_t max_threads, const String & req_id, ExceptionCallback exception_callback_ = ExceptionCallback()) - : output_queue(std::min(inputs.size(), max_threads) * 5) // reduce contention + : output_queue(std::min(std::max(inputs.size(), additional_inputs_at_end.size()), max_threads) * 5) // reduce contention , log(Logger::get(NAME, req_id)) , handler(*this) - , processor(inputs, additional_input_at_end, max_threads, handler, log) + , processor(inputs, additional_inputs_at_end, max_threads, handler, log) , exception_callback(exception_callback_) { // TODO: assert capacity of output_queue is not less than processor.getMaxThreads() children = inputs; - if (additional_input_at_end) - children.push_back(additional_input_at_end); + children.insert(children.end(), additional_inputs_at_end.begin(), additional_inputs_at_end.end()); size_t num_children = children.size(); if (num_children > 1) @@ -293,8 +292,8 @@ class UnionBlockInputStream final : public IProfilingBlockInputStream /// and the exception is lost. 
output_queue.emplace(exception); /// can not cancel itself or the exception might be lost - /// kill the processor so ExchangeReceiver will be closed - processor.cancel(true); + /// use cancel instead of kill to avoid too many useless error message + processor.cancel(false); } struct Handler diff --git a/dbms/src/DataStreams/tests/union_stream2.cpp b/dbms/src/DataStreams/tests/union_stream2.cpp index f939cda4e14..fb3f7238414 100644 --- a/dbms/src/DataStreams/tests/union_stream2.cpp +++ b/dbms/src/DataStreams/tests/union_stream2.cpp @@ -51,7 +51,7 @@ try for (size_t i = 0, size = streams.size(); i < size; ++i) streams[i] = std::make_shared(streams[i]); - BlockInputStreamPtr stream = std::make_shared>(streams, nullptr, settings.max_threads, /*req_id=*/""); + BlockInputStreamPtr stream = std::make_shared>(streams, BlockInputStreams{}, settings.max_threads, /*req_id=*/""); stream = std::make_shared(stream, 10, 0, ""); WriteBufferFromFileDescriptor wb(STDERR_FILENO); diff --git a/dbms/src/DataTypes/NumberTraits.h b/dbms/src/DataTypes/NumberTraits.h index 925628a8894..a8b91b88075 100644 --- a/dbms/src/DataTypes/NumberTraits.h +++ b/dbms/src/DataTypes/NumberTraits.h @@ -277,6 +277,7 @@ struct ResultOfAbs> }; /** For bitwise operations, an integer is obtained with number of bits is equal to the maximum of the arguments. + * todo: note that MySQL handles only unsigned 64-bit integer argument and result values. We should refine the code. */ template struct ResultOfBit diff --git a/dbms/src/Debug/DBGInvoker.cpp b/dbms/src/Debug/DBGInvoker.cpp index 3f633c08e67..df993d8e6e9 100644 --- a/dbms/src/Debug/DBGInvoker.cpp +++ b/dbms/src/Debug/DBGInvoker.cpp @@ -118,6 +118,10 @@ DBGInvoker::DBGInvoker() regSchemalessFunc("mapped_database", dbgFuncMappedDatabase); regSchemalessFunc("mapped_table", dbgFuncMappedTable); regSchemafulFunc("query_mapped", dbgFuncQueryMapped); + regSchemalessFunc("get_tiflash_replica_count", dbgFuncGetTiflashReplicaCount); + regSchemalessFunc("get_partition_tables_tiflash_replica_count", dbgFuncGetPartitionTablesTiflashReplicaCount); + regSchemalessFunc("get_tiflash_mode", dbgFuncGetTiflashMode); + regSchemalessFunc("get_partition_tables_tiflash_mode", dbgFuncGetPartitionTablesTiflashMode); regSchemalessFunc("search_log_for_key", dbgFuncSearchLogForKey); regSchemalessFunc("tidb_dag", dbgFuncTiDBQueryFromNaturalDag); diff --git a/dbms/src/Debug/MockSchemaGetter.h b/dbms/src/Debug/MockSchemaGetter.h index f02699866ce..11c5d97f036 100644 --- a/dbms/src/Debug/MockSchemaGetter.h +++ b/dbms/src/Debug/MockSchemaGetter.h @@ -17,16 +17,25 @@ #include #include +#include + namespace DB { - struct MockSchemaGetter { TiDB::DBInfoPtr getDatabase(DatabaseID db_id) { return MockTiDB::instance().getDBInfoByID(db_id); } Int64 getVersion() { return MockTiDB::instance().getVersion(); } - SchemaDiff getSchemaDiff(Int64 version) { return MockTiDB::instance().getSchemaDiff(version); } + std::optional getSchemaDiff(Int64 version) + { + return MockTiDB::instance().getSchemaDiff(version); + } + + bool checkSchemaDiffExists(Int64 version) + { + return MockTiDB::instance().checkSchemaDiffExists(version); + } TiDB::TableInfoPtr getTableInfo(DatabaseID, TableID table_id) { return MockTiDB::instance().getTableInfoByID(table_id); } diff --git a/dbms/src/Debug/MockTiDB.cpp b/dbms/src/Debug/MockTiDB.cpp index 42ab56a97c1..99d9625461b 100644 --- a/dbms/src/Debug/MockTiDB.cpp +++ b/dbms/src/Debug/MockTiDB.cpp @@ -221,7 +221,6 @@ TiDB::TableInfoPtr MockTiDB::parseColumns( { String & name = string_tokens[index]; 
index_info.idx_cols[index].name = name; - index_info.idx_cols[index].offset = pk_column_pos_map[name]; index_info.idx_cols[index].length = -1; } } @@ -302,7 +301,7 @@ int MockTiDB::newTables( tables_by_id.emplace(table->table_info.id, table); tables_by_name.emplace(qualified_name, table); - AffectedOption opt; + AffectedOption opt{}; opt.schema_id = table->database_id; opt.table_id = table->id(); opt.old_schema_id = table->database_id; @@ -571,7 +570,7 @@ void MockTiDB::renameTables(const std::vectordatabase_id; opt.table_id = new_table->id(); opt.old_schema_id = table->database_id; @@ -669,9 +668,14 @@ std::pair MockTiDB::getDBIDByName(const String & database_name return std::make_pair(false, -1); } -SchemaDiff MockTiDB::getSchemaDiff(Int64 version_) +std::optional MockTiDB::getSchemaDiff(Int64 version_) { return version_diff[version_]; } +bool MockTiDB::checkSchemaDiffExists(Int64 version) +{ + return version_diff.find(version) != version_diff.end(); +} + } // namespace DB diff --git a/dbms/src/Debug/MockTiDB.h b/dbms/src/Debug/MockTiDB.h index 36d2af90859..261e547b13a 100644 --- a/dbms/src/Debug/MockTiDB.h +++ b/dbms/src/Debug/MockTiDB.h @@ -127,7 +127,9 @@ class MockTiDB : public ext::Singleton std::pair getDBIDByName(const String & database_name); - SchemaDiff getSchemaDiff(Int64 version); + bool checkSchemaDiffExists(Int64 version); + + std::optional getSchemaDiff(Int64 version); std::unordered_map getDatabases() { return databases; } diff --git a/dbms/src/Debug/astToExecutor.cpp b/dbms/src/Debug/astToExecutor.cpp index fec76d7a085..61f4474f919 100644 --- a/dbms/src/Debug/astToExecutor.cpp +++ b/dbms/src/Debug/astToExecutor.cpp @@ -170,6 +170,7 @@ std::unordered_map func_name_to_sig({ {"cast_decimal_datetime", tipb::ScalarFuncSig::CastDecimalAsTime}, {"cast_time_datetime", tipb::ScalarFuncSig::CastTimeAsTime}, {"cast_string_datetime", tipb::ScalarFuncSig::CastStringAsTime}, + {"concat", tipb::ScalarFuncSig::Concat}, {"round_int", tipb::ScalarFuncSig::RoundInt}, {"round_uint", tipb::ScalarFuncSig::RoundInt}, {"round_dec", tipb::ScalarFuncSig::RoundDec}, @@ -461,6 +462,14 @@ void functionToPB(const DAGSchema & input, ASTFunction * func, tipb::Expr * expr ft->set_collate(collator_id); break; } + case tipb::ScalarFuncSig::Concat: + { + expr->set_sig(it_sig->second); + auto * ft = expr->mutable_field_type(); + ft->set_tp(TiDB::TypeString); + ft->set_collate(collator_id); + break; + } case tipb::ScalarFuncSig::RoundInt: case tipb::ScalarFuncSig::RoundWithFracInt: { @@ -851,6 +860,7 @@ bool ExchangeReceiver::toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t c { tipb_executor->set_tp(tipb::ExecType::TypeExchangeReceiver); tipb_executor->set_executor_id(name); + tipb_executor->set_fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count); tipb::ExchangeReceiver * exchange_receiver = tipb_executor->mutable_exchange_receiver(); for (auto & field : output_schema) { @@ -1354,6 +1364,7 @@ bool Window::toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t collator_id { tipb_executor->set_tp(tipb::ExecType::TypeWindow); tipb_executor->set_executor_id(name); + tipb_executor->set_fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count); tipb::Window * window = tipb_executor->mutable_window(); auto & input_schema = children[0]->output_schema; for (const auto & expr : func_descs) @@ -1430,6 +1441,7 @@ bool Sort::toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t collator_id, { tipb_executor->set_tp(tipb::ExecType::TypeSort); tipb_executor->set_executor_id(name); + 
tipb_executor->set_fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count); tipb::Sort * sort = tipb_executor->mutable_sort(); sort->set_ispartialsort(is_partial_sort); @@ -1545,7 +1557,7 @@ ExecutorPtr compileAggregation(ExecutorPtr input, size_t & executor_index, ASTPt ci.tp = TiDB::TypeLongLong; ci.flag = TiDB::ColumnFlagUnsigned | TiDB::ColumnFlagNotNull; } - else if (func->name == "max" || func->name == "min" || func->name == "first_row") + else if (func->name == "max" || func->name == "min" || func->name == "first_row" || func->name == "sum") { ci = children_ci[0]; ci.flag &= ~TiDB::ColumnFlagNotNull; @@ -1629,7 +1641,6 @@ ExecutorPtr compileProject(ExecutorPtr input, size_t & executor_index, ASTPtr se } } } - auto project = std::make_shared(executor_index, output_schema, std::move(exprs)); project->children.push_back(input); return project; @@ -1666,13 +1677,13 @@ ExecutorPtr compileExchangeSender(ExecutorPtr input, size_t & executor_index, ti return exchange_sender; } -ExecutorPtr compileExchangeReceiver(size_t & executor_index, DAGSchema schema) +ExecutorPtr compileExchangeReceiver(size_t & executor_index, DAGSchema schema, uint64_t fine_grained_shuffle_stream_count) { - ExecutorPtr exchange_receiver = std::make_shared(executor_index, schema); + ExecutorPtr exchange_receiver = std::make_shared(executor_index, schema, fine_grained_shuffle_stream_count); return exchange_receiver; } -ExecutorPtr compileWindow(ExecutorPtr input, size_t & executor_index, ASTPtr func_desc_list, ASTPtr partition_by_expr_list, ASTPtr order_by_expr_list, mock::MockWindowFrame frame) +ExecutorPtr compileWindow(ExecutorPtr input, size_t & executor_index, ASTPtr func_desc_list, ASTPtr partition_by_expr_list, ASTPtr order_by_expr_list, mock::MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count) { std::vector partition_columns; if (partition_by_expr_list != nullptr) @@ -1740,12 +1751,13 @@ ExecutorPtr compileWindow(ExecutorPtr input, size_t & executor_index, ASTPtr fun window_exprs, std::move(partition_columns), std::move(order_columns), - frame); + frame, + fine_grained_shuffle_stream_count); window->children.push_back(input); return window; } -ExecutorPtr compileSort(ExecutorPtr input, size_t & executor_index, ASTPtr order_by_expr_list, bool is_partial_sort) +ExecutorPtr compileSort(ExecutorPtr input, size_t & executor_index, ASTPtr order_by_expr_list, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count) { std::vector order_columns; if (order_by_expr_list != nullptr) @@ -1759,8 +1771,8 @@ ExecutorPtr compileSort(ExecutorPtr input, size_t & executor_index, ASTPtr order compileExpr(input->output_schema, elem->children[0]); } } - ExecutorPtr sort = std::make_shared(executor_index, input->output_schema, std::move(order_columns), is_partial_sort); + ExecutorPtr sort = std::make_shared(executor_index, input->output_schema, std::move(order_columns), is_partial_sort, fine_grained_shuffle_stream_count); sort->children.push_back(input); return sort; } -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Debug/astToExecutor.h b/dbms/src/Debug/astToExecutor.h index 4d87c0db77e..f39f4059d26 100644 --- a/dbms/src/Debug/astToExecutor.h +++ b/dbms/src/Debug/astToExecutor.h @@ -139,8 +139,11 @@ struct ExchangeSender : Executor struct ExchangeReceiver : Executor { TaskMetas task_metas; - ExchangeReceiver(size_t & index, const DAGSchema & output) + uint64_t fine_grained_shuffle_stream_count; + + ExchangeReceiver(size_t & index, const DAGSchema & output, 
uint64_t fine_grained_shuffle_stream_count_ = 0) : Executor(index, "exchange_receiver_" + std::to_string(index), output) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) {} void columnPrune(std::unordered_set &) override { throw Exception("Should not reach here"); } bool toTiPBExecutor(tipb::Executor * tipb_executor, uint32_t collator_id, const MPPInfo & mpp_info, const Context &) override; @@ -292,13 +295,15 @@ struct Window : Executor std::vector partition_by_exprs; std::vector order_by_exprs; MockWindowFrame frame; + uint64_t fine_grained_shuffle_stream_count; - Window(size_t & index_, const DAGSchema & output_schema_, std::vector func_descs_, std::vector partition_by_exprs_, std::vector order_by_exprs_, MockWindowFrame frame_) + Window(size_t & index_, const DAGSchema & output_schema_, std::vector func_descs_, std::vector partition_by_exprs_, std::vector order_by_exprs_, MockWindowFrame frame_, uint64_t fine_grained_shuffle_stream_count_ = 0) : Executor(index_, "window_" + std::to_string(index_), output_schema_) , func_descs(std::move(func_descs_)) , partition_by_exprs(std::move(partition_by_exprs_)) , order_by_exprs(order_by_exprs_) , frame(frame_) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) { } // Currently only use Window Executor in Unit Test which don't call columnPrume. @@ -311,11 +316,13 @@ struct Sort : Executor { std::vector by_exprs; bool is_partial_sort; + uint64_t fine_grained_shuffle_stream_count; - Sort(size_t & index_, const DAGSchema & output_schema_, std::vector by_exprs_, bool is_partial_sort_) + Sort(size_t & index_, const DAGSchema & output_schema_, std::vector by_exprs_, bool is_partial_sort_, uint64_t fine_grained_shuffle_stream_count_ = 0) : Executor(index_, "sort_" + std::to_string(index_), output_schema_) , by_exprs(by_exprs_) , is_partial_sort(is_partial_sort_) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) { } // Currently only use Sort Executor in Unit Test which don't call columnPrume. 
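The MockSchemaGetter/MockTiDB change earlier in this patch switches getSchemaDiff to return std::optional and adds checkSchemaDiffExists for presence checks. Below is a minimal sketch of the optional-returning shape; `SchemaDiffSketch` and `getSchemaDiffSketch` are illustrative stand-ins, since the real SchemaDiff type is defined elsewhere.

```cpp
#include <map>
#include <optional>

// SchemaDiffSketch stands in for the real SchemaDiff defined elsewhere.
struct SchemaDiffSketch
{
    long version = 0;
};

// A find-based lookup returning std::nullopt for unknown versions.
std::optional<SchemaDiffSketch> getSchemaDiffSketch(
    const std::map<long, SchemaDiffSketch> & version_diff,
    long version)
{
    auto it = version_diff.find(version);
    if (it == version_diff.end())
        return std::nullopt;
    return it->second;
}
```

Note that the patch itself indexes `version_diff[version_]` directly and pairs it with the separate `checkSchemaDiffExists`, so callers there are expected to verify existence first.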
@@ -343,11 +350,11 @@ ExecutorPtr compileJoin(size_t & executor_index, ExecutorPtr left, ExecutorPtr r ExecutorPtr compileExchangeSender(ExecutorPtr input, size_t & executor_index, tipb::ExchangeType exchange_type); -ExecutorPtr compileExchangeReceiver(size_t & executor_index, DAGSchema schema); +ExecutorPtr compileExchangeReceiver(size_t & executor_index, DAGSchema schema, uint64_t fine_grained_shuffle_stream_count = 0); -ExecutorPtr compileWindow(ExecutorPtr input, size_t & executor_index, ASTPtr func_desc_list, ASTPtr partition_by_expr_list, ASTPtr order_by_expr_list, mock::MockWindowFrame frame); +ExecutorPtr compileWindow(ExecutorPtr input, size_t & executor_index, ASTPtr func_desc_list, ASTPtr partition_by_expr_list, ASTPtr order_by_expr_list, mock::MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count = 0); -ExecutorPtr compileSort(ExecutorPtr input, size_t & executor_index, ASTPtr order_by_expr_list, bool is_partial_sort); +ExecutorPtr compileSort(ExecutorPtr input, size_t & executor_index, ASTPtr order_by_expr_list, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count = 0); void literalFieldToTiPBExpr(const ColumnInfo & ci, const Field & field, tipb::Expr * expr, Int32 collator_id); } // namespace DB diff --git a/dbms/src/Debug/dbgFuncCoprocessor.cpp b/dbms/src/Debug/dbgFuncCoprocessor.cpp index e9335d1e2bd..62a8b7537f1 100644 --- a/dbms/src/Debug/dbgFuncCoprocessor.cpp +++ b/dbms/src/Debug/dbgFuncCoprocessor.cpp @@ -290,8 +290,9 @@ BlockInputStreamPtr executeQuery(Context & context, RegionID region_id, const DA tipb_exchange_receiver.encoded_task_meta_size(), 10, /*req_id=*/"", - /*executor_id=*/""); - BlockInputStreamPtr ret = std::make_shared(exchange_receiver, /*req_id=*/"", /*executor_id=*/""); + /*executor_id=*/"", + /*fine_grained_shuffle_stream_count=*/0); + BlockInputStreamPtr ret = std::make_shared(exchange_receiver, /*req_id=*/"", /*executor_id=*/"", /*stream_id*/ 0); return ret; } else diff --git a/dbms/src/Debug/dbgFuncMockRaftCommand.cpp b/dbms/src/Debug/dbgFuncMockRaftCommand.cpp index df93ee1c78d..3626041f428 100644 --- a/dbms/src/Debug/dbgFuncMockRaftCommand.cpp +++ b/dbms/src/Debug/dbgFuncMockRaftCommand.cpp @@ -40,7 +40,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar auto & tmt = context.getTMTContext(); auto & kvstore = tmt.getKVStore(); - RegionID region_id = (RegionID)safeGet(typeid_cast(*args[0]).value); + auto region_id = static_cast(safeGet(typeid_cast(*args[0]).value)); const String & database_name = typeid_cast(*args[1]).name; const String & table_name = typeid_cast(*args[2]).name; auto table = MockTiDB::instance().getTableByName(database_name, table_name); @@ -49,7 +49,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar if (4 + handle_column_size * 4 != args.size()) throw Exception("Args not matched, should be: region-id1, database-name, table-name, start1, end1, start2, end2, region-id2", ErrorCodes::BAD_ARGUMENTS); - RegionID region_id2 = (RegionID)safeGet(typeid_cast(*args[args.size() - 1]).value); + auto region_id2 = static_cast(safeGet(typeid_cast(*args[args.size() - 1]).value)); auto table_id = table->id(); TiKVKey start_key1, start_key2, end_key1, end_key2; @@ -59,9 +59,17 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar std::vector start_keys2; std::vector end_keys1; std::vector end_keys2; + + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + 
column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } + for (size_t i = 0; i < handle_column_size; i++) { - auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset]; + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + auto & column_info = table_info.columns[idx]; auto start_field1 = RegionBench::convertField(column_info, typeid_cast(*args[3 + i]).value); TiDB::DatumBumpy start_datum1 = TiDB::DatumBumpy(start_field1, column_info.tp); @@ -88,10 +96,10 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar } else { - HandleID start1 = (HandleID)safeGet(typeid_cast(*args[3]).value); - HandleID end1 = (HandleID)safeGet(typeid_cast(*args[4]).value); - HandleID start2 = (HandleID)safeGet(typeid_cast(*args[5]).value); - HandleID end2 = (HandleID)safeGet(typeid_cast(*args[6]).value); + auto start1 = static_cast(safeGet(typeid_cast(*args[3]).value)); + auto end1 = static_cast(safeGet(typeid_cast(*args[4]).value)); + auto start2 = static_cast(safeGet(typeid_cast(*args[5]).value)); + auto end2 = static_cast(safeGet(typeid_cast(*args[6]).value)); start_key1 = RecordKVFormat::genKey(table_id, start1); start_key2 = RecordKVFormat::genKey(table_id, start2); end_key1 = RecordKVFormat::genKey(table_id, end1); @@ -110,7 +118,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar request.set_cmd_type(raft_cmdpb::AdminCmdType::BatchSplit); raft_cmdpb::BatchSplitResponse * splits = response.mutable_splits(); { - auto region = splits->add_regions(); + auto * region = splits->add_regions(); region->set_id(region_id); region->set_start_key(start_key1); region->set_end_key(end_key1); @@ -118,7 +126,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar *region->mutable_region_epoch() = new_epoch; } { - auto region = splits->add_regions(); + auto * region = splits->add_regions(); region->set_id(region_id2); region->set_start_key(start_key2); region->set_end_key(end_key2); @@ -144,8 +152,8 @@ void MockRaftCommand::dbgFuncPrepareMerge(Context & context, const ASTs & args, throw Exception("Args not matched, should be: source-id1, target-id2", ErrorCodes::BAD_ARGUMENTS); } - RegionID region_id = (RegionID)safeGet(typeid_cast(*args[0]).value); - RegionID target_id = (RegionID)safeGet(typeid_cast(*args[1]).value); + auto region_id = static_cast(safeGet(typeid_cast(*args[0]).value)); + auto target_id = static_cast(safeGet(typeid_cast(*args[1]).value)); auto & tmt = context.getTMTContext(); auto & kvstore = tmt.getKVStore(); @@ -157,7 +165,7 @@ void MockRaftCommand::dbgFuncPrepareMerge(Context & context, const ASTs & args, { request.set_cmd_type(raft_cmdpb::AdminCmdType::PrepareMerge); - auto prepare_merge = request.mutable_prepare_merge(); + auto * prepare_merge = request.mutable_prepare_merge(); { auto min_index = region->appliedIndex(); prepare_merge->set_min_index(min_index); @@ -184,8 +192,8 @@ void MockRaftCommand::dbgFuncCommitMerge(Context & context, const ASTs & args, D throw Exception("Args not matched, should be: source-id1, current-id2", ErrorCodes::BAD_ARGUMENTS); } - RegionID source_id = (RegionID)safeGet(typeid_cast(*args[0]).value); - RegionID current_id = (RegionID)safeGet(typeid_cast(*args[1]).value); + auto source_id = static_cast(safeGet(typeid_cast(*args[0]).value)); + auto current_id = static_cast(safeGet(typeid_cast(*args[1]).value)); auto & tmt = context.getTMTContext(); auto & kvstore = tmt.getKVStore(); @@ 
-196,7 +204,7 @@ void MockRaftCommand::dbgFuncCommitMerge(Context & context, const ASTs & args, D { request.set_cmd_type(raft_cmdpb::AdminCmdType::CommitMerge); - auto commit_merge = request.mutable_commit_merge(); + auto * commit_merge = request.mutable_commit_merge(); { commit_merge->set_commit(source_region->appliedIndex()); *commit_merge->mutable_source() = source_region->getMetaRegion(); @@ -220,7 +228,7 @@ void MockRaftCommand::dbgFuncRollbackMerge(Context & context, const ASTs & args, throw Exception("Args not matched, should be: region-id", ErrorCodes::BAD_ARGUMENTS); } - RegionID region_id = (RegionID)safeGet(typeid_cast(*args[0]).value); + auto region_id = static_cast(safeGet(typeid_cast(*args[0]).value)); auto & tmt = context.getTMTContext(); auto & kvstore = tmt.getKVStore(); @@ -231,7 +239,7 @@ void MockRaftCommand::dbgFuncRollbackMerge(Context & context, const ASTs & args, { request.set_cmd_type(raft_cmdpb::AdminCmdType::RollbackMerge); - auto rollback_merge = request.mutable_rollback_merge(); + auto * rollback_merge = request.mutable_rollback_merge(); { auto merge_state = region->getMergeState(); rollback_merge->set_commit(merge_state.commit()); diff --git a/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp b/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp index 9d5b848ddea..b5d3f252d0a 100644 --- a/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp +++ b/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp @@ -68,6 +68,12 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args) size_t handle_column_size = is_common_handle ? table_info.getPrimaryIndexInfo().idx_cols.size() : 1; RegionPtr region; + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } + if (!is_common_handle) { auto start = static_cast(safeGet(typeid_cast(*args[3]).value)); @@ -81,7 +87,8 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args) std::vector end_keys; for (size_t i = 0; i < handle_column_size; i++) { - auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset]; + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + auto & column_info = table_info.columns[idx]; auto start_field = RegionBench::convertField(column_info, typeid_cast(*args[3 + i]).value); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); start_keys.emplace_back(start_datum.field()); @@ -122,9 +129,9 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args) std::vector keys; // handle key for (size_t i = 0; i < table_info.getPrimaryIndexInfo().idx_cols.size(); i++) { - auto & idx_col = table_info.getPrimaryIndexInfo().idx_cols[i]; - auto & column_info = table_info.columns[idx_col.offset]; - auto start_field = RegionBench::convertField(column_info, fields[idx_col.offset]); + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + auto & column_info = table_info.columns[idx]; + auto start_field = RegionBench::convertField(column_info, fields[idx]); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); keys.emplace_back(start_datum.field()); } @@ -198,9 +205,16 @@ void MockRaftCommand::dbgFuncRegionSnapshot(Context & context, const ASTs & args // Get start key and end key form multiple column if it is clustered_index. 
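Several hunks in this patch replace offset-based primary-index lookups (`idx_cols[i].offset`) with a map keyed by column name, presumably because the offset stored in the index info can fall out of sync with the column's actual position in `table_info.columns`. A hedged sketch of that pattern, with stand-in types (`ColumnInfoSketch` and `resolvePrimaryKeyColumn` are illustrative, not TiFlash names):

```cpp
#include <cstddef>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// ColumnInfoSketch stands in for TiDB::ColumnInfo; only the name matters here.
struct ColumnInfoSketch
{
    std::string name;
};

// Build the name -> position map once, then resolve a primary-index column by
// name; a missing column means the metadata is inconsistent, so fail loudly.
std::size_t resolvePrimaryKeyColumn(
    const std::vector<ColumnInfoSketch> & columns,
    const std::string & idx_col_name)
{
    std::unordered_map<std::string, std::size_t> column_name_columns_index_map;
    for (std::size_t i = 0; i < columns.size(); ++i)
        column_name_columns_index_map.emplace(columns[i].name, i);

    auto it = column_name_columns_index_map.find(idx_col_name);
    if (it == column_name_columns_index_map.end())
        throw std::runtime_error("primary index column not found: " + idx_col_name);
    return it->second;
}
```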
std::vector start_keys; std::vector end_keys; + + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } for (size_t i = 0; i < handle_column_size; i++) { - const auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset]; + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + const auto & column_info = table_info.columns[idx]; auto start_field = RegionBench::convertField(column_info, typeid_cast(*args[1 + i]).value); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); start_keys.emplace_back(start_datum.field()); diff --git a/dbms/src/Debug/dbgFuncRegion.cpp b/dbms/src/Debug/dbgFuncRegion.cpp index b2024eac1d8..f65a18b8fd0 100644 --- a/dbms/src/Debug/dbgFuncRegion.cpp +++ b/dbms/src/Debug/dbgFuncRegion.cpp @@ -61,9 +61,15 @@ void dbgFuncPutRegion(Context & context, const ASTs & args, DBGInvoker::Printer { std::vector start_keys; std::vector end_keys; + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } for (size_t i = 0; i < handle_column_size; i++) { - const auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset]; + auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + const auto & column_info = table_info.columns[idx]; auto start_field = RegionBench::convertField(column_info, typeid_cast(*args[1 + i]).value); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); start_keys.emplace_back(start_datum.field()); diff --git a/dbms/src/Debug/dbgFuncSchema.cpp b/dbms/src/Debug/dbgFuncSchema.cpp index c388015dc10..9ef07f16e8b 100644 --- a/dbms/src/Debug/dbgFuncSchema.cpp +++ b/dbms/src/Debug/dbgFuncSchema.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -137,4 +138,5 @@ void dbgFuncIsTombstone(Context & context, const ASTs & args, DBGInvoker::Printe output(fmt_buf.toString()); } + } // namespace DB \ No newline at end of file diff --git a/dbms/src/Debug/dbgFuncSchema.h b/dbms/src/Debug/dbgFuncSchema.h index 162bc0af46b..51ab3ad41cf 100644 --- a/dbms/src/Debug/dbgFuncSchema.h +++ b/dbms/src/Debug/dbgFuncSchema.h @@ -46,5 +46,4 @@ void dbgFuncResetSchemas(Context & context, const ASTs & args, DBGInvoker::Print // Usage: // ./storage-client.sh "DBGInvoke is_tombstone(db_name, table_name)" void dbgFuncIsTombstone(Context & context, const ASTs & args, DBGInvoker::Printer output); - } // namespace DB diff --git a/dbms/src/Debug/dbgFuncSchemaName.cpp b/dbms/src/Debug/dbgFuncSchemaName.cpp index 4c2ad86bd62..3aa7b6e3af4 100644 --- a/dbms/src/Debug/dbgFuncSchemaName.cpp +++ b/dbms/src/Debug/dbgFuncSchemaName.cpp @@ -128,4 +128,109 @@ BlockInputStreamPtr dbgFuncQueryMapped(Context & context, const ASTs & args) return executeQuery(query, context, true).in; } + +void dbgFuncGetTiflashReplicaCount(Context & context, const ASTs & args, DBGInvoker::Printer output) +{ + if (args.empty() || args.size() != 2) + throw Exception("Args not matched, should be: database-name[, table-name]", ErrorCodes::BAD_ARGUMENTS); + + const String & database_name = typeid_cast(*args[0]).name; + FmtBuffer fmt_buf; + + const String & table_name = typeid_cast(*args[1]).name; + auto mapped = mappedTable(context, database_name, 
table_name); + auto storage = context.getTable(mapped->first, mapped->second); + auto managed_storage = std::dynamic_pointer_cast(storage); + if (!managed_storage) + throw Exception(database_name + "." + table_name + " is not ManageableStorage", ErrorCodes::BAD_ARGUMENTS); + + fmt_buf.append((std::to_string(managed_storage->getTableInfo().replica_info.count))); + + output(fmt_buf.toString()); +} + +void dbgFuncGetPartitionTablesTiflashReplicaCount(Context & context, const ASTs & args, DBGInvoker::Printer output) +{ + if (args.empty() || args.size() != 2) + throw Exception("Args not matched, should be: database-name[, table-name]", ErrorCodes::BAD_ARGUMENTS); + + const String & database_name = typeid_cast(*args[0]).name; + FmtBuffer fmt_buf; + + const String & table_name = typeid_cast(*args[1]).name; + auto mapped = mappedTable(context, database_name, table_name); + auto storage = context.getTable(mapped->first, mapped->second); + auto managed_storage = std::dynamic_pointer_cast(storage); + if (!managed_storage) + throw Exception(database_name + "." + table_name + " is not ManageableStorage", ErrorCodes::BAD_ARGUMENTS); + + auto table_info = managed_storage->getTableInfo(); + + if (!table_info.isLogicalPartitionTable()) + throw Exception(database_name + "." + table_name + " is not a logical partition table", ErrorCodes::BAD_ARGUMENTS); + + SchemaNameMapper name_mapper; + for (const auto & part_def : table_info.partition.definitions) + { + auto partition_table_info = table_info.producePartitionTableInfo(part_def.id, name_mapper); + auto partition_storage = context.getTMTContext().getStorages().get(partition_table_info->id); + fmt_buf.append((std::to_string(partition_storage->getTableInfo().replica_info.count))); + fmt_buf.append("/"); + } + + output(fmt_buf.toString()); +} + +void dbgFuncGetTiflashMode(Context & context, const ASTs & args, DBGInvoker::Printer output) +{ + if (args.empty() || args.size() != 2) + throw Exception("Args not matched, should be: database-name[, table-name]", ErrorCodes::BAD_ARGUMENTS); + + const String & database_name = typeid_cast(*args[0]).name; + FmtBuffer fmt_buf; + + const String & table_name = typeid_cast(*args[1]).name; + auto mapped = mappedTable(context, database_name, table_name); + auto storage = context.getTable(mapped->first, mapped->second); + auto managed_storage = std::dynamic_pointer_cast(storage); + if (!managed_storage) + throw Exception(database_name + "." + table_name + " is not ManageableStorage", ErrorCodes::BAD_ARGUMENTS); + + fmt_buf.append((TiFlashModeToString(managed_storage->getTableInfo().tiflash_mode))); + + output(fmt_buf.toString()); +} + +void dbgFuncGetPartitionTablesTiflashMode(Context & context, const ASTs & args, DBGInvoker::Printer output) +{ + if (args.empty() || args.size() != 2) + throw Exception("Args not matched, should be: database-name[, table-name]", ErrorCodes::BAD_ARGUMENTS); + + const String & database_name = typeid_cast(*args[0]).name; + FmtBuffer fmt_buf; + + const String & table_name = typeid_cast(*args[1]).name; + auto mapped = mappedTable(context, database_name, table_name); + auto storage = context.getTable(mapped->first, mapped->second); + auto managed_storage = std::dynamic_pointer_cast(storage); + if (!managed_storage) + throw Exception(database_name + "." 
+ table_name + " is not logical partition table", ErrorCodes::BAD_ARGUMENTS); + + SchemaNameMapper name_mapper; + for (const auto & part_def : table_info.partition.definitions) + { + auto paritition_table_info = table_info.producePartitionTableInfo(part_def.id, name_mapper); + auto partition_storage = context.getTMTContext().getStorages().get(paritition_table_info->id); + fmt_buf.append((TiFlashModeToString(partition_storage->getTableInfo().tiflash_mode))); + fmt_buf.append("/"); + } + + output(fmt_buf.toString()); +} + } // namespace DB diff --git a/dbms/src/Debug/dbgFuncSchemaName.h b/dbms/src/Debug/dbgFuncSchemaName.h index 8e95aaab908..ec18f89e911 100644 --- a/dbms/src/Debug/dbgFuncSchemaName.h +++ b/dbms/src/Debug/dbgFuncSchemaName.h @@ -40,4 +40,24 @@ void dbgFuncMappedTable(Context & context, const ASTs & args, DBGInvoker::Printe // ./storage-client.sh "DBGInvoke query_mapped('select * from $d.$t', database_name[, table_name])" BlockInputStreamPtr dbgFuncQueryMapped(Context & context, const ASTs & args); +// Get table's tiflash replica counts with mapped table name +// Usage: +// ./storage-client.sh "DBGInvoke get_tiflash_replica_count(db_name, table_name)" +void dbgFuncGetTiflashReplicaCount(Context & context, const ASTs & args, DBGInvoker::Printer output); + +// Get the logical table's partition tables' tiflash replica counts with mapped table name +// Usage: +// ./storage-client.sh "DBGInvoke get_partition_tables_tiflash_replica_count(db_name, table_name)" +void dbgFuncGetPartitionTablesTiflashReplicaCount(Context & context, const ASTs & args, DBGInvoker::Printer output); + +// Get table's tiflash mode with mapped table name +// Usage: +// ./storage-client.sh "DBGInvoke get_tiflash_mode(db_name, table_name)" +void dbgFuncGetTiflashMode(Context & context, const ASTs & args, DBGInvoker::Printer output); + +// Get the logical table's partition tables' tiflash replica counts with mapped table name +// Usage: +// ./storage-client.sh "DBGInvoke get_partition_tables_tiflash_mode(db_name, table_name)" +void dbgFuncGetPartitionTablesTiflashMode(Context & context, const ASTs & args, DBGInvoker::Printer output); + } // namespace DB diff --git a/dbms/src/Debug/dbgTools.cpp b/dbms/src/Debug/dbgTools.cpp index 685b2563a3b..854d8a18bd5 100644 --- a/dbms/src/Debug/dbgTools.cpp +++ b/dbms/src/Debug/dbgTools.cpp @@ -310,7 +310,7 @@ void insert( // // Parse the fields in the inserted row std::vector fields; { - for (ASTs::const_iterator it = values_begin; it != values_end; ++it) + for (auto it = values_begin; it != values_end; ++it) { auto field = typeid_cast((*it).get())->value; fields.emplace_back(field); @@ -330,11 +330,18 @@ void insert( // if (table_info.is_common_handle) { std::vector keys; + + std::unordered_map column_name_columns_index_map; + for (size_t i = 0; i < table_info.columns.size(); i++) + { + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } + for (size_t i = 0; i < table_info.getPrimaryIndexInfo().idx_cols.size(); i++) { - const auto & idx_col = table_info.getPrimaryIndexInfo().idx_cols[i]; - const auto & column_info = table_info.columns[idx_col.offset]; - auto start_field = RegionBench::convertField(column_info, fields[idx_col.offset]); + const auto & col_idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name]; + const auto & column_info = table_info.columns[col_idx]; + auto start_field = RegionBench::convertField(column_info, fields[col_idx]); TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp); 
keys.emplace_back(start_datum.field()); } diff --git a/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp b/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp index a1c6061948a..1609c83b029 100644 --- a/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp +++ b/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -41,7 +40,7 @@ extern const int NOT_IMPLEMENTED; const IColumn * getNestedCol(const IColumn * flash_col) { if (flash_col->isColumnNullable()) - return dynamic_cast(flash_col)->getNestedColumnPtr().get(); + return static_cast(flash_col)->getNestedColumnPtr().get(); else return flash_col; } @@ -75,8 +74,8 @@ bool flashDecimalColToArrowColInternal( const IColumn * nested_col = getNestedCol(flash_col_untyped); if (checkColumn>(nested_col) && checkDataType>(data_type)) { - const ColumnDecimal * flash_col = checkAndGetColumn>(nested_col); - const DataTypeDecimal * type = checkAndGetDataType>(data_type); + const auto * flash_col = checkAndGetColumn>(nested_col); + const auto * type = checkAndGetDataType>(data_type); UInt32 scale = type->getScale(); for (size_t i = start_index; i < end_index; i++) { @@ -92,8 +91,8 @@ bool flashDecimalColToArrowColInternal( std::vector digits; digits.reserve(type->getPrec()); decimalToVector(dec.value, digits, scale); - TiDBDecimal tiDecimal(scale, digits, dec.value < 0); - dag_column.append(tiDecimal); + TiDBDecimal ti_decimal(scale, digits, dec.value < 0); + dag_column.append(ti_decimal); } return true; } @@ -121,7 +120,7 @@ template bool flashIntegerColToArrowColInternal(TiDBColumn & dag_column, const IColumn * flash_col_untyped, size_t start_index, size_t end_index) { const IColumn * nested_col = getNestedCol(flash_col_untyped); - if (const ColumnVector * flash_col = checkAndGetColumn>(nested_col)) + if (const auto * flash_col = checkAndGetColumn>(nested_col)) { constexpr bool is_unsigned = std::is_unsigned_v; for (size_t i = start_index; i < end_index; i++) @@ -135,9 +134,9 @@ bool flashIntegerColToArrowColInternal(TiDBColumn & dag_column, const IColumn * } } if constexpr (is_unsigned) - dag_column.append((UInt64)flash_col->getElement(i)); + dag_column.append(static_cast(flash_col->getElement(i))); else - dag_column.append((Int64)flash_col->getElement(i)); + dag_column.append(static_cast(flash_col->getElement(i))); } return true; } @@ -148,7 +147,7 @@ template void flashDoubleColToArrowCol(TiDBColumn & dag_column, const IColumn * flash_col_untyped, size_t start_index, size_t end_index) { const IColumn * nested_col = getNestedCol(flash_col_untyped); - if (const ColumnVector * flash_col = checkAndGetColumn>(nested_col)) + if (const auto * flash_col = checkAndGetColumn>(nested_col)) { for (size_t i = start_index; i < end_index; i++) { @@ -160,7 +159,7 @@ void flashDoubleColToArrowCol(TiDBColumn & dag_column, const IColumn * flash_col continue; } } - dag_column.append((T)flash_col->getElement(i)); + dag_column.append(static_cast(flash_col->getElement(i))); } return; } @@ -196,7 +195,7 @@ void flashDateOrDateTimeColToArrowCol( { const IColumn * nested_col = getNestedCol(flash_col_untyped); using DateFieldType = DataTypeMyTimeBase::FieldType; - auto * flash_col = checkAndGetColumn>(nested_col); + const auto * flash_col = checkAndGetColumn>(nested_col); for (size_t i = start_index; i < end_index; i++) { if constexpr (is_nullable) @@ -217,7 +216,7 @@ void flashStringColToArrowCol(TiDBColumn & dag_column, const IColumn * flash_col { const IColumn * nested_col = getNestedCol(flash_col_untyped); // 
columnFixedString is not used so do not check it - auto * flash_col = checkAndGetColumn(nested_col); + const auto * flash_col = checkAndGetColumn(nested_col); for (size_t i = start_index; i < end_index; i++) { // todo check if we can convert flash_col to DAG col directly since the internal representation is almost the same @@ -242,7 +241,7 @@ void flashBitColToArrowCol( const tipb::FieldType & field_type) { const IColumn * nested_col = getNestedCol(flash_col_untyped); - auto * flash_col = checkAndGetColumn>(nested_col); + const auto * flash_col = checkAndGetColumn>(nested_col); for (size_t i = start_index; i < end_index; i++) { if constexpr (is_nullable) @@ -267,7 +266,7 @@ void flashEnumColToArrowCol( const IDataType * data_type) { const IColumn * nested_col = getNestedCol(flash_col_untyped); - auto * flash_col = checkAndGetColumn>(nested_col); + const auto * flash_col = checkAndGetColumn>(nested_col); const auto * enum_type = checkAndGetDataType(data_type); size_t enum_value_size = enum_type->getValues().size(); for (size_t i = start_index; i < end_index; i++) @@ -280,10 +279,10 @@ void flashEnumColToArrowCol( continue; } } - auto enum_value = (UInt64)flash_col->getElement(i); + auto enum_value = static_cast(flash_col->getElement(i)); if (enum_value == 0 || enum_value > enum_value_size) throw TiFlashException("number of enum overflow enum boundary", Errors::Coprocessor::Internal); - TiDBEnum ti_enum(enum_value, enum_type->getNameForValue((const DataTypeEnum16::FieldType)enum_value)); + TiDBEnum ti_enum(enum_value, enum_type->getNameForValue(static_cast(enum_value))); dag_column.append(ti_enum); } } @@ -300,7 +299,7 @@ void flashColToArrowCol(TiDBColumn & dag_column, const ColumnWithTypeAndName & f throw TiFlashException("Flash column and TiDB column has different not null flag", Errors::Coprocessor::Internal); } if (type->isNullable()) - type = dynamic_cast(type)->getNestedType().get(); + type = static_cast(type)->getNestedType().get(); switch (tidb_column_info.tp) { @@ -457,7 +456,7 @@ const char * arrowEnumColToFlashCol( { if (checkNull(i, null_count, null_bitmap, col)) continue; - const auto enum_value = (Int64)toLittleEndian(*(reinterpret_cast(pos + offsets[i]))); + const auto enum_value = static_cast(toLittleEndian(*(reinterpret_cast(pos + offsets[i])))); col.column->assumeMutable()->insert(Field(enum_value)); } return pos + offsets[length]; @@ -479,11 +478,11 @@ const char * arrowBitColToFlashCol( continue; const String value = String(pos + offsets[i], pos + offsets[i + 1]); if (value.length() == 0) - col.column->assumeMutable()->insert(Field(UInt64(0))); + col.column->assumeMutable()->insert(Field(static_cast(0))); UInt64 result = 0; - for (auto & c : value) + for (const auto & c : value) { - result = (result << 8u) | (UInt8)c; + result = (result << 8u) | static_cast(c); } col.column->assumeMutable()->insert(Field(result)); } @@ -500,7 +499,7 @@ T toCHDecimal(UInt8 digits_int, UInt8 digits_frac, bool negative, const Int32 * UInt8 tailing_digit = digits_frac % DIGITS_PER_WORD; typename T::NativeType value = 0; - const int word_max = int(1e9); + const int word_max = static_cast(1e9); for (int i = 0; i < word_int; i++) { value = value * word_max + word_buf[i]; @@ -552,28 +551,28 @@ const char * arrowDecimalColToFlashCol( pos += 1; Int32 word_buf[MAX_WORD_BUF_LEN]; const DataTypePtr decimal_type - = col.type->isNullable() ? dynamic_cast(col.type.get())->getNestedType() : col.type; - for (int j = 0; j < MAX_WORD_BUF_LEN; j++) + = col.type->isNullable() ? 
static_cast(col.type.get())->getNestedType() : col.type; + for (int & j : word_buf) { - word_buf[j] = toLittleEndian(*(reinterpret_cast(pos))); + j = toLittleEndian(*(reinterpret_cast(pos))); pos += 4; } - if (auto * type32 = checkDecimal(*decimal_type)) + if (const auto * type32 = checkDecimal(*decimal_type)) { auto res = toCHDecimal(digits_int, digits_frac, negative, word_buf); col.column->assumeMutable()->insert(DecimalField(res, type32->getScale())); } - else if (auto * type64 = checkDecimal(*decimal_type)) + else if (const auto * type64 = checkDecimal(*decimal_type)) { auto res = toCHDecimal(digits_int, digits_frac, negative, word_buf); col.column->assumeMutable()->insert(DecimalField(res, type64->getScale())); } - else if (auto * type128 = checkDecimal(*decimal_type)) + else if (const auto * type128 = checkDecimal(*decimal_type)) { auto res = toCHDecimal(digits_int, digits_frac, negative, word_buf); col.column->assumeMutable()->insert(DecimalField(res, type128->getScale())); } - else if (auto * type256 = checkDecimal(*decimal_type)) + else if (const auto * type256 = checkDecimal(*decimal_type)) { auto res = toCHDecimal(digits_int, digits_frac, negative, word_buf); col.column->assumeMutable()->insert(DecimalField(res, type256->getScale())); @@ -600,13 +599,13 @@ const char * arrowDateColToFlashCol( continue; } UInt64 chunk_time = toLittleEndian(*(reinterpret_cast(pos))); - UInt16 year = (UInt16)((chunk_time & MyTimeBase::YEAR_BIT_FIELD_MASK) >> MyTimeBase::YEAR_BIT_FIELD_OFFSET); - UInt8 month = (UInt8)((chunk_time & MyTimeBase::MONTH_BIT_FIELD_MASK) >> MyTimeBase::MONTH_BIT_FIELD_OFFSET); - UInt8 day = (UInt8)((chunk_time & MyTimeBase::DAY_BIT_FIELD_MASK) >> MyTimeBase::DAY_BIT_FIELD_OFFSET); - UInt16 hour = (UInt16)((chunk_time & MyTimeBase::HOUR_BIT_FIELD_MASK) >> MyTimeBase::HOUR_BIT_FIELD_OFFSET); - UInt8 minute = (UInt8)((chunk_time & MyTimeBase::MINUTE_BIT_FIELD_MASK) >> MyTimeBase::MINUTE_BIT_FIELD_OFFSET); - UInt8 second = (UInt8)((chunk_time & MyTimeBase::SECOND_BIT_FIELD_MASK) >> MyTimeBase::SECOND_BIT_FIELD_OFFSET); - UInt32 micro_second = (UInt32)((chunk_time & MyTimeBase::MICROSECOND_BIT_FIELD_MASK) >> MyTimeBase::MICROSECOND_BIT_FIELD_OFFSET); + auto year = static_cast((chunk_time & MyTimeBase::YEAR_BIT_FIELD_MASK) >> MyTimeBase::YEAR_BIT_FIELD_OFFSET); + auto month = static_cast((chunk_time & MyTimeBase::MONTH_BIT_FIELD_MASK) >> MyTimeBase::MONTH_BIT_FIELD_OFFSET); + auto day = static_cast((chunk_time & MyTimeBase::DAY_BIT_FIELD_MASK) >> MyTimeBase::DAY_BIT_FIELD_OFFSET); + auto hour = static_cast((chunk_time & MyTimeBase::HOUR_BIT_FIELD_MASK) >> MyTimeBase::HOUR_BIT_FIELD_OFFSET); + auto minute = static_cast((chunk_time & MyTimeBase::MINUTE_BIT_FIELD_MASK) >> MyTimeBase::MINUTE_BIT_FIELD_OFFSET); + auto second = static_cast((chunk_time & MyTimeBase::SECOND_BIT_FIELD_MASK) >> MyTimeBase::SECOND_BIT_FIELD_OFFSET); + auto micro_second = static_cast((chunk_time & MyTimeBase::MICROSECOND_BIT_FIELD_MASK) >> MyTimeBase::MICROSECOND_BIT_FIELD_OFFSET); MyDateTime mt(year, month, day, hour, minute, second, micro_second); pos += field_length; col.column->assumeMutable()->insert(Field(mt.toPackedUInt())); @@ -659,7 +658,7 @@ const char * arrowNumColToFlashCol( case TiDB::TypeFloat: u32 = toLittleEndian(*(reinterpret_cast(pos))); std::memcpy(&f32, &u32, sizeof(Float32)); - col.column->assumeMutable()->insert(Field((Float64)f32)); + col.column->assumeMutable()->insert(Field(static_cast(f32))); break; case TiDB::TypeDouble: u64 = toLittleEndian(*(reinterpret_cast(pos))); diff --git 
a/dbms/src/Flash/Coprocessor/CoprocessorReader.h b/dbms/src/Flash/Coprocessor/CoprocessorReader.h index 25c07cff49c..b48fdbcd6dc 100644 --- a/dbms/src/Flash/Coprocessor/CoprocessorReader.h +++ b/dbms/src/Flash/Coprocessor/CoprocessorReader.h @@ -139,7 +139,8 @@ class CoprocessorReader return detail; } - CoprocessorReaderResult nextResult(std::queue & block_queue, const Block & header) + // stream_id is only meaningful for ExchangeReceiver. + CoprocessorReaderResult nextResult(std::queue & block_queue, const Block & header, size_t /*stream_id*/) { auto && [result, has_next] = resp_iter.next(); if (!result.error.empty()) diff --git a/dbms/src/Flash/Coprocessor/DAGContext.cpp b/dbms/src/Flash/Coprocessor/DAGContext.cpp index 1ef7338a589..1cf7a0d6c87 100644 --- a/dbms/src/Flash/Coprocessor/DAGContext.cpp +++ b/dbms/src/Flash/Coprocessor/DAGContext.cpp @@ -30,6 +30,8 @@ extern const int DIVIDED_BY_ZERO; extern const int INVALID_TIME; } // namespace ErrorCodes +const String enableFineGrainedShuffleExtraInfo = "enable fine grained shuffle"; + bool strictSqlMode(UInt64 sql_mode) { return sql_mode & TiDBSQLMode::STRICT_ALL_TABLES || sql_mode & TiDBSQLMode::STRICT_TRANS_TABLES; @@ -75,6 +77,11 @@ std::unordered_map & DAGContext::getProfileStreamsMap return profile_streams_map; } +void DAGContext::updateFinalConcurrency(size_t cur_streams_size, size_t streams_upper_limit) +{ + final_concurrency = std::min(std::max(final_concurrency, cur_streams_size), streams_upper_limit); +} + void DAGContext::initExecutorIdToJoinIdMap() { // only mpp task has join executor @@ -206,12 +213,20 @@ void DAGContext::attachBlockIO(const BlockIO & io_) io = io_; } -const std::unordered_map> & DAGContext::getMPPExchangeReceiverMap() const +ExchangeReceiverPtr DAGContext::getMPPExchangeReceiver(const String & executor_id) const { if (!isMPPTask()) throw TiFlashException("mpp_exchange_receiver_map is used in mpp only", Errors::Coprocessor::Internal); - RUNTIME_ASSERT(mpp_exchange_receiver_map != nullptr, log, "MPPTask without exchange receiver map"); - return *mpp_exchange_receiver_map; + RUNTIME_ASSERT(mpp_receiver_set != nullptr, log, "MPPTask without receiver set"); + return mpp_receiver_set->getExchangeReceiver(executor_id); +} + +void DAGContext::addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader) +{ + if (!isMPPTask()) + return; + RUNTIME_ASSERT(mpp_receiver_set != nullptr, log, "MPPTask without receiver set"); + return mpp_receiver_set->addCoprocessorReader(coprocessor_reader); } bool DAGContext::containsRegionsInfoForTable(Int64 table_id) const diff --git a/dbms/src/Flash/Coprocessor/DAGContext.h b/dbms/src/Flash/Coprocessor/DAGContext.h index 07b65b2d8fe..7bfc67afcad 100644 --- a/dbms/src/Flash/Coprocessor/DAGContext.h +++ b/dbms/src/Flash/Coprocessor/DAGContext.h @@ -37,8 +37,13 @@ namespace DB class Context; class MPPTunnelSet; class ExchangeReceiver; -using ExchangeReceiverMap = std::unordered_map>; -using ExchangeReceiverMapPtr = std::shared_ptr>>; +using ExchangeReceiverPtr = std::shared_ptr; +/// key: executor_id of ExchangeReceiver nodes in dag. 
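DAGContext::updateFinalConcurrency above is a clamp: raise the current concurrency to at least the incoming stream count, but never past the upper limit. The same arithmetic as a standalone function, verbatim except for the hypothetical name `clampFinalConcurrency`:

```cpp
#include <algorithm>
#include <cstddef>

// Raise the current concurrency to at least cur_streams_size, but never let
// it exceed streams_upper_limit (mirrors DAGContext::updateFinalConcurrency).
std::size_t clampFinalConcurrency(
    std::size_t final_concurrency,
    std::size_t cur_streams_size,
    std::size_t streams_upper_limit)
{
    return std::min(std::max(final_concurrency, cur_streams_size), streams_upper_limit);
}
```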
+using ExchangeReceiverMap = std::unordered_map; +class MPPReceiverSet; +using MPPReceiverSetPtr = std::shared_ptr; +class CoprocessorReader; +using CoprocessorReaderPtr = std::shared_ptr; class Join; using JoinPtr = std::shared_ptr; @@ -111,6 +116,13 @@ constexpr UInt64 NO_ENGINE_SUBSTITUTION = 1ul << 30ul; constexpr UInt64 ALLOW_INVALID_DATES = 1ul << 32ul; } // namespace TiDBSQLMode +inline bool enableFineGrainedShuffle(uint64_t stream_count) +{ + return stream_count > 0; +} + +extern const String enableFineGrainedShuffleExtraInfo; + /// A context used to track the information that needs to be passed around during DAG planning. class DAGContext { @@ -298,17 +310,20 @@ class DAGContext return sql_mode & f; } + void updateFinalConcurrency(size_t cur_streams_size, size_t streams_upper_limit); + bool isTest() const { return is_test; } void setColumnsForTest(std::unordered_map & columns_for_test_map_) { columns_for_test_map = columns_for_test_map_; } ColumnsWithTypeAndName columnsForTest(String executor_id); bool columnsForTestEmpty() { return columns_for_test_map.empty(); } - const std::unordered_map> & getMPPExchangeReceiverMap() const; - void setMPPExchangeReceiverMap(ExchangeReceiverMapPtr & exchange_receiver_map) + ExchangeReceiverPtr getMPPExchangeReceiver(const String & executor_id) const; + void setMPPReceiverSet(const MPPReceiverSetPtr & receiver_set) { - mpp_exchange_receiver_map = exchange_receiver_map; + mpp_receiver_set = receiver_set; } + void addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader); void addSubquery(const String & subquery_id, SubqueryForSet && subquery); bool hasSubquery() const { return !subqueries.empty(); } @@ -343,6 +358,10 @@ class DAGContext std::vector output_field_types; std::vector output_offsets; + /// Hold the order of list based executors. + /// It is used to ensure that execution summaries of list based executors are emitted in the same order as the executors themselves. + std::vector list_based_executors_order; + private: void initExecutorIdToJoinIdMap(); void initOutputInfo(); @@ -350,7 +369,7 @@ class DAGContext private: /// Hold io for correcting the destruction order. BlockIO io; - /// profile_streams_map is a map that maps from executor_id to profile BlockInputStreams + /// profile_streams_map is a map that maps from executor_id to profile BlockInputStreams. std::unordered_map profile_streams_map; /// executor_id_to_join_id_map is a map that maps executor id to all the join executor id of itself and all its children. std::unordered_map> executor_id_to_join_id_map; @@ -369,8 +388,8 @@ class DAGContext ConcurrentBoundedQueue warnings; /// warning_count is the actual warning count during the entire execution std::atomic warning_count; - /// key: executor_id of ExchangeReceiver nodes in dag. - ExchangeReceiverMapPtr mpp_exchange_receiver_map; + + MPPReceiverSetPtr mpp_receiver_set; /// vector of SubqueriesForSets(such as join build subquery). /// The order of the vector is also the order of the subquery. 
std::vector subqueries; diff --git a/dbms/src/Flash/Coprocessor/DAGDriver.cpp b/dbms/src/Flash/Coprocessor/DAGDriver.cpp index 55a2024a8bc..9fe388f8fe4 100644 --- a/dbms/src/Flash/Coprocessor/DAGDriver.cpp +++ b/dbms/src/Flash/Coprocessor/DAGDriver.cpp @@ -72,6 +72,7 @@ DAGDriver::DAGDriver( ::grpc::ServerWriter<::coprocessor::BatchResponse> * writer_, bool internal_) : context(context_) + , dag_response(nullptr) , writer(writer_) , internal(internal_) , log(&Poco::Logger::get("DAGDriver")) @@ -129,7 +130,7 @@ try auto streaming_writer = std::make_shared(writer); TiDB::TiDBCollators collators; - std::unique_ptr response_writer = std::make_unique>( + std::unique_ptr response_writer = std::make_unique>( streaming_writer, std::vector(), collators, @@ -137,7 +138,9 @@ try context.getSettingsRef().dag_records_per_chunk, context.getSettingsRef().batch_send_min_limit, true, - dag_context); + dag_context, + /*fine_grained_shuffle_stream_count=*/0, + /*fine_grained_shuffle_batch_size=*/0); dag_output_stream = std::make_shared(streams.in->getHeader(), std::move(response_writer)); copyData(*streams.in, *dag_output_stream); } diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp index aa269469cdb..5fbd86e9762 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp @@ -1130,30 +1130,40 @@ NamesWithAliases DAGExpressionAnalyzer::appendFinalProjectForRootQueryBlock( const std::vector & output_offsets, const String & column_prefix, bool keep_session_timezone_info) +{ + auto & step = initAndGetLastStep(chain); + + NamesWithAliases final_project = buildFinalProjection(step.actions, schema, output_offsets, column_prefix, keep_session_timezone_info); + + for (const auto & name : final_project) + { + step.required_output.push_back(name.first); + } + return final_project; +} + +NamesWithAliases DAGExpressionAnalyzer::buildFinalProjection( + const ExpressionActionsPtr & actions, + const std::vector & schema, + const std::vector & output_offsets, + const String & column_prefix, + bool keep_session_timezone_info) { if (unlikely(output_offsets.empty())) - throw Exception("Root Query block without output_offsets", ErrorCodes::LOGICAL_ERROR); + throw Exception("DAGRequest without output_offsets", ErrorCodes::LOGICAL_ERROR); bool need_append_timezone_cast = !keep_session_timezone_info && !context.getTimezoneInfo().is_utc_timezone; auto [need_append_type_cast, need_append_type_cast_vec] = isCastRequiredForRootFinalProjection(schema, output_offsets); assert(need_append_type_cast_vec.size() == output_offsets.size()); - auto & step = initAndGetLastStep(chain); - if (need_append_timezone_cast || need_append_type_cast) { // after appendCastForRootFinalProjection, source_columns has been modified. - appendCastForRootFinalProjection(step.actions, schema, output_offsets, need_append_timezone_cast, need_append_type_cast_vec); + appendCastForRootFinalProjection(actions, schema, output_offsets, need_append_timezone_cast, need_append_type_cast_vec); } // generate project aliases from source_columns. 
- NamesWithAliases final_project = genRootFinalProjectAliases(column_prefix, output_offsets); - - for (const auto & name : final_project) - { - step.required_output.push_back(name.first); - } - return final_project; + return genRootFinalProjectAliases(column_prefix, output_offsets); } String DAGExpressionAnalyzer::alignReturnType( diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h index 046088ab2b2..63d35abe26d 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h @@ -102,6 +102,8 @@ class DAGExpressionAnalyzer : private boost::noncopyable ExpressionActionsChain & chain, const String & column_prefix) const; + NamesWithAliases genNonRootFinalProjectAliases(const String & column_prefix) const; + // Generate a project action for root DAGQueryBlock, // to keep the schema of Block and tidb-schema the same. NamesWithAliases appendFinalProjectForRootQueryBlock( @@ -111,6 +113,13 @@ class DAGExpressionAnalyzer : private boost::noncopyable const String & column_prefix, bool keep_session_timezone_info); + NamesWithAliases buildFinalProjection( + const ExpressionActionsPtr & actions, + const std::vector & schema, + const std::vector & output_offsets, + const String & column_prefix, + bool keep_session_timezone_info); + String getActions( const tipb::Expr & expr, const ExpressionActionsPtr & actions, @@ -153,17 +162,38 @@ class DAGExpressionAnalyzer : private boost::noncopyable const tipb::Window & window, size_t window_columns_start_index); -#ifndef DBMS_PUBLIC_GTEST -private: -#endif NamesAndTypes buildOrderColumns( const ExpressionActionsPtr & actions, const ::google::protobuf::RepeatedPtrField & order_by); + String buildFilterColumn( + const ExpressionActionsPtr & actions, + const std::vector & conditions); + + void buildAggFuncs( + const tipb::Aggregation & aggregation, + const ExpressionActionsPtr & actions, + AggregateDescriptions & aggregate_descriptions, + NamesAndTypes & aggregated_columns); + + void buildAggGroupBy( + const google::protobuf::RepeatedPtrField & group_by, + const ExpressionActionsPtr & actions, + AggregateDescriptions & aggregate_descriptions, + NamesAndTypes & aggregated_columns, + Names & aggregation_keys, + std::unordered_set & agg_key_set, + bool group_by_collation_sensitive, + TiDB::TiDBCollators & collators); + void appendCastAfterAgg( const ExpressionActionsPtr & actions, const tipb::Aggregation & agg); +#ifndef DBMS_PUBLIC_GTEST +private: +#endif + String buildTupleFunctionForGroupConcat( const tipb::Expr & expr, SortDescription & sort_desc, @@ -187,22 +217,6 @@ class DAGExpressionAnalyzer : private boost::noncopyable NamesAndTypes & aggregated_columns, bool empty_input_as_null); - void buildAggFuncs( - const tipb::Aggregation & aggregation, - const ExpressionActionsPtr & actions, - AggregateDescriptions & aggregate_descriptions, - NamesAndTypes & aggregated_columns); - - void buildAggGroupBy( - const google::protobuf::RepeatedPtrField & group_by, - const ExpressionActionsPtr & actions, - AggregateDescriptions & aggregate_descriptions, - NamesAndTypes & aggregated_columns, - Names & aggregation_keys, - std::unordered_set & agg_key_set, - bool group_by_collation_sensitive, - TiDB::TiDBCollators & collators); - void fillArgumentDetail( const ExpressionActionsPtr & actions, const tipb::Expr & arg, @@ -275,12 +289,6 @@ class DAGExpressionAnalyzer : private boost::noncopyable const ExpressionActionsPtr & actions, const String & column_name); 
- String buildFilterColumn( - const ExpressionActionsPtr & actions, - const std::vector & conditions); - - NamesWithAliases genNonRootFinalProjectAliases(const String & column_prefix) const; - NamesWithAliases genRootFinalProjectAliases( const String & column_prefix, const std::vector & output_offsets) const; diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp index ee529680d28..23bbb4586b3 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp @@ -450,6 +450,7 @@ DAGExpressionAnalyzerHelper::FunctionBuilderMap DAGExpressionAnalyzerHelper::fun {"bitOr", DAGExpressionAnalyzerHelper::buildBitwiseFunction}, {"bitXor", DAGExpressionAnalyzerHelper::buildBitwiseFunction}, {"bitNot", DAGExpressionAnalyzerHelper::buildBitwiseFunction}, + {"bitShiftRight", DAGExpressionAnalyzerHelper::buildBitwiseFunction}, {"leftUTF8", DAGExpressionAnalyzerHelper::buildLeftUTF8Function}, {"date_add", DAGExpressionAnalyzerHelper::buildDateAddOrSubFunction}, {"date_sub", DAGExpressionAnalyzerHelper::buildDateAddOrSubFunction}, diff --git a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp index 86d6428c92a..764bf07f533 100644 --- a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp +++ b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -268,7 +267,7 @@ void DAGQueryBlockInterpreter::handleJoin(const tipb::Join & join, DAGPipeline & size_t join_build_concurrency = settings.join_concurrent_build ? std::min(max_streams, build_pipeline.streams.size()) : 1; /// build side streams - executeExpression(build_pipeline, build_side_prepare_actions, "append join key and join filters for build side"); + executeExpression(build_pipeline, build_side_prepare_actions, log, "append join key and join filters for build side"); // add a HashJoinBuildBlockInputStream to build a shared hash table auto get_concurrency_build_index = JoinInterpreterHelper::concurrencyBuildIndexGenerator(join_build_concurrency); build_pipeline.transform([&](auto & stream) { @@ -284,7 +283,7 @@ void DAGQueryBlockInterpreter::handleJoin(const tipb::Join & join, DAGPipeline & join_ptr->init(right_query.source->getHeader(), join_build_concurrency); /// probe side streams - executeExpression(probe_pipeline, probe_side_prepare_actions, "append join key and join filters for probe side"); + executeExpression(probe_pipeline, probe_side_prepare_actions, log, "append join key and join filters for probe side"); NamesAndTypes source_columns; for (const auto & p : probe_pipeline.firstStream()->getHeader()) source_columns.emplace_back(p.name, p.type); @@ -347,14 +346,26 @@ void DAGQueryBlockInterpreter::executeWhere(DAGPipeline & pipeline, const Expres void DAGQueryBlockInterpreter::executeWindow( DAGPipeline & pipeline, - WindowDescription & window_description) + WindowDescription & window_description, + bool enable_fine_grained_shuffle) { - executeExpression(pipeline, window_description.before_window, "before window"); + executeExpression(pipeline, window_description.before_window, log, "before window"); - /// If there are several streams, we merge them into one - executeUnion(pipeline, max_streams, log, false, "merge into one for window input"); - assert(pipeline.streams.size() == 1); - pipeline.firstStream() = std::make_shared(pipeline.firstStream(), 
window_description, log->identifier()); + if (enable_fine_grained_shuffle) + { + /// Window function can be multi-threaded when fine grained shuffle is enabled. + pipeline.transform([&](auto & stream) { + stream = std::make_shared(stream, window_description, log->identifier()); + stream->setExtraInfo(enableFineGrainedShuffleExtraInfo); + }); + } + else + { + /// If there are several streams, we merge them into one. + executeUnion(pipeline, max_streams, log, false, "merge into one for window input"); + assert(pipeline.streams.size() == 1); + pipeline.firstStream() = std::make_shared(pipeline.firstStream(), window_description, log->identifier()); + } } void DAGQueryBlockInterpreter::executeAggregation( @@ -365,10 +376,7 @@ void DAGQueryBlockInterpreter::executeAggregation( AggregateDescriptions & aggregate_descriptions, bool is_final_agg) { - pipeline.transform([&](auto & stream) { - stream = std::make_shared(stream, expression_actions_ptr, log->identifier()); - stream->setExtraInfo("before aggregation"); - }); + executeExpression(pipeline, expression_actions_ptr, log, "before aggregation"); Block before_agg_header = pipeline.firstStream()->getHeader(); @@ -383,34 +391,39 @@ void DAGQueryBlockInterpreter::executeAggregation( is_final_agg); /// If there are several sources, then we perform parallel aggregation - if (pipeline.streams.size() > 1) + if (pipeline.streams.size() > 1 || pipeline.streams_with_non_joined_data.size() > 1) { const Settings & settings = context.getSettingsRef(); - BlockInputStreamPtr stream_with_non_joined_data = combinedNonJoinedDataStream(pipeline, max_streams, log); - pipeline.firstStream() = std::make_shared( + BlockInputStreamPtr stream = std::make_shared( pipeline.streams, - stream_with_non_joined_data, + pipeline.streams_with_non_joined_data, params, context.getFileProvider(), true, max_streams, settings.aggregation_memory_efficient_merge_threads ? static_cast(settings.aggregation_memory_efficient_merge_threads) : static_cast(settings.max_threads), log->identifier()); + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::move(stream); + + // should record for agg before restoring concurrency. See #3804. 
recordProfileStreams(pipeline, query_block.aggregation_name); restorePipelineConcurrency(pipeline); } else { - BlockInputStreamPtr stream_with_non_joined_data = combinedNonJoinedDataStream(pipeline, max_streams, log); BlockInputStreams inputs; if (!pipeline.streams.empty()) inputs.push_back(pipeline.firstStream()); - else - pipeline.streams.resize(1); - if (stream_with_non_joined_data) - inputs.push_back(stream_with_non_joined_data); + + if (!pipeline.streams_with_non_joined_data.empty()) + inputs.push_back(pipeline.streams_with_non_joined_data.at(0)); + + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::make_shared( std::make_shared(inputs, log->identifier()), params, @@ -421,56 +434,15 @@ void DAGQueryBlockInterpreter::executeAggregation( } } -void DAGQueryBlockInterpreter::executeExpression(DAGPipeline & pipeline, const ExpressionActionsPtr & expressionActionsPtr, const String & extra_info) -{ - if (!expressionActionsPtr->getActions().empty()) - { - pipeline.transform([&](auto & stream) { - stream = std::make_shared(stream, expressionActionsPtr, log->identifier()); - stream->setExtraInfo(extra_info); - }); - } -} - -void DAGQueryBlockInterpreter::executeWindowOrder(DAGPipeline & pipeline, SortDescription sort_desc) +void DAGQueryBlockInterpreter::executeWindowOrder(DAGPipeline & pipeline, SortDescription sort_desc, bool enable_fine_grained_shuffle) { - orderStreams(pipeline, sort_desc, 0); + orderStreams(pipeline, max_streams, sort_desc, 0, enable_fine_grained_shuffle, context, log); } void DAGQueryBlockInterpreter::executeOrder(DAGPipeline & pipeline, const NamesAndTypes & order_columns) { Int64 limit = query_block.limit_or_topn->topn().limit(); - orderStreams(pipeline, getSortDescription(order_columns, query_block.limit_or_topn->topn().order_by()), limit); -} - -void DAGQueryBlockInterpreter::orderStreams(DAGPipeline & pipeline, SortDescription order_descr, Int64 limit) -{ - const Settings & settings = context.getSettingsRef(); - - pipeline.transform([&](auto & stream) { - auto sorting_stream = std::make_shared(stream, order_descr, log->identifier(), limit); - - /// Limits on sorting - IProfilingBlockInputStream::LocalLimits limits; - limits.mode = IProfilingBlockInputStream::LIMITS_TOTAL; - limits.size_limits = SizeLimits(settings.max_rows_to_sort, settings.max_bytes_to_sort, settings.sort_overflow_mode); - sorting_stream->setLimits(limits); - - stream = sorting_stream; - }); - - /// If there are several streams, we merge them into one - executeUnion(pipeline, max_streams, log, false, "for partial order"); - - /// Merge the sorted blocks. 
- pipeline.firstStream() = std::make_shared( - pipeline.firstStream(), - order_descr, - settings.max_block_size, - limit, - settings.max_bytes_before_external_sort, - context.getTemporaryPath(), - log->identifier()); + orderStreams(pipeline, max_streams, getSortDescription(order_columns, query_block.limit_or_topn->topn().order_by()), limit, false, context, log); } void DAGQueryBlockInterpreter::recordProfileStreams(DAGPipeline & pipeline, const String & key) @@ -481,17 +453,30 @@ void DAGQueryBlockInterpreter::recordProfileStreams(DAGPipeline & pipeline, cons void DAGQueryBlockInterpreter::handleExchangeReceiver(DAGPipeline & pipeline) { - auto it = dagContext().getMPPExchangeReceiverMap().find(query_block.source_name); - if (unlikely(it == dagContext().getMPPExchangeReceiverMap().end())) + auto exchange_receiver = dagContext().getMPPExchangeReceiver(query_block.source_name); + if (unlikely(exchange_receiver == nullptr)) throw Exception("Can not find exchange receiver for " + query_block.source_name, ErrorCodes::LOGICAL_ERROR); // todo choose a more reasonable stream number auto & exchange_receiver_io_input_streams = dagContext().getInBoundIOInputStreamsMap()[query_block.source_name]; - for (size_t i = 0; i < max_streams; ++i) + + const bool enable_fine_grained_shuffle = enableFineGrainedShuffle(exchange_receiver->getFineGrainedShuffleStreamCount()); + String extra_info = "squashing after exchange receiver"; + size_t stream_count = max_streams; + if (enable_fine_grained_shuffle) + { + extra_info += ", " + enableFineGrainedShuffleExtraInfo; + stream_count = std::min(max_streams, exchange_receiver->getFineGrainedShuffleStreamCount()); + } + + for (size_t i = 0; i < stream_count; ++i) { - BlockInputStreamPtr stream = std::make_shared(it->second, log->identifier(), query_block.source_name); + BlockInputStreamPtr stream = std::make_shared(exchange_receiver, + log->identifier(), + query_block.source_name, + /*stream_id=*/enable_fine_grained_shuffle ? 
i : 0); exchange_receiver_io_input_streams.push_back(stream); stream = std::make_shared(stream, 8192, 0, log->identifier()); - stream->setExtraInfo("squashing after exchange receiver"); + stream->setExtraInfo(extra_info); pipeline.streams.push_back(stream); } NamesAndTypes source_columns; @@ -548,15 +533,12 @@ void DAGQueryBlockInterpreter::handleProjection(DAGPipeline & pipeline, const ti output_columns.emplace_back(alias, col.type); project_cols.emplace_back(col.name, alias); } - pipeline.transform([&](auto & stream) { - stream = std::make_shared(stream, chain.getLastActions(), log->identifier()); - stream->setExtraInfo("before projection"); - }); + executeExpression(pipeline, chain.getLastActions(), log, "before projection"); executeProject(pipeline, project_cols, "projection"); analyzer = std::make_unique(std::move(output_columns), context); } -void DAGQueryBlockInterpreter::handleWindow(DAGPipeline & pipeline, const tipb::Window & window) +void DAGQueryBlockInterpreter::handleWindow(DAGPipeline & pipeline, const tipb::Window & window, bool enable_fine_grained_shuffle) { NamesAndTypes input_columns; assert(input_streams_vec.size() == 1); @@ -565,13 +547,13 @@ void DAGQueryBlockInterpreter::handleWindow(DAGPipeline & pipeline, const tipb:: input_columns.emplace_back(p.name, p.type); DAGExpressionAnalyzer dag_analyzer(input_columns, context); WindowDescription window_description = dag_analyzer.buildWindowDescription(window); - executeWindow(pipeline, window_description); - executeExpression(pipeline, window_description.after_window, "cast after window"); + executeWindow(pipeline, window_description, enable_fine_grained_shuffle); + executeExpression(pipeline, window_description.after_window, log, "cast after window"); analyzer = std::make_unique(window_description.after_window_columns, context); } -void DAGQueryBlockInterpreter::handleWindowOrder(DAGPipeline & pipeline, const tipb::Sort & window_sort) +void DAGQueryBlockInterpreter::handleWindowOrder(DAGPipeline & pipeline, const tipb::Sort & window_sort, bool enable_fine_grained_shuffle) { NamesAndTypes input_columns; assert(input_streams_vec.size() == 1); @@ -580,7 +562,7 @@ void DAGQueryBlockInterpreter::handleWindowOrder(DAGPipeline & pipeline, const t input_columns.emplace_back(p.name, p.type); DAGExpressionAnalyzer dag_analyzer(input_columns, context); auto order_columns = dag_analyzer.buildWindowOrderColumns(window_sort); - executeWindowOrder(pipeline, getSortDescription(order_columns, window_sort.byitems())); + executeWindowOrder(pipeline, getSortDescription(order_columns, window_sort.byitems()), enable_fine_grained_shuffle); analyzer = std::make_unique(std::move(input_columns), context); } @@ -628,13 +610,13 @@ void DAGQueryBlockInterpreter::executeImpl(DAGPipeline & pipeline) } else if (query_block.source->tp() == tipb::ExecType::TypeWindow) { - handleWindow(pipeline, query_block.source->window()); + handleWindow(pipeline, query_block.source->window(), enableFineGrainedShuffle(query_block.source->fine_grained_shuffle_stream_count())); recordProfileStreams(pipeline, query_block.source_name); restorePipelineConcurrency(pipeline); } else if (query_block.source->tp() == tipb::ExecType::TypeSort) { - handleWindowOrder(pipeline, query_block.source->sort()); + handleWindowOrder(pipeline, query_block.source->sort(), enableFineGrainedShuffle(query_block.source->fine_grained_shuffle_stream_count())); recordProfileStreams(pipeline, query_block.source_name); } else @@ -678,7 +660,7 @@ void DAGQueryBlockInterpreter::executeImpl(DAGPipeline & 
pipeline) } if (res.before_order_and_select) { - executeExpression(pipeline, res.before_order_and_select, "before order and select"); + executeExpression(pipeline, res.before_order_and_select, log, "before order and select"); } if (!res.order_columns.empty()) @@ -714,10 +696,7 @@ void DAGQueryBlockInterpreter::executeProject(DAGPipeline & pipeline, NamesWithA if (project_cols.empty()) return; ExpressionActionsPtr project = generateProjectExpressionActions(pipeline.firstStream(), context, project_cols); - pipeline.transform([&](auto & stream) { - stream = std::make_shared<ExpressionBlockInputStream>(stream, project, log->identifier()); - stream->setExtraInfo(extra_info); - }); + executeExpression(pipeline, project, log, extra_info); } void DAGQueryBlockInterpreter::executeLimit(DAGPipeline & pipeline) @@ -743,19 +722,47 @@ void DAGQueryBlockInterpreter::handleExchangeSender(DAGPipeline & pipeline) std::vector<Int64> partition_col_ids = ExchangeSenderInterpreterHelper::genPartitionColIds(exchange_sender); TiDB::TiDBCollators partition_col_collators = ExchangeSenderInterpreterHelper::genPartitionColCollators(exchange_sender); int stream_id = 0; - pipeline.transform([&](auto & stream) { - // construct writer - std::unique_ptr<DAGResponseWriter> response_writer = std::make_unique<StreamingDAGResponseWriter<MPPTunnelSetPtr>>( - context.getDAGContext()->tunnel_set, - partition_col_ids, - partition_col_collators, - exchange_sender.tp(), - context.getSettingsRef().dag_records_per_chunk, - context.getSettingsRef().batch_send_min_limit, - stream_id++ == 0, /// only one stream needs to sending execution summaries for the last response - dagContext()); - stream = std::make_shared<ExchangeSenderBlockInputStream>(stream, std::move(response_writer), log->identifier()); - }); + const uint64_t stream_count = query_block.exchange_sender->fine_grained_shuffle_stream_count(); + const uint64_t batch_size = query_block.exchange_sender->fine_grained_shuffle_batch_size(); + + if (enableFineGrainedShuffle(stream_count)) + { + pipeline.transform([&](auto & stream) { + // construct writer + std::unique_ptr<DAGResponseWriter> response_writer = std::make_unique<StreamingDAGResponseWriter<MPPTunnelSetPtr, /*enable_fine_grained_shuffle=*/true>>( + context.getDAGContext()->tunnel_set, + partition_col_ids, + partition_col_collators, + exchange_sender.tp(), + context.getSettingsRef().dag_records_per_chunk, + context.getSettingsRef().batch_send_min_limit, + stream_id++ == 0, /// only one stream needs to send execution summaries for the last response + dagContext(), + stream_count, + batch_size); + stream = std::make_shared<ExchangeSenderBlockInputStream>(stream, std::move(response_writer), log->identifier()); + stream->setExtraInfo(enableFineGrainedShuffleExtraInfo); + }); + RUNTIME_CHECK(exchange_sender.tp() == tipb::ExchangeType::Hash, Exception, "exchange_sender has to be hash partition when fine grained shuffle is enabled"); + RUNTIME_CHECK(stream_count <= 1024, Exception, "fine_grained_shuffle_stream_count should not be greater than 1024"); + } + else + { + pipeline.transform([&](auto & stream) { + std::unique_ptr<DAGResponseWriter> response_writer = std::make_unique<StreamingDAGResponseWriter<MPPTunnelSetPtr, /*enable_fine_grained_shuffle=*/false>>( + context.getDAGContext()->tunnel_set, + partition_col_ids, + partition_col_collators, + exchange_sender.tp(), + context.getSettingsRef().dag_records_per_chunk, + context.getSettingsRef().batch_send_min_limit, + stream_id++ == 0, /// only one stream needs to send execution summaries for the last response + dagContext(), + stream_count, + batch_size); + stream = std::make_shared<ExchangeSenderBlockInputStream>(stream, std::move(response_writer), log->identifier()); + }); + } } void DAGQueryBlockInterpreter::handleMockExchangeSender(DAGPipeline & pipeline) @@ -783,4 +790,4 @@ BlockInputStreams DAGQueryBlockInterpreter::execute() return pipeline.streams; } -}
// namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.h b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.h index e68c4f91cee..c449b37e360 100644 --- a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.h +++ b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.h @@ -64,17 +64,16 @@ class DAGQueryBlockInterpreter void handleExchangeReceiver(DAGPipeline & pipeline); void handleMockExchangeReceiver(DAGPipeline & pipeline); void handleProjection(DAGPipeline & pipeline, const tipb::Projection & projection); - void handleWindow(DAGPipeline & pipeline, const tipb::Window & window); - void handleWindowOrder(DAGPipeline & pipeline, const tipb::Sort & window_sort); + void handleWindow(DAGPipeline & pipeline, const tipb::Window & window, bool enable_fine_grained_shuffle); + void handleWindowOrder(DAGPipeline & pipeline, const tipb::Sort & window_sort, bool enable_fine_grained_shuffle); void executeWhere(DAGPipeline & pipeline, const ExpressionActionsPtr & expressionActionsPtr, String & filter_column, const String & extra_info = ""); - void executeExpression(DAGPipeline & pipeline, const ExpressionActionsPtr & expressionActionsPtr, const String & extra_info = ""); - void executeWindowOrder(DAGPipeline & pipeline, SortDescription sort_desc); - void orderStreams(DAGPipeline & pipeline, SortDescription order_descr, Int64 limit); + void executeWindowOrder(DAGPipeline & pipeline, SortDescription sort_desc, bool enable_fine_grained_shuffle); void executeOrder(DAGPipeline & pipeline, const NamesAndTypes & order_columns); void executeLimit(DAGPipeline & pipeline); void executeWindow( DAGPipeline & pipeline, - WindowDescription & window_description); + WindowDescription & window_description, + bool enable_fine_grained_shuffle); void executeAggregation( DAGPipeline & pipeline, const ExpressionActionsPtr & expression_actions_ptr, diff --git a/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp b/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp index 882699e1599..d68a7b17aaa 100644 --- a/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp +++ b/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp @@ -20,6 +20,26 @@ namespace DB { +namespace +{ +void fillOrderForListBasedExecutors(DAGContext & dag_context, const DAGQueryBlock & query_block) +{ + assert(query_block.source); + auto & list_based_executors_order = dag_context.list_based_executors_order; + list_based_executors_order.push_back(query_block.source_name); + if (query_block.selection) + list_based_executors_order.push_back(query_block.selection_name); + if (query_block.aggregation) + list_based_executors_order.push_back(query_block.aggregation_name); + if (query_block.having) + list_based_executors_order.push_back(query_block.having_name); + if (query_block.limit_or_topn) + list_based_executors_order.push_back(query_block.limit_or_topn_name); + if (query_block.exchange_sender) + dag_context.list_based_executors_order.push_back(query_block.exchange_sender_name); +} +} // namespace + DAGQuerySource::DAGQuerySource(Context & context_) : context(context_) { @@ -32,6 +52,9 @@ DAGQuerySource::DAGQuerySource(Context & context_) else { root_query_block = std::make_shared(1, dag_request.executors()); + auto & dag_context = getDAGContext(); + if (!dag_context.return_executor_id) + fillOrderForListBasedExecutors(dag_context, *root_query_block); } } diff --git a/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp b/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp index 53bebc91da8..33f6d99f9d8 100644 --- 
a/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp +++ b/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp @@ -89,12 +89,10 @@ void DAGResponseWriter::addExecuteSummaries(tipb::SelectResponse & response, boo } } - /// add execution_summary for local executor - for (auto & p : dag_context.getProfileStreamsMap()) - { + auto fill_execution_summary = [&](const String & executor_id, const BlockInputStreams & streams) { ExecutionSummary current; /// part 1: local execution info - for (auto & stream_ptr : p.second) + for (const auto & stream_ptr : streams) { if (auto * p_stream = dynamic_cast(stream_ptr.get())) { @@ -105,16 +103,16 @@ void DAGResponseWriter::addExecuteSummaries(tipb::SelectResponse & response, boo current.concurrency++; } /// part 2: remote execution info - if (merged_remote_execution_summaries.find(p.first) != merged_remote_execution_summaries.end()) + if (merged_remote_execution_summaries.find(executor_id) != merged_remote_execution_summaries.end()) { - for (auto & remote : merged_remote_execution_summaries[p.first]) + for (auto & remote : merged_remote_execution_summaries[executor_id]) current.merge(remote, false); } /// part 3: for join need to add the build time /// In TiFlash, a hash join's build side is finished before probe side starts, /// so the join probe side's running time does not include hash table's build time, /// when construct ExecSummaries, we need add the build cost to probe executor - auto all_join_id_it = dag_context.getExecutorIdToJoinIdMap().find(p.first); + auto all_join_id_it = dag_context.getExecutorIdToJoinIdMap().find(executor_id); if (all_join_id_it != dag_context.getExecutorIdToJoinIdMap().end()) { for (const auto & join_executor_id : all_join_id_it->second) @@ -138,8 +136,27 @@ void DAGResponseWriter::addExecuteSummaries(tipb::SelectResponse & response, boo } current.time_processed_ns += dag_context.compile_time_ns; - fillTiExecutionSummary(response.add_execution_summaries(), current, p.first, delta_mode); + fillTiExecutionSummary(response.add_execution_summaries(), current, executor_id, delta_mode); + }; + + /// add execution_summary for local executor + if (dag_context.return_executor_id) + { + for (auto & p : dag_context.getProfileStreamsMap()) + fill_execution_summary(p.first, p.second); + } + else + { + const auto & profile_streams_map = dag_context.getProfileStreamsMap(); + assert(profile_streams_map.size() == dag_context.list_based_executors_order.size()); + for (const auto & executor_id : dag_context.list_based_executors_order) + { + auto it = profile_streams_map.find(executor_id); + assert(it != profile_streams_map.end()); + fill_execution_summary(executor_id, it->second); + } } + for (auto & p : merged_remote_execution_summaries) { if (local_executors.find(p.first) == local_executors.end()) diff --git a/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp b/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp index 14cddd94730..390ce7b9948 100644 --- a/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp +++ b/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp @@ -486,7 +486,8 @@ void DAGStorageInterpreter::buildRemoteStreams(std::vector && rem std::vector tasks(all_tasks.begin() + task_start, all_tasks.begin() + task_end); auto coprocessor_reader = std::make_shared(schema, cluster, tasks, has_enforce_encode_type, 1); - BlockInputStreamPtr input = std::make_shared(coprocessor_reader, log->identifier(), table_scan.getTableScanExecutorID()); + context.getDAGContext()->addCoprocessorReader(coprocessor_reader); + BlockInputStreamPtr input = 
std::make_shared(coprocessor_reader, log->identifier(), table_scan.getTableScanExecutorID(), /*stream_id=*/0); pipeline.streams.push_back(input); task_start = task_end; } diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index 9ffa29cd14d..2003103a20a 100644 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -332,7 +332,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::DecimalIsFalseWithNull, "isFalseWithNull"}, //{tipb::ScalarFuncSig::LeftShift, "cast"}, - //{tipb::ScalarFuncSig::RightShift, "cast"}, + {tipb::ScalarFuncSig::RightShift, "bitShiftRight"}, //{tipb::ScalarFuncSig::BitCount, "cast"}, //{tipb::ScalarFuncSig::GetParamString, "cast"}, @@ -513,7 +513,7 @@ const std::unordered_map scalar_func_map({ //{tipb::ScalarFuncSig::YearWeekWithMode, "cast"}, //{tipb::ScalarFuncSig::YearWeekWithoutMode, "cast"}, - //{tipb::ScalarFuncSig::GetFormat, "cast"}, + {tipb::ScalarFuncSig::GetFormat, "getFormat"}, {tipb::ScalarFuncSig::SysDateWithFsp, "sysDateWithFsp"}, {tipb::ScalarFuncSig::SysDateWithoutFsp, "sysDateWithoutFsp"}, //{tipb::ScalarFuncSig::CurrentDate, "cast"}, @@ -561,7 +561,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::Quarter, "toQuarter"}, //{tipb::ScalarFuncSig::SecToTime, "cast"}, - //{tipb::ScalarFuncSig::TimeToSec, "cast"}, + {tipb::ScalarFuncSig::TimeToSec, "tidbTimeToSec"}, //{tipb::ScalarFuncSig::TimestampAdd, "cast"}, {tipb::ScalarFuncSig::ToDays, "tidbToDays"}, {tipb::ScalarFuncSig::ToSeconds, "tidbToSeconds"}, @@ -648,8 +648,8 @@ const std::unordered_map scalar_func_map({ //{tipb::ScalarFuncSig::Quote, "cast"}, //{tipb::ScalarFuncSig::Repeat, "cast"}, {tipb::ScalarFuncSig::Replace, "replaceAll"}, - //{tipb::ScalarFuncSig::ReverseUTF8, "cast"}, - //{tipb::ScalarFuncSig::Reverse, "cast"}, + {tipb::ScalarFuncSig::ReverseUTF8, "reverseUTF8"}, + {tipb::ScalarFuncSig::Reverse, "reverse"}, {tipb::ScalarFuncSig::RightUTF8, "rightUTF8"}, //{tipb::ScalarFuncSig::Right, "cast"}, {tipb::ScalarFuncSig::RpadUTF8, "rpadUTF8"}, diff --git a/dbms/src/Flash/Coprocessor/DecodeDetail.h b/dbms/src/Flash/Coprocessor/DecodeDetail.h index 9bad0ca2b72..91851650d9e 100644 --- a/dbms/src/Flash/Coprocessor/DecodeDetail.h +++ b/dbms/src/Flash/Coprocessor/DecodeDetail.h @@ -21,8 +21,12 @@ namespace DB /// Detail of the packet that decoding in TiRemoteInputStream.RemoteReader.decodeChunks() struct DecodeDetail { + // For fine grained shuffle, each ExchangeReceiver/thread will decode its own blocks. + // So this is the row number of partial blocks of the original packet. + // This will be the row number of all blocks of the original packet if it's not fine grained shuffle. Int64 rows = 0; - // byte size of origin packet. + + // Total byte size of the origin packet, even for fine grained shuffle. 
Int64 packet_bytes = 0; }; -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Flash/Coprocessor/GenSchemaAndColumn.cpp b/dbms/src/Flash/Coprocessor/GenSchemaAndColumn.cpp index be3475f714f..efb8a08f1d8 100644 --- a/dbms/src/Flash/Coprocessor/GenSchemaAndColumn.cpp +++ b/dbms/src/Flash/Coprocessor/GenSchemaAndColumn.cpp @@ -54,4 +54,15 @@ ColumnsWithTypeAndName getColumnWithTypeAndName(const NamesAndTypes & names_and_ } return column_with_type_and_names; } -} // namespace DB \ No newline at end of file + +NamesAndTypes toNamesAndTypes(const DAGSchema & dag_schema) +{ + NamesAndTypes names_and_types; + for (const auto & col : dag_schema) + { + auto tp = getDataTypeByColumnInfoForComputingLayer(col.second); + names_and_types.emplace_back(col.first, tp); + } + return names_and_types; +} +} // namespace DB diff --git a/dbms/src/Flash/Coprocessor/GenSchemaAndColumn.h b/dbms/src/Flash/Coprocessor/GenSchemaAndColumn.h index 617f69de925..96f202d800e 100644 --- a/dbms/src/Flash/Coprocessor/GenSchemaAndColumn.h +++ b/dbms/src/Flash/Coprocessor/GenSchemaAndColumn.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -23,4 +24,5 @@ namespace DB { NamesAndTypes genNamesAndTypes(const TiDBTableScan & table_scan); ColumnsWithTypeAndName getColumnWithTypeAndName(const NamesAndTypes & names_and_types); -} // namespace DB \ No newline at end of file +NamesAndTypes toNamesAndTypes(const DAGSchema & dag_schema); +} // namespace DB diff --git a/dbms/src/Flash/Coprocessor/InterpreterUtils.cpp b/dbms/src/Flash/Coprocessor/InterpreterUtils.cpp index c9810454218..002a06d07b9 100644 --- a/dbms/src/Flash/Coprocessor/InterpreterUtils.cpp +++ b/dbms/src/Flash/Coprocessor/InterpreterUtils.cpp @@ -12,8 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include +#include #include #include +#include #include #include @@ -39,32 +43,6 @@ void restoreConcurrency( } } -BlockInputStreamPtr combinedNonJoinedDataStream( - DAGPipeline & pipeline, - size_t max_threads, - const LoggerPtr & log, - bool ignore_block) -{ - BlockInputStreamPtr ret = nullptr; - if (pipeline.streams_with_non_joined_data.size() == 1) - ret = pipeline.streams_with_non_joined_data.at(0); - else if (pipeline.streams_with_non_joined_data.size() > 1) - { - if (ignore_block) - { - ret = std::make_shared(pipeline.streams_with_non_joined_data, nullptr, max_threads, log->identifier()); - ret->setExtraInfo("combine non joined(ignore block)"); - } - else - { - ret = std::make_shared(pipeline.streams_with_non_joined_data, nullptr, max_threads, log->identifier()); - ret->setExtraInfo("combine non joined"); - } - } - pipeline.streams_with_non_joined_data.clear(); - return ret; -} - void executeUnion( DAGPipeline & pipeline, size_t max_streams, @@ -72,21 +50,33 @@ void executeUnion( bool ignore_block, const String & extra_info) { - if (pipeline.streams.size() == 1 && pipeline.streams_with_non_joined_data.empty()) - return; - auto non_joined_data_stream = combinedNonJoinedDataStream(pipeline, max_streams, log, ignore_block); - if (!pipeline.streams.empty()) + switch (pipeline.streams.size() + pipeline.streams_with_non_joined_data.size()) + { + case 0: + break; + case 1: { + if (pipeline.streams.size() == 1) + break; + // streams_with_non_joined_data's size is 1. 
+ pipeline.streams.push_back(pipeline.streams_with_non_joined_data.at(0)); + pipeline.streams_with_non_joined_data.clear(); + break; + } + default: + { + BlockInputStreamPtr stream; if (ignore_block) - pipeline.firstStream() = std::make_shared(pipeline.streams, non_joined_data_stream, max_streams, log->identifier()); + stream = std::make_shared(pipeline.streams, pipeline.streams_with_non_joined_data, max_streams, log->identifier()); else - pipeline.firstStream() = std::make_shared(pipeline.streams, non_joined_data_stream, max_streams, log->identifier()); - pipeline.firstStream()->setExtraInfo(extra_info); + stream = std::make_shared(pipeline.streams, pipeline.streams_with_non_joined_data, max_streams, log->identifier()); + stream->setExtraInfo(extra_info); + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::move(stream); + break; } - else if (non_joined_data_stream != nullptr) - { - pipeline.streams.push_back(non_joined_data_stream); } } @@ -102,4 +92,77 @@ ExpressionActionsPtr generateProjectExpressionActions( project->add(ExpressionAction::project(project_cols)); return project; } + +void executeExpression( + DAGPipeline & pipeline, + const ExpressionActionsPtr & expr_actions, + const LoggerPtr & log, + const String & extra_info) +{ + if (expr_actions && !expr_actions->getActions().empty()) + { + pipeline.transform([&](auto & stream) { + stream = std::make_shared(stream, expr_actions, log->identifier()); + stream->setExtraInfo(extra_info); + }); + } +} + +void orderStreams( + DAGPipeline & pipeline, + size_t max_streams, + SortDescription order_descr, + Int64 limit, + bool enable_fine_grained_shuffle, + const Context & context, + const LoggerPtr & log) +{ + const Settings & settings = context.getSettingsRef(); + String extra_info; + if (enable_fine_grained_shuffle) + extra_info = enableFineGrainedShuffleExtraInfo; + + pipeline.transform([&](auto & stream) { + auto sorting_stream = std::make_shared(stream, order_descr, log->identifier(), limit); + + /// Limits on sorting + IProfilingBlockInputStream::LocalLimits limits; + limits.mode = IProfilingBlockInputStream::LIMITS_TOTAL; + limits.size_limits = SizeLimits(settings.max_rows_to_sort, settings.max_bytes_to_sort, settings.sort_overflow_mode); + sorting_stream->setLimits(limits); + + stream = sorting_stream; + stream->setExtraInfo(extra_info); + }); + + if (enable_fine_grained_shuffle) + { + pipeline.transform([&](auto & stream) { + stream = std::make_shared( + stream, + order_descr, + settings.max_block_size, + limit, + settings.max_bytes_before_external_sort, + context.getTemporaryPath(), + log->identifier()); + stream->setExtraInfo(enableFineGrainedShuffleExtraInfo); + }); + } + else + { + /// If there are several streams, we merge them into one + executeUnion(pipeline, max_streams, log, false, "for partial order"); + + /// Merge the sorted blocks. 
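To see why the fine grained branch of `orderStreams` may sort each stream independently while the branch below must first union and then merge: with fine grained shuffle, each stream already holds a disjoint subset of the partition keys (that is the invariant fine grained shuffle provides), and window functions only need order within a partition. A toy model of the two topologies, using plain `std::vector<int>` in place of blocks (an assumption for illustration, not the real API):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

using Stream = std::vector<int>;

// Toy model: with fine grained shuffle, sorting per stream is enough because
// streams hold disjoint key subsets; otherwise all streams must be merged into
// one totally ordered stream first.
std::vector<Stream> sortStreams(std::vector<Stream> streams, bool fine_grained)
{
    if (fine_grained)
    {
        for (auto & s : streams) // one partial-sort + merge-sort chain per stream in the real code
            std::sort(s.begin(), s.end());
        return streams;
    }
    Stream merged; // executeUnion + a single MergeSorting step in the real code
    for (const auto & s : streams)
        merged.insert(merged.end(), s.begin(), s.end());
    std::sort(merged.begin(), merged.end());
    return {merged};
}

int main()
{
    auto out = sortStreams({{3, 1}, {4, 2}}, /*fine_grained=*/true);
    assert(out.size() == 2 && out[0] == Stream({1, 3}));
    out = sortStreams({{3, 1}, {4, 2}}, /*fine_grained=*/false);
    assert(out.size() == 1 && out[0] == Stream({1, 2, 3, 4}));
}
```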
+ pipeline.firstStream() = std::make_shared( + pipeline.firstStream(), + order_descr, + settings.max_block_size, + limit, + settings.max_bytes_before_external_sort, + context.getTemporaryPath(), + log->identifier()); + } +} } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/InterpreterUtils.h b/dbms/src/Flash/Coprocessor/InterpreterUtils.h index 5c4d4721d5e..bd64346718c 100644 --- a/dbms/src/Flash/Coprocessor/InterpreterUtils.h +++ b/dbms/src/Flash/Coprocessor/InterpreterUtils.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include @@ -44,4 +45,19 @@ ExpressionActionsPtr generateProjectExpressionActions( const BlockInputStreamPtr & stream, const Context & context, const NamesWithAliases & project_cols); + +void executeExpression( + DAGPipeline & pipeline, + const ExpressionActionsPtr & expr_actions, + const LoggerPtr & log, + const String & extra_info = ""); + +void orderStreams( + DAGPipeline & pipeline, + size_t max_streams, + SortDescription order_descr, + Int64 limit, + bool enable_fine_grained_shuffle, + const Context & context, + const LoggerPtr & log); } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp index f915653fe96..a72dfcc16ef 100644 --- a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp +++ b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp @@ -23,6 +23,8 @@ #include #include +#include + namespace DB { namespace ErrorCodes @@ -37,8 +39,8 @@ inline void serializeToPacket(mpp::MPPDataPacket & packet, const tipb::SelectRes throw Exception(fmt::format("Fail to serialize response, response size: {}", response.ByteSizeLong())); } -template -StreamingDAGResponseWriter::StreamingDAGResponseWriter( +template +StreamingDAGResponseWriter::StreamingDAGResponseWriter( StreamWriterPtr writer_, std::vector partition_col_ids_, TiDB::TiDBCollators collators_, @@ -46,7 +48,9 @@ StreamingDAGResponseWriter::StreamingDAGResponseWriter( Int64 records_per_chunk_, Int64 batch_send_min_limit_, bool should_send_exec_summary_at_last_, - DAGContext & dag_context_) + DAGContext & dag_context_, + uint64_t fine_grained_shuffle_stream_count_, + UInt64 fine_grained_shuffle_batch_size_) : DAGResponseWriter(records_per_chunk_, dag_context_) , batch_send_min_limit(batch_send_min_limit_) , should_send_exec_summary_at_last(should_send_exec_summary_at_last_) @@ -54,6 +58,8 @@ StreamingDAGResponseWriter::StreamingDAGResponseWriter( , writer(writer_) , partition_col_ids(std::move(partition_col_ids_)) , collators(std::move(collators_)) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) + , fine_grained_shuffle_batch_size(fine_grained_shuffle_batch_size_) { rows_in_blocks = 0; partition_num = writer_->getPartitionNum(); @@ -71,17 +77,37 @@ StreamingDAGResponseWriter::StreamingDAGResponseWriter( } } -template -void StreamingDAGResponseWriter::finishWrite() +template +void StreamingDAGResponseWriter::finishWrite() { if (should_send_exec_summary_at_last) - batchWrite(); + { + if constexpr (enable_fine_grained_shuffle) + { + assert(exchange_type == tipb::ExchangeType::Hash); + batchWriteFineGrainedShuffle(); + } + else + { + batchWrite(); + } + } else - batchWrite(); + { + if constexpr (enable_fine_grained_shuffle) + { + assert(exchange_type == tipb::ExchangeType::Hash); + batchWriteFineGrainedShuffle(); + } + else + { + batchWrite(); + } + } } -template -void StreamingDAGResponseWriter::write(const Block & block) +template +void 
StreamingDAGResponseWriter::write(const Block & block) { if (block.columns() != dag_context.result_field_types.size()) throw TiFlashException("Output column size mismatch with field type size", Errors::Coprocessor::Internal); @@ -91,15 +117,23 @@ void StreamingDAGResponseWriter::write(const Block & block) { blocks.push_back(block); } - if (static_cast(rows_in_blocks) > (dag_context.encode_type == tipb::EncodeType::TypeCHBlock ? batch_send_min_limit : records_per_chunk - 1)) + + if constexpr (enable_fine_grained_shuffle) { - batchWrite(); + assert(exchange_type == tipb::ExchangeType::Hash); + if (static_cast(rows_in_blocks) >= fine_grained_shuffle_batch_size) + batchWriteFineGrainedShuffle(); + } + else + { + if (static_cast(rows_in_blocks) > (dag_context.encode_type == tipb::EncodeType::TypeCHBlock ? batch_send_min_limit : records_per_chunk - 1)) + batchWrite(); } } -template +template template -void StreamingDAGResponseWriter::encodeThenWriteBlocks( +void StreamingDAGResponseWriter::encodeThenWriteBlocks( const std::vector & input_blocks, tipb::SelectResponse & response) const { @@ -191,133 +225,238 @@ void StreamingDAGResponseWriter::encodeThenWriteBlocks( } } -/// hash exchanging data among only TiFlash nodes. -template + +template template -void StreamingDAGResponseWriter::partitionAndEncodeThenWriteBlocks( - std::vector & input_blocks, - tipb::SelectResponse & response) const +void StreamingDAGResponseWriter::batchWrite() { - std::vector packet(partition_num); - - std::vector responses_row_count(partition_num); + tipb::SelectResponse response; + if constexpr (send_exec_summary_at_last) + addExecuteSummaries(response, !dag_context.isMPPTask() || dag_context.isRootMPPTask()); + if (exchange_type == tipb::ExchangeType::Hash) + { + partitionAndEncodeThenWriteBlocks(blocks, response); + } + else + { + encodeThenWriteBlocks(blocks, response); + } + blocks.clear(); + rows_in_blocks = 0; +} +template +template +void StreamingDAGResponseWriter::handleExecSummary( + const std::vector & input_blocks, + std::vector & packet, + tipb::SelectResponse & response) const +{ if constexpr (send_exec_summary_at_last) { /// Sending the response to only one node, default the first one. serializeToPacket(packet[0], response); - } - if (input_blocks.empty()) - { - if constexpr (send_exec_summary_at_last) + // No need to send data when blocks are not empty, + // because exec_summary will be sent together with blocks. 
+ if (input_blocks.empty()) { for (auto part_id = 0; part_id < partition_num; ++part_id) { writer->write(packet[part_id], part_id); } } - return; } +} - // partition tuples in blocks - // 1) compute partition id - // 2) partition each row - // 3) encode each chunk and send it - std::vector partition_key_containers(collators.size()); - for (auto & block : input_blocks) +template +template +void StreamingDAGResponseWriter::writePackets(const std::vector & responses_row_count, + std::vector & packets) const +{ + for (size_t part_id = 0; part_id < packets.size(); ++part_id) { - std::vector dest_blocks(partition_num); - std::vector dest_tbl_cols(partition_num); - - for (size_t i = 0; i < block.columns(); ++i) + if constexpr (send_exec_summary_at_last) { - if (ColumnPtr converted = block.getByPosition(i).column->convertToFullColumnIfConst()) - { - block.getByPosition(i).column = converted; - } + writer->write(packets[part_id], part_id); } - - for (auto i = 0; i < partition_num; ++i) + else { - dest_tbl_cols[i] = block.cloneEmptyColumns(); - dest_blocks[i] = block.cloneEmpty(); + if (responses_row_count[part_id] > 0) + writer->write(packets[part_id], part_id); } + } +} - size_t rows = block.rows(); - WeakHash32 hash(rows); - - // get hash values by all partition key columns - for (size_t i = 0; i < partition_col_ids.size(); i++) +inline void initInputBlocks(std::vector & input_blocks) +{ + for (auto & input_block : input_blocks) + { + for (size_t i = 0; i < input_block.columns(); ++i) { - block.getByPosition(partition_col_ids[i]).column->updateWeakHash32(hash, collators[i], partition_key_containers[i]); + if (ColumnPtr converted = input_block.getByPosition(i).column->convertToFullColumnIfConst()) + input_block.getByPosition(i).column = converted; } - const auto & hash_data = hash.getData(); + } +} - // partition each row - IColumn::Selector selector(rows); - for (size_t row = 0; row < rows; ++row) - { - /// Row from interval [(2^32 / partition_num) * i, (2^32 / partition_num) * (i + 1)) goes to bucket with number i. - selector[row] = hash_data[row]; /// [0, 2^32) - selector[row] *= partition_num; /// [0, partition_num * 2^32), selector stores 64 bit values. 
- selector[row] >>= 32u; /// [0, partition_num) - } +inline void initDestColumns(const Block & input_block, std::vector & dest_tbl_cols) +{ + for (auto & cols : dest_tbl_cols) + { + cols = input_block.cloneEmptyColumns(); + } +} - for (size_t col_id = 0; col_id < block.columns(); ++col_id) - { - // Scatter columns to different partitions - auto scattered_columns = block.getByPosition(col_id).column->scatter(partition_num, selector); - for (size_t part_id = 0; part_id < partition_num; ++part_id) - { - dest_tbl_cols[part_id][col_id] = std::move(scattered_columns[part_id]); - } - } - // serialize each partitioned block and write it to its destination - for (auto part_id = 0; part_id < partition_num; ++part_id) - { - dest_blocks[part_id].setColumns(std::move(dest_tbl_cols[part_id])); - responses_row_count[part_id] += dest_blocks[part_id].rows(); - chunk_codec_stream->encode(dest_blocks[part_id], 0, dest_blocks[part_id].rows()); - packet[part_id].add_chunks(chunk_codec_stream->getString()); - chunk_codec_stream->clear(); - } +void computeHash(const Block & input_block, + uint32_t bucket_num, + const TiDB::TiDBCollators & collators, + std::vector & partition_key_containers, + const std::vector & partition_col_ids, + std::vector> & result_columns) +{ + size_t rows = input_block.rows(); + WeakHash32 hash(rows); + + // get hash values by all partition key columns + for (size_t i = 0; i < partition_col_ids.size(); ++i) + { + input_block.getByPosition(partition_col_ids[i]).column->updateWeakHash32(hash, collators[i], partition_key_containers[i]); } - for (auto part_id = 0; part_id < partition_num; ++part_id) + const auto & hash_data = hash.getData(); + + // partition each row + IColumn::Selector selector(rows); + for (size_t row = 0; row < rows; ++row) { - if constexpr (send_exec_summary_at_last) + /// Row from interval [(2^32 / bucket_num) * i, (2^32 / bucket_num) * (i + 1)) goes to bucket with number i. + selector[row] = hash_data[row]; /// [0, 2^32) + selector[row] *= bucket_num; /// [0, bucket_num * 2^32), selector stores 64 bit values. + selector[row] >>= 32u; /// [0, bucket_num) + } + + for (size_t col_id = 0; col_id < input_block.columns(); ++col_id) + { + // Scatter columns to different partitions + std::vector part_columns = input_block.getByPosition(col_id).column->scatter(bucket_num, selector); + assert(part_columns.size() == bucket_num); + for (size_t bucket_idx = 0; bucket_idx < bucket_num; ++bucket_idx) { - writer->write(packet[part_id], part_id); + result_columns[bucket_idx][col_id] = std::move(part_columns[bucket_idx]); } - else + } +} + +/// Hash exchanging data among only TiFlash nodes. Only be called when enable_fine_grained_shuffle is false. 
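Stepping back to `computeHash` above: the selector arithmetic maps a 32-bit hash onto `bucket_num` buckets with a multiply-and-shift instead of a modulo. The same arithmetic in isolation (demo constants only):

```cpp
#include <cassert>
#include <cstdint>

// Map a 32-bit hash in [0, 2^32) to a bucket in [0, bucket_num) via
// multiply-then-shift: (hash * bucket_num) / 2^32, avoiding '%'.
uint64_t bucketOf(uint32_t hash, uint32_t bucket_num)
{
    uint64_t selector = hash; // [0, 2^32)
    selector *= bucket_num;   // [0, bucket_num * 2^32), needs 64 bits
    selector >>= 32u;         // [0, bucket_num)
    return selector;
}

int main()
{
    assert(bucketOf(0, 8) == 0);
    assert(bucketOf(0xFFFFFFFFu, 8) == 7); // top of the range lands in the last bucket
    assert(bucketOf(0x20000000u, 8) == 1); // 2^29 * 8 / 2^32 == 1
}
```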
+template +template +void StreamingDAGResponseWriter::partitionAndEncodeThenWriteBlocks( + std::vector & input_blocks, + tipb::SelectResponse & response) const +{ + static_assert(!enable_fine_grained_shuffle); + std::vector packet(partition_num); + std::vector responses_row_count(partition_num); + handleExecSummary(input_blocks, packet, response); + if (input_blocks.empty()) + return; + + initInputBlocks(input_blocks); + Block dest_block = input_blocks[0].cloneEmpty(); + std::vector partition_key_containers(collators.size()); + for (const auto & block : input_blocks) + { + std::vector dest_tbl_cols(partition_num); + initDestColumns(block, dest_tbl_cols); + + computeHash(block, partition_num, collators, partition_key_containers, partition_col_ids, dest_tbl_cols); + + for (size_t part_id = 0; part_id < partition_num; ++part_id) { - if (responses_row_count[part_id] > 0) - writer->write(packet[part_id], part_id); + dest_block.setColumns(std::move(dest_tbl_cols[part_id])); + responses_row_count[part_id] += dest_block.rows(); + chunk_codec_stream->encode(dest_block, 0, dest_block.rows()); + packet[part_id].add_chunks(chunk_codec_stream->getString()); + chunk_codec_stream->clear(); } } + + writePackets(responses_row_count, packet); } -template +/// Hash exchanging data among only TiFlash nodes. Only be called when enable_fine_grained_shuffle is true. +template template -void StreamingDAGResponseWriter::batchWrite() +void StreamingDAGResponseWriter::batchWriteFineGrainedShuffle() { + static_assert(enable_fine_grained_shuffle); + assert(exchange_type == tipb::ExchangeType::Hash); + assert(fine_grained_shuffle_stream_count <= 1024); + tipb::SelectResponse response; if constexpr (send_exec_summary_at_last) addExecuteSummaries(response, !dag_context.isMPPTask() || dag_context.isRootMPPTask()); - if (exchange_type == tipb::ExchangeType::Hash) - { - partitionAndEncodeThenWriteBlocks(blocks, response); - } - else + + std::vector packet(partition_num); + std::vector responses_row_count(partition_num, 0); + + // fine_grained_shuffle_stream_count is in [0, 1024], and partition_num is uint16_t, so will not overflow. + uint32_t bucket_num = partition_num * fine_grained_shuffle_stream_count; + handleExecSummary(blocks, packet, response); + if (!blocks.empty()) { - encodeThenWriteBlocks(blocks, response); + std::vector final_dest_tbl_columns(bucket_num); + initInputBlocks(blocks); + initDestColumns(blocks[0], final_dest_tbl_columns); + + // Hash partition input_blocks into bucket_num. + for (const auto & block : blocks) + { + std::vector partition_key_containers(collators.size()); + std::vector dest_tbl_columns(bucket_num); + initDestColumns(block, dest_tbl_columns); + computeHash(block, bucket_num, collators, partition_key_containers, partition_col_ids, dest_tbl_columns); + for (size_t bucket_idx = 0; bucket_idx < bucket_num; ++bucket_idx) + { + for (size_t col_id = 0; col_id < block.columns(); ++col_id) + { + const MutableColumnPtr & src_col = dest_tbl_columns[bucket_idx][col_id]; + final_dest_tbl_columns[bucket_idx][col_id]->insertRangeFrom(*src_col, 0, src_col->size()); + } + } + } + + // For i-th stream_count buckets, send to i-th tiflash node. 
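As the comment above says, buckets are laid out partition-major: consecutive groups of `fine_grained_shuffle_stream_count` buckets all go to one downstream node, and the offset within a group selects the receiver stream. The index arithmetic in isolation (illustrative names, not the real API):

```cpp
#include <cassert>
#include <cstdint>

struct BucketTarget
{
    uint64_t part_id;   // which downstream TiFlash node (MPP partition)
    uint64_t stream_id; // which receiver stream on that node
};

// bucket_idx ranges over [0, partition_num * stream_count); the layout is
// partition-major, matching the loop below.
BucketTarget targetOf(uint64_t bucket_idx, uint64_t stream_count)
{
    return {bucket_idx / stream_count, bucket_idx % stream_count};
}

int main()
{
    // partition_num = 4, stream_count = 8, the numbers used by the unit test further down.
    assert(targetOf(0, 8).part_id == 0 && targetOf(0, 8).stream_id == 0);
    assert(targetOf(9, 8).part_id == 1 && targetOf(9, 8).stream_id == 1);
    assert(targetOf(31, 8).part_id == 3 && targetOf(31, 8).stream_id == 7);
}
```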
+ for (size_t bucket_idx = 0; bucket_idx < bucket_num; bucket_idx += fine_grained_shuffle_stream_count) + { + size_t part_id = bucket_idx / fine_grained_shuffle_stream_count; // NOLINT(clang-analyzer-core.DivideZero) + size_t row_count_per_part = 0; + for (uint64_t stream_idx = 0; stream_idx < fine_grained_shuffle_stream_count; ++stream_idx) + { + Block dest_block = blocks[0].cloneEmpty(); + // For now we put all rows into one Block, which may make this Block too large. + dest_block.setColumns(std::move(final_dest_tbl_columns[bucket_idx + stream_idx])); + row_count_per_part += dest_block.rows(); + + chunk_codec_stream->encode(dest_block, 0, dest_block.rows()); + packet[part_id].add_chunks(chunk_codec_stream->getString()); + packet[part_id].add_stream_ids(stream_idx); + chunk_codec_stream->clear(); + } + responses_row_count[part_id] = row_count_per_part; + } + } + + writePackets(responses_row_count, packet); + + blocks.clear(); + rows_in_blocks = 0; } -template class StreamingDAGResponseWriter<MPPTunnelSetPtr>; -template class StreamingDAGResponseWriter<StreamWriterPtr>; +template class StreamingDAGResponseWriter<MPPTunnelSetPtr, /*enable_fine_grained_shuffle=*/true>; +template class StreamingDAGResponseWriter<MPPTunnelSetPtr, /*enable_fine_grained_shuffle=*/false>; +template class StreamingDAGResponseWriter<StreamWriterPtr, /*enable_fine_grained_shuffle=*/true>; +template class StreamingDAGResponseWriter<StreamWriterPtr, /*enable_fine_grained_shuffle=*/false>; } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h index 9b5e3864c64..cd7559d1e79 100644 --- a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h +++ b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h @@ -33,7 +33,7 @@ namespace DB /// Serializes the stream of blocks and sends them to TiDB or TiFlash with different serialization paths. /// When sending data to TiDB, blocks with extra info are written into tipb::SelectResponse, then the whole tipb::SelectResponse is further serialized into mpp::MPPDataPacket.data. /// Differently when sending data to TiFlash, blocks with only tuples are directly serialized into mpp::MPPDataPacket.chunks, but for the last block, its extra info (like execution summaries) is written into tipb::SelectResponse, then further serialized into mpp::MPPDataPacket.data. -template <class StreamWriterPtr> +template <class StreamWriterPtr, bool enable_fine_grained_shuffle> class StreamingDAGResponseWriter : public DAGResponseWriter { public: @@ -45,18 +45,30 @@ class StreamingDAGResponseWriter : public DAGResponseWriter Int64 records_per_chunk_, Int64 batch_send_min_limit_, bool should_send_exec_summary_at_last, - DAGContext & dag_context_); + DAGContext & dag_context_, + UInt64 fine_grained_shuffle_stream_count_, + UInt64 fine_grained_shuffle_batch_size); void write(const Block & block) override; void finishWrite() override; private: template <bool send_exec_summary_at_last> void batchWrite(); + template <bool send_exec_summary_at_last> + void batchWriteFineGrainedShuffle(); + template <bool send_exec_summary_at_last> void encodeThenWriteBlocks(const std::vector<Block> & input_blocks, tipb::SelectResponse & response) const; template <bool send_exec_summary_at_last> void partitionAndEncodeThenWriteBlocks(std::vector<Block> & input_blocks, tipb::SelectResponse & response) const; + template <bool send_exec_summary_at_last> + void handleExecSummary(const std::vector<Block> & input_blocks, + std::vector<mpp::MPPDataPacket> & packet, + tipb::SelectResponse & response) const; + template <bool send_exec_summary_at_last> + void writePackets(const std::vector<size_t> & responses_row_count, std::vector<mpp::MPPDataPacket> & packets) const; + Int64 batch_send_min_limit; bool should_send_exec_summary_at_last; /// only one stream needs to send execution summaries at last.
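The header change above turns `enable_fine_grained_shuffle` into a template parameter, so the choice between `batchWrite` and `batchWriteFineGrainedShuffle` is made once at compile time via `if constexpr` rather than on every call. The dispatch pattern reduced to its core:

```cpp
#include <iostream>

// Same shape as StreamingDAGResponseWriter's dispatch: a bool template
// parameter picks the batch routine at compile time, so the hot path
// carries no runtime flag checks.
template <bool enable_fine_grained_shuffle>
void flush()
{
    if constexpr (enable_fine_grained_shuffle)
        std::cout << "batchWriteFineGrainedShuffle\n";
    else
        std::cout << "batchWrite\n";
}

int main()
{
    flush<true>();  // instantiated only with the fine grained path
    flush<false>(); // instantiated only with the classic path
}
```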
tipb::ExchangeType exchange_type; @@ -67,6 +79,8 @@ class StreamingDAGResponseWriter : public DAGResponseWriter size_t rows_in_blocks; uint16_t partition_num; std::unique_ptr<ChunkCodecStream> chunk_codec_stream; + UInt64 fine_grained_shuffle_stream_count; + UInt64 fine_grained_shuffle_batch_size; }; } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/TiDBColumn.cpp b/dbms/src/Flash/Coprocessor/TiDBColumn.cpp index 7183374a5c1..eef89696d3a 100644 --- a/dbms/src/Flash/Coprocessor/TiDBColumn.cpp +++ b/dbms/src/Flash/Coprocessor/TiDBColumn.cpp @@ -28,7 +28,7 @@ template <typename T> void encodeLittleEndian(const T & value, WriteBuffer & ss) { auto v = toLittleEndian(value); - ss.write(reinterpret_cast<const char *>(&v), sizeof(v)); + ss.template writeFixed<T>(&v); } TiDBColumn::TiDBColumn(Int8 element_len_) @@ -141,10 +141,10 @@ void TiDBColumn::append(const TiDBDecimal & decimal) encodeLittleEndian(decimal.digits_int, *data); encodeLittleEndian(decimal.digits_frac, *data); encodeLittleEndian(decimal.result_frac, *data); - encodeLittleEndian((UInt8)decimal.negative, *data); - for (int i = 0; i < MAX_WORD_BUF_LEN; i++) + encodeLittleEndian(static_cast<UInt8>(decimal.negative), *data); + for (int i : decimal.word_buf) { - encodeLittleEndian(decimal.word_buf[i], *data); + encodeLittleEndian(i, *data); } finishAppendFixed(); } diff --git a/dbms/src/Flash/Coprocessor/tests/gtest_streaming_dag_writer.cpp b/dbms/src/Flash/Coprocessor/tests/gtest_streaming_dag_writer.cpp new file mode 100644 index 00000000000..5d4186123b7 --- /dev/null +++ b/dbms/src/Flash/Coprocessor/tests/gtest_streaming_dag_writer.cpp @@ -0,0 +1,184 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ +namespace tests +{ + +using BlockPtr = std::shared_ptr<Block>; +class TestStreamingDAGResponseWriter : public testing::Test +{ +protected: + void SetUp() override + { + dag_context_ptr = std::make_unique<DAGContext>(1024); + dag_context_ptr->encode_type = tipb::EncodeType::TypeCHBlock; + dag_context_ptr->is_mpp_task = true; + dag_context_ptr->is_root_mpp_task = false; + dag_context_ptr->result_field_types = makeFields(); + context.setDAGContext(dag_context_ptr.get()); + } + +public: + TestStreamingDAGResponseWriter() + : context(TiFlashTestEnv::getContext()) + , part_col_ids{0} + , part_col_collators{ + TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY)} + {} + + // Return 10 Int64 columns. + static std::vector<tipb::FieldType> makeFields() + { + std::vector<tipb::FieldType> fields(10); + for (int i = 0; i < 10; ++i) + { + fields[i].set_tp(TiDB::TypeLongLong); + } + return fields; + } + + // Return a block containing the given **rows** in each of its 10 Int64 columns.
+ static BlockPtr prepareBlock(const std::vector<Int64> & rows) + { + BlockPtr block = std::make_shared<Block>(); + for (int i = 0; i < 10; ++i) + { + DataTypePtr int64_data_type = std::make_shared<DataTypeInt64>(); + DataTypePtr nullable_int64_data_type = std::make_shared<DataTypeNullable>(int64_data_type); + MutableColumnPtr int64_col = nullable_int64_data_type->createColumn(); + for (Int64 r : rows) + { + int64_col->insert(Field(r)); + } + block->insert(ColumnWithTypeAndName{std::move(int64_col), + nullable_int64_data_type, + String("col") + std::to_string(i)}); + } + return block; + } + + Context context; + std::vector<Int64> part_col_ids; + TiDB::TiDBCollators part_col_collators; + + std::unique_ptr<DAGContext> dag_context_ptr; +}; + +using MockStreamWriterChecker = std::function<void(mpp::MPPDataPacket &, uint16_t)>; + +struct MockStreamWriter +{ + MockStreamWriter(MockStreamWriterChecker checker_, + uint16_t part_num_) + : checker(checker_) + , part_num(part_num_) + {} + + void write(mpp::MPPDataPacket &) { FAIL() << "cannot reach here, because we only expect hash partition"; } + void write(mpp::MPPDataPacket & packet, uint16_t part_id) { checker(packet, part_id); } + void write(tipb::SelectResponse &, uint16_t) { FAIL() << "cannot reach here, only consider CH Block format"; } + void write(tipb::SelectResponse &) { FAIL() << "cannot reach here, only consider CH Block format"; } + uint16_t getPartitionNum() const { return part_num; } + +private: + MockStreamWriterChecker checker; + uint16_t part_num; +}; + +// Input block data is distributed uniformly. +// partition_num: 4 +// fine_grained_shuffle_stream_count: 8 +TEST_F(TestStreamingDAGResponseWriter, testBatchWriteFineGrainedShuffle) +try +{ + const size_t block_rows = 1024; + const uint16_t part_num = 4; + const uint32_t fine_grained_shuffle_stream_count = 8; + const Int64 fine_grained_shuffle_batch_size = 4096; + + // Set these to 1, because when fine grained shuffle is enabled, + // batchWriteFineGrainedShuffle() only checks fine_grained_shuffle_batch_size. + // records_per_chunk and batch_send_min_limit are useless. + const Int64 records_per_chunk = 1; + const Int64 batch_send_min_limit = 1; + const bool should_send_exec_summary_at_last = true; + + // 1. Build Block. + std::vector<Int64> uniform_data_set; + for (size_t i = 0; i < block_rows; ++i) + { + uniform_data_set.push_back(i); + } + BlockPtr block = prepareBlock(uniform_data_set); + + // 2. Build MockStreamWriter. + std::unordered_map<uint16_t, mpp::MPPDataPacket> write_report; + auto checker = [&write_report](mpp::MPPDataPacket & packet, uint16_t part_id) { + auto res = write_report.insert({part_id, packet}); + // The insert should always succeed. + // Because block.rows(1024) < fine_grained_shuffle_batch_size(4096), + // batchWriteFineGrainedShuffle() is only called once, so there will only be one packet for each partition. + ASSERT_TRUE(res.second); + }; + auto mock_writer = std::make_shared<MockStreamWriter>(checker, part_num); + + // 3. Start to write. + auto dag_writer = std::make_shared<StreamingDAGResponseWriter<std::shared_ptr<MockStreamWriter>, /*enable_fine_grained_shuffle=*/true>>( + mock_writer, + part_col_ids, + part_col_collators, + tipb::ExchangeType::Hash, + records_per_chunk, + batch_send_min_limit, + should_send_exec_summary_at_last, + *dag_context_ptr, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size); + dag_writer->write(*block); + dag_writer->finishWrite(); + + // 4. Start to check write_report.
+ std::vector<Block> decoded_blocks; + ASSERT_EQ(write_report.size(), part_num); + for (const auto & ele : write_report) + { + const mpp::MPPDataPacket & packet = ele.second; + ASSERT_EQ(packet.chunks_size(), packet.stream_ids_size()); + for (int i = 0; i < packet.chunks_size(); ++i) + { + decoded_blocks.push_back(CHBlockChunkCodec::decode(packet.chunks(i), *block)); + } + } + ASSERT_EQ(decoded_blocks.size(), fine_grained_shuffle_stream_count * part_num); + for (const auto & block : decoded_blocks) + { + ASSERT_EQ(block.rows(), block_rows / (fine_grained_shuffle_stream_count * part_num)); + } +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/EstablishCall.cpp b/dbms/src/Flash/EstablishCall.cpp index 8af81e30962..2f8c7c15f56 100644 --- a/dbms/src/Flash/EstablishCall.cpp +++ b/dbms/src/Flash/EstablishCall.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include @@ -19,6 +20,11 @@ namespace DB { +namespace FailPoints +{ +extern const char random_tunnel_init_rpc_failure_failpoint[]; +} // namespace FailPoints + EstablishCallData::EstablishCallData(AsyncFlashService * service, grpc::ServerCompletionQueue * cq, grpc::ServerCompletionQueue * notify_cq, const std::shared_ptr<std::atomic<bool>> & is_shutdown) : service(service) , cq(cq) @@ -71,6 +77,7 @@ void EstablishCallData::initRpc() std::exception_ptr eptr = nullptr; try { + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_tunnel_init_rpc_failure_failpoint); service->establishMPPConnectionSyncOrAsync(&ctx, &request, nullptr, this); } catch (...) @@ -136,7 +143,7 @@ void EstablishCallData::finishTunnelAndResponder() state = FINISH; if (mpp_tunnel) { - mpp_tunnel->consumerFinish("grpc writes failed.", true); //trigger mpp tunnel finish work + mpp_tunnel->consumerFinish(fmt::format("{}: finishTunnelAndResponder called.", mpp_tunnel->id()), true); //trigger mpp tunnel finish work } grpc::Status status(static_cast<grpc::StatusCode>(GRPC_STATUS_UNKNOWN), "Consumer exits unexpected, grpc writes failed."); responder.Finish(status, this); diff --git a/dbms/src/Flash/Mpp/ExchangeReceiver.cpp b/dbms/src/Flash/Mpp/ExchangeReceiver.cpp index f194afee31f..ab8d83a1481 100644 --- a/dbms/src/Flash/Mpp/ExchangeReceiver.cpp +++ b/dbms/src/Flash/Mpp/ExchangeReceiver.cpp @@ -13,6 +13,8 @@ // limitations under the License. #include +#include +#include #include #include #include @@ -22,6 +24,12 @@ namespace DB { +namespace FailPoints +{ +extern const char random_receiver_sync_msg_push_failure_failpoint[]; +extern const char random_receiver_async_msg_push_failure_failpoint[]; +} // namespace FailPoints + namespace { String getReceiverStateStr(const ExchangeReceiverState & s) @@ -41,6 +49,106 @@ String getReceiverStateStr(const ExchangeReceiverState & s) } } +// If enable_fine_grained_shuffle: +// Separate chunks according to packet.stream_ids[i], then push to msg_channels[stream_id]. +// If fine grained shuffle is disabled: +// Push all chunks to msg_channels[0]. +// Return true if all pushes succeed, otherwise return false. +// NOTE: shared_ptr will be held by all ExchangeReceiverBlockInputStreams to keep the chunk pointers valid.
+template <bool enable_fine_grained_shuffle, bool is_sync> +bool pushPacket(size_t source_index, + const String & req_info, + MPPDataPacketPtr & packet, + const std::vector & msg_channels, + LoggerPtr & log) +{ + bool push_succeed = true; + + const mpp::Error * error_ptr = nullptr; + if (packet->has_error()) + error_ptr = &packet->error(); + const String * resp_ptr = nullptr; + if (!packet->data().empty()) + resp_ptr = &packet->data(); + + if constexpr (enable_fine_grained_shuffle) + { + std::vector<std::vector<const String *>> chunks(msg_channels.size()); + if (!packet->chunks().empty()) + { + // Packet not empty. + if (unlikely(packet->stream_ids().empty())) + { + // Fine grained shuffle is enabled on the receiver, but the sender didn't enable it. We cannot handle this, so return an error. + // This can happen when there are old-version nodes during an upgrade. + LOG_FMT_ERROR(log, "MPPDataPacket.stream_ids empty, it means ExchangeSender is old version of binary " + "(source_index: {}) while fine grained shuffle of ExchangeReceiver is enabled. " + "Cannot handle this.", + source_index); + return false; + } + // packet.stream_ids[i] corresponds to packet.chunks[i], + // indicating which stream_id this chunk belongs to. + assert(packet->chunks_size() == packet->stream_ids_size()); + + for (int i = 0; i < packet->stream_ids_size(); ++i) + { + UInt64 stream_id = packet->stream_ids(i) % msg_channels.size(); + chunks[stream_id].push_back(&packet->chunks(i)); + } + } + // Still need to send error_ptr or resp_ptr even if packet.chunks_size() is zero. + for (size_t i = 0; i < msg_channels.size() && push_succeed; ++i) + { + if (resp_ptr == nullptr && error_ptr == nullptr && chunks[i].empty()) + continue; + + std::shared_ptr<ReceivedMessage> recv_msg = std::make_shared<ReceivedMessage>( + source_index, + req_info, + packet, + error_ptr, + resp_ptr, + std::move(chunks[i])); + push_succeed = msg_channels[i]->push(std::move(recv_msg)); + if constexpr (is_sync) + fiu_do_on(FailPoints::random_receiver_sync_msg_push_failure_failpoint, push_succeed = false;); + else + fiu_do_on(FailPoints::random_receiver_async_msg_push_failure_failpoint, push_succeed = false;); + + // Only the first ExchangeReceiverInputStream needs to handle resp.
+ resp_ptr = nullptr; + } + } + else + { + std::vector chunks(packet->chunks_size()); + for (int i = 0; i < packet->chunks_size(); ++i) + { + chunks[i] = &packet->chunks(i); + } + + if (!(resp_ptr == nullptr && error_ptr == nullptr && chunks.empty())) + { + std::shared_ptr recv_msg = std::make_shared( + source_index, + req_info, + packet, + error_ptr, + resp_ptr, + std::move(chunks)); + + push_succeed = msg_channels[0]->push(std::move(recv_msg)); + if constexpr (is_sync) + fiu_do_on(FailPoints::random_receiver_sync_msg_push_failure_failpoint, push_succeed = false;); + else + fiu_do_on(FailPoints::random_receiver_async_msg_push_failure_failpoint, push_succeed = false;); + } + } + LOG_FMT_DEBUG(log, "push recv_msg to msg_channels(size: {}) succeed:{}, enable_fine_grained_shuffle: {}", msg_channels.size(), push_succeed, enable_fine_grained_shuffle); + return push_succeed; +} + enum class AsyncRequestStage { NEED_INIT, @@ -57,25 +165,25 @@ using TimePoint = Clock::time_point; constexpr Int32 max_retry_times = 10; constexpr Int32 batch_packet_count = 16; -template +template class AsyncRequestHandler : public UnaryCallback { public: using Status = typename RPCContext::Status; using Request = typename RPCContext::Request; using AsyncReader = typename RPCContext::AsyncReader; - using Self = AsyncRequestHandler; + using Self = AsyncRequestHandler; AsyncRequestHandler( MPMCQueue * queue, - MPMCQueue> * msg_channel_, + std::vector * msg_channels_, const std::shared_ptr & context, const Request & req, const String & req_id) : rpc_context(context) , request(&req) , notify_queue(queue) - , msg_channel(msg_channel_) + , msg_channels(msg_channels_) , req_info(fmt::format("tunnel{}+{}", req.send_task_id, req.recv_task_id)) , log(Logger::get("ExchangeReceiver", req_id, req_info)) { @@ -253,11 +361,7 @@ class AsyncRequestHandler : public UnaryCallback for (size_t i = 0; i < read_packet_index; ++i) { auto & packet = packets[i]; - auto recv_msg = std::make_shared(); - recv_msg->packet = std::move(packet); - recv_msg->source_index = request->source_index; - recv_msg->req_info = req_info; - if (!msg_channel->push(std::move(recv_msg))) + if (!pushPacket(request->source_index, req_info, packet, *msg_channels, log)) return false; // can't reuse packet since it is sent to readers. 
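For reference, the routing rule in `pushPacket` above is just a modulo over the channel vector: chunk `i` goes to channel `stream_ids(i) % msg_channels.size()`. A toy version of the grouping, using plain indices instead of protobuf chunks (hypothetical names):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Group chunk indices by target channel, the way pushPacket groups
// packet.chunks(i) using packet.stream_ids(i) % msg_channels.size().
std::vector<std::vector<int>> routeChunks(const std::vector<uint64_t> & stream_ids, size_t channel_count)
{
    std::vector<std::vector<int>> per_channel(channel_count);
    for (size_t i = 0; i < stream_ids.size(); ++i)
        per_channel[stream_ids[i] % channel_count].push_back(static_cast<int>(i));
    return per_channel;
}

int main()
{
    auto routed = routeChunks({0, 1, 2, 9}, 8); // stream id 9 wraps to channel 1
    assert(routed[0].size() == 1 && routed[1].size() == 2 && routed[2].size() == 1);
}
```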
packet = std::make_shared(); @@ -274,7 +378,7 @@ class AsyncRequestHandler : public UnaryCallback std::shared_ptr rpc_context; const Request * request; // won't be null MPMCQueue * notify_queue; // won't be null - MPMCQueue> * msg_channel; // won't be null + std::vector * msg_channels; // won't be null String req_info; bool meet_error = false; @@ -299,20 +403,32 @@ ExchangeReceiverBase::ExchangeReceiverBase( size_t source_num_, size_t max_streams_, const String & req_id, - const String & executor_id) + const String & executor_id, + uint64_t fine_grained_shuffle_stream_count_) : rpc_context(std::move(rpc_context_)) , source_num(source_num_) , max_streams(max_streams_) , max_buffer_size(std::max(batch_packet_count, std::max(source_num, max_streams_) * 2)) , thread_manager(newThreadManager()) - , msg_channel(max_buffer_size) , live_connections(source_num) , state(ExchangeReceiverState::NORMAL) , exc_log(Logger::get("ExchangeReceiver", req_id, executor_id)) , collected(false) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) { try { + if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count_)) + { + for (size_t i = 0; i < max_streams_; ++i) + { + msg_channels.push_back(std::make_unique>>(max_buffer_size)); + } + } + else + { + msg_channels.push_back(std::make_unique>>(max_buffer_size)); + } rpc_context->fillSchema(schema); setUpConnection(); } @@ -349,14 +465,14 @@ template void ExchangeReceiverBase::cancel() { setEndState(ExchangeReceiverState::CANCELED); - msg_channel.finish(); + cancelAllMsgChannels(); } template void ExchangeReceiverBase::close() { setEndState(ExchangeReceiverState::CLOSED); - msg_channel.finish(); + finishAllMsgChannels(); } template @@ -371,7 +487,12 @@ void ExchangeReceiverBase::setUpConnection() async_requests.push_back(std::move(req)); else { - thread_manager->schedule(true, "Receiver", [this, req = std::move(req)] { readLoop(req); }); + thread_manager->schedule(true, "Receiver", [this, req = std::move(req)] { + if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count)) + readLoop(req); + else + readLoop(req); + }); ++thread_count; } } @@ -379,15 +500,21 @@ void ExchangeReceiverBase::setUpConnection() // TODO: reduce this thread in the future. 
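The constructor above sizes the channel vector to match: one MPMC channel per local read stream when fine grained shuffle is on, otherwise a single shared channel. Schematically (assuming `enableFineGrainedShuffle(n)` simply means `n > 0`, which this hunk does not show):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Mirrors the sizing rule in ExchangeReceiverBase's constructor;
// the n > 0 test stands in for enableFineGrainedShuffle() (an assumption).
size_t channelCount(uint64_t fine_grained_shuffle_stream_count, size_t max_streams)
{
    return fine_grained_shuffle_stream_count > 0 ? max_streams : 1;
}

int main()
{
    assert(channelCount(8, 16) == 16); // one channel per local read stream
    assert(channelCount(0, 16) == 1);  // single shared channel, as before the patch
}
```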
     if (!async_requests.empty())
     {
-        thread_manager->schedule(true, "RecvReactor", [this, async_requests = std::move(async_requests)] { reactor(async_requests); });
+        thread_manager->schedule(true, "RecvReactor", [this, async_requests = std::move(async_requests)] {
+            if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count))
+                reactor<true>(async_requests);
+            else
+                reactor<false>(async_requests);
+        });
         ++thread_count;
     }
 }
 
 template <typename RPCContext>
+template <bool enable_fine_grained_shuffle>
 void ExchangeReceiverBase<RPCContext>::reactor(const std::vector<Request> & async_requests)
 {
-    using AsyncHandler = AsyncRequestHandler<RPCContext>;
+    using AsyncHandler = AsyncRequestHandler<RPCContext, enable_fine_grained_shuffle>;
 
     GET_METRIC(tiflash_thread_count, type_threads_of_receiver_reactor).Increment();
     SCOPE_EXIT({
@@ -403,7 +530,7 @@ void ExchangeReceiverBase<RPCContext>::reactor(const std::vector<Request> & asyn
     std::vector<std::unique_ptr<AsyncHandler>> handlers;
     handlers.reserve(alive_async_connections);
     for (const auto & req : async_requests)
-        handlers.emplace_back(std::make_unique<AsyncHandler>(&ready_requests, &msg_channel, rpc_context, req, exc_log->identifier()));
+        handlers.emplace_back(std::make_unique<AsyncHandler>(&ready_requests, &msg_channels, rpc_context, req, exc_log->identifier()));
 
     while (alive_async_connections > 0)
     {
@@ -415,7 +542,7 @@ void ExchangeReceiverBase<RPCContext>::reactor(const std::vector<Request> & asyn
         for (Int32 i = 0; i < check_waiting_requests_freq; ++i)
         {
             AsyncHandler * handler = nullptr;
-            if (unlikely(!ready_requests.tryPop(handler, timeout)))
+            if (unlikely(!ready_requests.popTimeout(handler, timeout)))
                 break;
 
             handler->handle();
@@ -448,6 +575,7 @@ void ExchangeReceiverBase<RPCContext>::reactor(const std::vector<Request> & asyn
 }
 
 template <typename RPCContext>
+template <bool enable_fine_grained_shuffle>
 void ExchangeReceiverBase<RPCContext>::readLoop(const Request & req)
 {
     GET_METRIC(tiflash_thread_count, type_threads_of_receiver_read_loop).Increment();
@@ -472,18 +600,15 @@ void ExchangeReceiverBase<RPCContext>::readLoop(const Request & req)
     for (;;)
     {
         LOG_FMT_TRACE(log, "begin next ");
-        auto recv_msg = std::make_shared<ReceivedMessage>();
-        recv_msg->packet = std::make_shared<MPPDataPacket>();
-        recv_msg->req_info = req_info;
-        recv_msg->source_index = req.source_index;
-        bool success = reader->read(recv_msg->packet);
+        MPPDataPacketPtr packet = std::make_shared<MPPDataPacket>();
+        bool success = reader->read(packet);
         if (!success)
             break;
         has_data = true;
-        if (recv_msg->packet->has_error())
-            throw Exception("Exchange receiver meet error : " + recv_msg->packet->error().msg());
+        if (packet->has_error())
+            throw Exception("Exchange receiver meet error : " + packet->error().msg());
 
-        if (!msg_channel.push(std::move(recv_msg)))
+        if (!pushPacket(req.source_index, req_info, packet, msg_channels, log))
         {
             meet_error = true;
             auto local_state = getState();
@@ -553,15 +678,15 @@ DecodeDetail ExchangeReceiverBase<RPCContext>::decodeChunks(
     assert(recv_msg != nullptr);
     DecodeDetail detail;
 
-    int chunk_size = recv_msg->packet->chunks_size();
-    if (chunk_size == 0)
+    if (recv_msg->chunks.empty())
         return detail;
 
+    // Record total packet size even if fine grained shuffle is enabled.
     detail.packet_bytes = recv_msg->packet->ByteSizeLong();
-    /// ExchangeReceiverBase should receive chunks of TypeCHBlock
-    for (int i = 0; i < chunk_size; ++i)
+
+    for (const String * chunk : recv_msg->chunks)
     {
-        Block block = CHBlockChunkCodec::decode(recv_msg->packet->chunks(i), header);
+        Block block = CHBlockChunkCodec::decode(*chunk, header);
         detail.rows += block.rows();
         if (unlikely(block.rows() == 0))
             continue;
@@ -571,10 +696,15 @@ DecodeDetail ExchangeReceiverBase<RPCContext>::decodeChunks(
 }
 
 template <typename RPCContext>
-ExchangeReceiverResult ExchangeReceiverBase<RPCContext>::nextResult(std::queue<Block> & block_queue, const Block & header)
+ExchangeReceiverResult ExchangeReceiverBase<RPCContext>::nextResult(std::queue<Block> & block_queue, const Block & header, size_t stream_id)
 {
+    if (unlikely(stream_id >= msg_channels.size()))
+    {
+        LOG_FMT_ERROR(exc_log, "stream_id out of range, stream_id: {}, total_stream_count: {}", stream_id, msg_channels.size());
+        return {nullptr, 0, "", true, "stream_id out of range", false};
+    }
     std::shared_ptr<ReceivedMessage> recv_msg;
-    if (!msg_channel.pop(recv_msg))
+    if (!msg_channels[stream_id]->pop(recv_msg))
     {
         std::unique_lock lock(mu);
 
@@ -596,29 +726,32 @@ ExchangeReceiverResult ExchangeReceiverBase<RPCContext>::nextResult(std::queue<
-    assert(recv_msg != nullptr && recv_msg->packet != nullptr);
+    assert(recv_msg != nullptr);
     ExchangeReceiverResult result;
-    if (recv_msg->packet->has_error())
+    if (recv_msg->error_ptr != nullptr)
     {
-        result = {nullptr, recv_msg->source_index, recv_msg->req_info, true, recv_msg->packet->error().msg(), false};
+        result = {nullptr, recv_msg->source_index, recv_msg->req_info, true, recv_msg->error_ptr->msg(), false};
     }
     else
     {
-        if (!recv_msg->packet->data().empty()) /// the data of the last packet is serialized from tipb::SelectResponse including execution summaries.
+        if (recv_msg->resp_ptr != nullptr) /// the data of the last packet is serialized from tipb::SelectResponse including execution summaries.
         {
-            auto resp_ptr = std::make_shared<tipb::SelectResponse>();
-            if (!resp_ptr->ParseFromString(recv_msg->packet->data()))
+            auto select_resp = std::make_shared<tipb::SelectResponse>();
+            if (!select_resp->ParseFromString(*(recv_msg->resp_ptr)))
             {
                 result = {nullptr, recv_msg->source_index, recv_msg->req_info, true, "decode error", false};
             }
             else
             {
-                result = {resp_ptr, recv_msg->source_index, recv_msg->req_info, false, "", false};
-                /// If mocking TiFlash as TiDB, here should decode chunks from resp_ptr.
-                if (!resp_ptr->chunks().empty())
+                result = {select_resp, recv_msg->source_index, recv_msg->req_info, false, "", false};
+                /// If mocking TiFlash as TiDB, here should decode chunks from select_resp.
+                if (!select_resp->chunks().empty())
                 {
-                    assert(recv_msg->packet->chunks().empty());
-                    result.decode_detail = CoprocessorReader::decodeChunks(resp_ptr, block_queue, header, schema);
+                    assert(recv_msg->chunks.empty());
+                    // Fine grained shuffle should only be enabled when sending data to TiFlash node.
+                    // So all data should be encoded into MPPDataPacket.chunks.
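+                    // (Sketch of the assumed packet layout: rows bound for TiFlash
+                    // travel in MPPDataPacket.chunks, i.e. packet.chunks(i) holds
+                    // CHBlock-encoded data, while tipb::SelectResponse.chunks is only
+                    // used on the TiDB-bound path, which never enables fine grained
+                    // shuffle -- hence the RUNTIME_CHECK below.)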
+                    RUNTIME_CHECK(!enableFineGrainedShuffle(fine_grained_shuffle_stream_count), Exception, "Data should not be encoded into tipb::SelectResponse.chunks when fine grained shuffle is enabled");
+                    result.decode_detail = CoprocessorReader::decodeChunks(select_resp, block_queue, header, schema);
                 }
             }
         }
@@ -626,7 +759,7 @@ ExchangeReceiverResult ExchangeReceiverBase<RPCContext>::nextResult(std::queue<
             result = {nullptr, recv_msg->source_index, recv_msg->req_info, false, "", false};
         }
-        if (!result.meet_error && !recv_msg->packet->chunks().empty())
+        if (!result.meet_error && !recv_msg->chunks.empty())
         {
             assert(result.decode_detail.rows == 0);
             result.decode_detail = decodeChunks(recv_msg, block_queue, header);
@@ -688,7 +821,21 @@ void ExchangeReceiverBase<RPCContext>::connectionDone(
         throw Exception("live_connections should not be less than 0!");
 
     if (meet_error || copy_live_conn == 0)
-        msg_channel.finish();
+        finishAllMsgChannels();
+}
+
+template <typename RPCContext>
+void ExchangeReceiverBase<RPCContext>::finishAllMsgChannels()
+{
+    for (auto & msg_channel : msg_channels)
+        msg_channel->finish();
+}
+
+template <typename RPCContext>
+void ExchangeReceiverBase<RPCContext>::cancelAllMsgChannels()
+{
+    for (auto & msg_channel : msg_channels)
+        msg_channel->cancel();
 }
 
 /// Explicit template instantiations - to avoid code bloat in headers.
diff --git a/dbms/src/Flash/Mpp/ExchangeReceiver.h b/dbms/src/Flash/Mpp/ExchangeReceiver.h
index 830dc6241a9..708f133f226 100644
--- a/dbms/src/Flash/Mpp/ExchangeReceiver.h
+++ b/dbms/src/Flash/Mpp/ExchangeReceiver.h
@@ -35,9 +35,28 @@ namespace DB
 {
 struct ReceivedMessage
 {
-    std::shared_ptr<mpp::MPPDataPacket> packet;
-    size_t source_index = 0;
+    size_t source_index;
     String req_info;
+    // The shared_ptr is copied to make sure error_ptr, resp_ptr and chunks remain valid.
+    const std::shared_ptr<mpp::MPPDataPacket> packet;
+    const mpp::Error * error_ptr;
+    const String * resp_ptr;
+    std::vector<const String *> chunks;
+
+    // Constructor that moves the chunks.
+    ReceivedMessage(size_t source_index_,
+                    const String & req_info_,
+                    const std::shared_ptr<mpp::MPPDataPacket> & packet_,
+                    const mpp::Error * error_ptr_,
+                    const String * resp_ptr_,
+                    std::vector<const String *> && chunks_)
+        : source_index(source_index_)
+        , req_info(req_info_)
+        , packet(packet_)
+        , error_ptr(error_ptr_)
+        , resp_ptr(resp_ptr_)
+        , chunks(chunks_)
+    {}
 };
 
 struct ExchangeReceiverResult
@@ -78,6 +97,7 @@ enum class ExchangeReceiverState
     CLOSED,
 };
 
+using MsgChannelPtr = std::unique_ptr<MPMCQueue<std::shared_ptr<ReceivedMessage>>>;
 
 template <typename RPCContext>
 class ExchangeReceiverBase
 {
@@ -92,7 +112,8 @@ class ExchangeReceiverBase
         size_t source_num_,
         size_t max_streams_,
         const String & req_id,
-        const String & executor_id);
+        const String & executor_id,
+        uint64_t fine_grained_shuffle_stream_count);
 
     ~ExchangeReceiverBase();
 
@@ -104,9 +125,11 @@ class ExchangeReceiverBase
     ExchangeReceiverResult nextResult(
         std::queue<Block> & block_queue,
-        const Block & header);
+        const Block & header,
+        size_t stream_id);
 
     size_t getSourceNum() const { return source_num; }
+    uint64_t getFineGrainedShuffleStreamCount() const { return fine_grained_shuffle_stream_count; }
 
     int computeNewThreadCount() const { return thread_count; }
 
@@ -128,7 +151,10 @@ class ExchangeReceiverBase
     using Request = typename RPCContext::Request;
 
     void setUpConnection();
+    // Template argument enable_fine_grained_shuffle will be set up properly in setUpConnection().
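+    // Channel layout (a sketch of the invariant, not additional API):
+    //   msg_channels.size() == enableFineGrainedShuffle(stream_count) ? max_streams : 1
+    // so nextResult(block_queue, header, stream_id) pops from its own queue when
+    // fine grained shuffle is enabled, and every caller uses stream_id 0 otherwise.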
+    template <bool enable_fine_grained_shuffle>
     void readLoop(const Request & req);
+    template <bool enable_fine_grained_shuffle>
     void reactor(const std::vector<Request> & async_requests);
 
     bool setEndState(ExchangeReceiverState new_state);
@@ -139,12 +165,14 @@ class ExchangeReceiverBase
         std::queue<Block> & block_queue,
         const Block & header);
 
-
     void connectionDone(
         bool meet_error,
         const String & local_err_msg,
         const LoggerPtr & log);
 
+    void finishAllMsgChannels();
+    void cancelAllMsgChannels();
+
     std::shared_ptr<RPCContext> rpc_context;
 
     const tipb::ExchangeReceiver pb_exchange_receiver;
@@ -156,7 +184,7 @@ class ExchangeReceiverBase
     std::shared_ptr<ThreadManager> thread_manager;
     DAGSchema schema;
 
-    MPMCQueue<std::shared_ptr<ReceivedMessage>> msg_channel;
+    std::vector<MsgChannelPtr> msg_channels;
 
     std::mutex mu;
     /// should lock `mu` when visit these members
@@ -168,6 +196,7 @@ class ExchangeReceiverBase
     bool collected = false;
     int thread_count = 0;
+    uint64_t fine_grained_shuffle_stream_count;
 };
 
 class ExchangeReceiver : public ExchangeReceiverBase<GRPCReceiverContext>
diff --git a/dbms/src/Flash/Mpp/MPPHandler.cpp b/dbms/src/Flash/Mpp/MPPHandler.cpp
index a3096aaa644..7f97a1dd698 100644
--- a/dbms/src/Flash/Mpp/MPPHandler.cpp
+++ b/dbms/src/Flash/Mpp/MPPHandler.cpp
@@ -31,7 +31,7 @@ void MPPHandler::handleError(const MPPTaskPtr & task, String error)
     try
     {
         if (task)
-            task->cancel(error);
+            task->handleError(error);
     }
     catch (...)
     {
diff --git a/dbms/src/Flash/Mpp/MPPReceiverSet.cpp b/dbms/src/Flash/Mpp/MPPReceiverSet.cpp
new file mode 100644
index 00000000000..60cca308c18
--- /dev/null
+++ b/dbms/src/Flash/Mpp/MPPReceiverSet.cpp
@@ -0,0 +1,48 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+
+namespace DB
+{
+void MPPReceiverSet::addExchangeReceiver(const String & executor_id, const ExchangeReceiverPtr & exchange_receiver)
+{
+    RUNTIME_ASSERT(exchange_receiver_map.find(executor_id) == exchange_receiver_map.end(), log, "Duplicate executor_id: {} in DAGRequest", executor_id);
+    exchange_receiver_map[executor_id] = exchange_receiver;
+}
+
+void MPPReceiverSet::addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader)
+{
+    coprocessor_readers.push_back(coprocessor_reader);
+}
+
+ExchangeReceiverPtr MPPReceiverSet::getExchangeReceiver(const String & executor_id) const
+{
+    auto it = exchange_receiver_map.find(executor_id);
+    if (unlikely(it == exchange_receiver_map.end()))
+        return nullptr;
+    return it->second;
+}
+
+void MPPReceiverSet::cancel()
+{
+    for (auto & it : exchange_receiver_map)
+    {
+        it.second->cancel();
+    }
+    for (auto & cop_reader : coprocessor_readers)
+        cop_reader->cancel();
+}
+} // namespace DB
diff --git a/dbms/src/Flash/Mpp/MPPReceiverSet.h b/dbms/src/Flash/Mpp/MPPReceiverSet.h
new file mode 100644
index 00000000000..44274cb3ce8
--- /dev/null
+++ b/dbms/src/Flash/Mpp/MPPReceiverSet.h
@@ -0,0 +1,44 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+
+namespace DB
+{
+class MPPReceiverSet
+{
+public:
+    explicit MPPReceiverSet(const String & req_id)
+        : log(Logger::get("MPPReceiverSet", req_id))
+    {}
+    void addExchangeReceiver(const String & executor_id, const ExchangeReceiverPtr & exchange_receiver);
+    void addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader);
+    ExchangeReceiverPtr getExchangeReceiver(const String & executor_id) const;
+    void cancel();
+
+private:
+    /// two kinds of receivers in MPP
+    /// ExchangeReceiver: receives data from other MPPTasks
+    /// CoprocessorReader: used in remote reads
+    ExchangeReceiverMap exchange_receiver_map;
+    std::vector<CoprocessorReaderPtr> coprocessor_readers;
+    const LoggerPtr log;
+};
+
+using MPPReceiverSetPtr = std::shared_ptr<MPPReceiverSet>;
+
+} // namespace DB
diff --git a/dbms/src/Flash/Mpp/MPPTask.cpp b/dbms/src/Flash/Mpp/MPPTask.cpp
index 40f03ff79ba..7ddc6af361f 100644
--- a/dbms/src/Flash/Mpp/MPPTask.cpp
+++ b/dbms/src/Flash/Mpp/MPPTask.cpp
@@ -51,6 +51,7 @@ extern const char exception_before_mpp_register_tunnel_for_root_mpp_task[];
 extern const char exception_during_mpp_register_tunnel_for_non_root_mpp_task[];
 extern const char exception_during_mpp_write_err_to_tunnel[];
 extern const char force_no_local_region_for_mpp_task[];
+extern const char random_task_lifecycle_failpoint[];
 } // namespace FailPoints
 
 MPPTask::MPPTask(const mpp::TaskMeta & meta_, const ContextPtr & context_)
@@ -80,6 +81,34 @@ MPPTask::~MPPTask()
     LOG_FMT_DEBUG(log, "finish MPPTask: {}", id.toString());
 }
 
+void MPPTask::abortTunnels(const String & message, AbortType abort_type)
+{
+    if (abort_type == AbortType::ONCANCELLATION)
+    {
+        closeAllTunnels(message);
+    }
+    else
+    {
+        RUNTIME_ASSERT(tunnel_set != nullptr, log, "mpp task without tunnel set");
+        tunnel_set->writeError(message);
+    }
+}
+
+void MPPTask::abortReceivers()
+{
+    if (likely(receiver_set != nullptr))
+    {
+        receiver_set->cancel();
+    }
+}
+
+void MPPTask::abortDataStreams(AbortType abort_type)
+{
+    /// When the abort type is ONERROR, the MPPTask already knows it met an error,
+    /// so let the remaining work stop silently to avoid too many useless error messages
+    bool is_kill = abort_type == AbortType::ONCANCELLATION;
+    context->getProcessList().sendCancelToQuery(context->getCurrentQueryId(), context->getClientInfo().current_user, is_kill);
+}
+
 void MPPTask::closeAllTunnels(const String & reason)
 {
     if (likely(tunnel_set))
@@ -125,7 +154,7 @@ void MPPTask::registerTunnels(const mpp::DispatchTaskRequest & task_request)
 
 void MPPTask::initExchangeReceivers()
 {
-    mpp_exchange_receiver_map = std::make_shared<ExchangeReceiverMap>();
+    receiver_set = std::make_shared<MPPReceiverSet>(log->identifier());
     traverseExecutors(&dag_req, [&](const tipb::Executor & executor) {
         if (executor.tp() == tipb::ExecType::TypeExchangeReceiver)
         {
@@ -143,27 +172,17 @@ void MPPTask::initExchangeReceivers()
                 executor.exchange_receiver().encoded_task_meta_size(),
                 context->getMaxStreams(),
                 log->identifier(),
-                executor_id);
+                executor_id,
+                executor.fine_grained_shuffle_stream_count());
             if (status != RUNNING)
                 throw Exception("exchange receiver map can not be initialized, because the task is not in running state");
not in running state"); - (*mpp_exchange_receiver_map)[executor_id] = exchange_receiver; + receiver_set->addExchangeReceiver(executor_id, exchange_receiver); new_thread_count_of_exchange_receiver += exchange_receiver->computeNewThreadCount(); } return true; }); - dag_context->setMPPExchangeReceiverMap(mpp_exchange_receiver_map); -} - -void MPPTask::cancelAllExchangeReceivers() -{ - if (likely(mpp_exchange_receiver_map != nullptr)) - { - for (auto & it : *mpp_exchange_receiver_map) - { - it.second->cancel(); - } - } + dag_context->setMPPReceiverSet(receiver_set); } std::pair MPPTask::getTunnel(const ::mpp::EstablishMPPConnectionRequest * request) @@ -359,92 +378,124 @@ void MPPTask::runImpl() return_statistics.blocks, return_statistics.bytes); } - catch (Exception & e) - { - err_msg = e.displayText(); - LOG_FMT_ERROR(log, "task running meets error: {} Stack Trace : {}", err_msg, e.getStackTrace().toString()); - } - catch (pingcap::Exception & e) - { - err_msg = e.message(); - LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); - } - catch (std::exception & e) - { - err_msg = e.what(); - LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); - } catch (...) { - err_msg = "unrecovered error"; - LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); + err_msg = getCurrentExceptionMessage(true, true); } + if (err_msg.empty()) { - // todo when error happens, should try to update the metrics if it is available - auto throughput = dag_context->getTableScanThroughput(); - if (throughput.first) - GET_METRIC(tiflash_storage_logical_throughput_bytes).Observe(throughput.second); - auto process_info = context->getProcessListElement()->getInfo(); - auto peak_memory = process_info.peak_memory_usage > 0 ? process_info.peak_memory_usage : 0; - GET_METRIC(tiflash_coprocessor_request_memory_usage, type_run_mpp_task).Observe(peak_memory); - mpp_task_statistics.setMemoryPeak(peak_memory); + if (switchStatus(RUNNING, FINISHED)) + LOG_INFO(log, "finish task"); + else + LOG_FMT_WARNING(log, "finish task which is in {} state", taskStatusToString(status)); + if (status == FINISHED) + { + // todo when error happens, should try to update the metrics if it is available + auto throughput = dag_context->getTableScanThroughput(); + if (throughput.first) + GET_METRIC(tiflash_storage_logical_throughput_bytes).Observe(throughput.second); + auto process_info = context->getProcessListElement()->getInfo(); + auto peak_memory = process_info.peak_memory_usage > 0 ? process_info.peak_memory_usage : 0; + GET_METRIC(tiflash_coprocessor_request_memory_usage, type_run_mpp_task).Observe(peak_memory); + mpp_task_statistics.setMemoryPeak(peak_memory); + } } else { - context->getProcessList().sendCancelToQuery(context->getCurrentQueryId(), context->getClientInfo().current_user, true); - cancelAllExchangeReceivers(); - writeErrToAllTunnels(err_msg); + if (status == RUNNING) + { + LOG_FMT_ERROR(log, "task running meets error: {}", err_msg); + /// trim the stack trace to avoid too many useless information in log + trimStackTrace(err_msg); + try + { + handleError(err_msg); + } + catch (...) 
+            {
+                tryLogCurrentException(log, "Meet error while trying to handle error in MPPTask");
+            }
+        }
     }
     LOG_FMT_INFO(log, "task ends, time cost is {} ms.", stopwatch.elapsedMilliseconds());
-    unregisterTask();
-
-    if (switchStatus(RUNNING, FINISHED))
-        LOG_INFO(log, "finish task");
-    else
-        LOG_WARNING(log, "finish task which was cancelled before");
+    // The unregister flag is only for FailPoint usage, to produce the situation that the MPPTask is
+    // destructed by the grpc CancelMPPTask thread.
+    bool unregister = true;
+    fiu_do_on(FailPoints::random_task_lifecycle_failpoint, {
+        if (!err_msg.empty())
+            unregister = false;
+    });
+    if (unregister)
+        unregisterTask();
-
     mpp_task_statistics.end(status.load(), err_string);
     mpp_task_statistics.logTracingJson();
 }
 
-void MPPTask::writeErrToAllTunnels(const String & e)
+void MPPTask::handleError(const String & error_msg)
 {
-    RUNTIME_ASSERT(tunnel_set != nullptr, log, "mpp task without tunnel set");
-    tunnel_set->writeError(e);
+    if (manager == nullptr || !manager->isTaskToBeCancelled(id))
+        abort(error_msg, AbortType::ONERROR);
 }
 
-void MPPTask::cancel(const String & reason)
+void MPPTask::abort(const String & message, AbortType abort_type)
 {
-    CPUAffinityManager::getInstance().bindSelfQueryThread();
-    LOG_FMT_WARNING(log, "Begin cancel task: {}", id.toString());
+    String abort_type_string;
+    TaskStatus next_task_status;
+    switch (abort_type)
+    {
+    case AbortType::ONCANCELLATION:
+        abort_type_string = "ONCANCELLATION";
+        next_task_status = CANCELLED;
+        break;
+    case AbortType::ONERROR:
+        abort_type_string = "ONERROR";
+        next_task_status = FAILED;
+        break;
+    }
+    LOG_FMT_WARNING(log, "Begin abort task: {}, abort type: {}", id.toString(), abort_type_string);
     while (true)
     {
         auto previous_status = status.load();
-        if (previous_status == FINISHED || previous_status == CANCELLED)
+        if (previous_status == FINISHED || previous_status == CANCELLED || previous_status == FAILED)
         {
-            LOG_FMT_WARNING(log, "task already {}", (previous_status == FINISHED ? "finished" : "cancelled"));
"finished" : "cancelled")); + LOG_FMT_WARNING(log, "task already in {} state", taskStatusToString(previous_status)); return; } - else if (previous_status == INITIALIZING && switchStatus(INITIALIZING, CANCELLED)) + else if (previous_status == INITIALIZING && switchStatus(INITIALIZING, next_task_status)) { - closeAllTunnels(reason); + err_string = message; + /// if the task is in initializing state, mpp task can return error to TiDB directly, + /// so just close all tunnels here + closeAllTunnels(message); unregisterTask(); - LOG_WARNING(log, "Finish cancel task from uninitialized"); + LOG_WARNING(log, "Finish abort task from uninitialized"); return; } - else if (previous_status == RUNNING && switchStatus(RUNNING, CANCELLED)) + else if (previous_status == RUNNING && switchStatus(RUNNING, next_task_status)) { + /// abort the components from top to bottom because if bottom components are aborted + /// first, the top components may see an error caused by the abort, which is not + /// the original error + err_string = message; + abortTunnels(message, abort_type); + abortDataStreams(abort_type); + abortReceivers(); scheduleThisTask(ScheduleState::FAILED); - context->getProcessList().sendCancelToQuery(context->getCurrentQueryId(), context->getClientInfo().current_user, true); - closeAllTunnels(reason); /// runImpl is running, leave remaining work to runImpl - LOG_WARNING(log, "Finish cancel task from running"); + LOG_WARNING(log, "Finish abort task from running"); return; } } } +void MPPTask::cancel(const String & reason) +{ + CPUAffinityManager::getInstance().bindSelfQueryThread(); + abort(reason, AbortType::ONCANCELLATION); +} + bool MPPTask::switchStatus(TaskStatus from, TaskStatus to) { return status.compare_exchange_strong(from, to); diff --git a/dbms/src/Flash/Mpp/MPPTask.h b/dbms/src/Flash/Mpp/MPPTask.h index c8423ac484c..a30150b26e8 100644 --- a/dbms/src/Flash/Mpp/MPPTask.h +++ b/dbms/src/Flash/Mpp/MPPTask.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ class MPPTask : public std::enable_shared_from_this void cancel(const String & reason); + void handleError(const String & error_msg); + void prepare(const mpp::DispatchTaskRequest & task_request); void run(); @@ -89,12 +92,22 @@ class MPPTask : public std::enable_shared_from_this void unregisterTask(); - void writeErrToAllTunnels(const String & e); - /// Similar to `writeErrToAllTunnels`, but it just try to write the error message to tunnel /// without waiting the tunnel to be connected void closeAllTunnels(const String & reason); + enum class AbortType + { + /// todo add ONKILL to distinguish between silent cancellation and kill + ONCANCELLATION, + ONERROR, + }; + void abort(const String & message, AbortType abort_type); + + void abortTunnels(const String & message, AbortType abort_type); + void abortReceivers(); + void abortDataStreams(AbortType abort_type); + void finishWrite(); bool switchStatus(TaskStatus from, TaskStatus to); @@ -109,8 +122,6 @@ class MPPTask : public std::enable_shared_from_this void initExchangeReceivers(); - void cancelAllExchangeReceivers(); - tipb::DAGRequest dag_req; ContextPtr context; @@ -120,14 +131,15 @@ class MPPTask : public std::enable_shared_from_this MemoryTracker * memory_tracker = nullptr; std::atomic status{INITIALIZING}; + String err_string; mpp::TaskMeta meta; MPPTaskId id; MPPTunnelSetPtr tunnel_set; - /// key: executor_id of ExchangeReceiver nodes in dag. 
- ExchangeReceiverMapPtr mpp_exchange_receiver_map; + + MPPReceiverSetPtr receiver_set; int new_thread_count_of_exchange_receiver = 0; @@ -137,8 +149,6 @@ class MPPTask : public std::enable_shared_from_this MPPTaskStatistics mpp_task_statistics; - Exception err; - friend class MPPTaskManager; int needed_threads; diff --git a/dbms/src/Flash/Mpp/MPPTaskManager.cpp b/dbms/src/Flash/Mpp/MPPTaskManager.cpp index 531f8f7a10d..c5499eda89d 100644 --- a/dbms/src/Flash/Mpp/MPPTaskManager.cpp +++ b/dbms/src/Flash/Mpp/MPPTaskManager.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include @@ -22,6 +23,11 @@ namespace DB { +namespace FailPoints +{ +extern const char random_task_manager_find_task_failure_failpoint[]; +} // namespace FailPoints + MPPTaskManager::MPPTaskManager(MPPTaskSchedulerPtr scheduler_) : scheduler(std::move(scheduler_)) , log(&Poco::Logger::get("TaskManager")) @@ -50,6 +56,7 @@ MPPTaskPtr MPPTaskManager::findTaskWithTimeout(const mpp::TaskMeta & meta, std:: it = query_it->second->task_map.find(id); return it != query_it->second->task_map.end(); }); + fiu_do_on(FailPoints::random_task_manager_find_task_failure_failpoint, ret = false;); if (cancelled) { errMsg = fmt::format("Task [{},{}] has been cancelled.", meta.start_ts(), meta.task_id()); @@ -140,6 +147,17 @@ bool MPPTaskManager::registerTask(MPPTaskPtr task) return true; } +bool MPPTaskManager::isTaskToBeCancelled(const MPPTaskId & task_id) +{ + std::unique_lock lock(mu); + auto it = mpp_query_map.find(task_id.start_ts); + if (it != mpp_query_map.end() && it->second->to_be_cancelled) + { + return it->second->task_map.find(task_id) != it->second->task_map.end(); + } + return false; +} + void MPPTaskManager::unregisterTask(MPPTask * task) { std::unique_lock lock(mu); diff --git a/dbms/src/Flash/Mpp/MPPTaskManager.h b/dbms/src/Flash/Mpp/MPPTaskManager.h index d7047804aca..770acea3853 100644 --- a/dbms/src/Flash/Mpp/MPPTaskManager.h +++ b/dbms/src/Flash/Mpp/MPPTaskManager.h @@ -73,6 +73,8 @@ class MPPTaskManager : private boost::noncopyable void unregisterTask(MPPTask * task); + bool isTaskToBeCancelled(const MPPTaskId & task_id); + bool tryToScheduleTask(const MPPTaskPtr & task); void releaseThreadsFromScheduler(const int needed_threads); diff --git a/dbms/src/Flash/Mpp/MPPTunnel.cpp b/dbms/src/Flash/Mpp/MPPTunnel.cpp index 826e7fea88a..16fe4ae42cc 100644 --- a/dbms/src/Flash/Mpp/MPPTunnel.cpp +++ b/dbms/src/Flash/Mpp/MPPTunnel.cpp @@ -25,6 +25,7 @@ namespace DB namespace FailPoints { extern const char exception_during_mpp_close_tunnel[]; +extern const char random_tunnel_wait_timeout_failpoint[]; } // namespace FailPoints template @@ -219,7 +220,11 @@ void MPPTunnelBase::sendJob(bool need_lock) err_msg = "fatal error in sendJob()"; } if (!err_msg.empty()) + { + /// append tunnel id to error message + err_msg = fmt::format("{} meet error: {}", tunnel_id, err_msg); LOG_ERROR(log, err_msg); + } consumerFinish(err_msg, need_lock); if (is_async) writer->writeDone(grpc::Status::OK); @@ -322,6 +327,7 @@ void MPPTunnelBase::waitUntilConnectedOrFinished(std::unique_lock #include #include #include namespace DB { +namespace FailPoints +{ +extern const char random_min_tso_scheduler_failpoint[]; +} // namespace FailPoints + constexpr UInt64 MAX_UINT64 = std::numeric_limits::max(); constexpr UInt64 OS_THREAD_SOFT_LIMIT = 100000; @@ -193,7 +199,9 @@ bool MinTSOScheduler::scheduleImp(const UInt64 tso, const MPPQueryTaskSetPtr & q } else { - 
if (tso <= min_tso) /// the min_tso query should fully run, otherwise throw errors here. + bool is_tso_min = tso <= min_tso; + fiu_do_on(FailPoints::random_min_tso_scheduler_failpoint, is_tso_min = true;); + if (is_tso_min) /// the min_tso query should fully run, otherwise throw errors here. { has_error = true; auto msg = fmt::format("threads are unavailable for the query {} ({} min_tso {}) {}, need {}, but used {} of the thread hard limit {}, {} active and {} waiting queries.", tso, tso == min_tso ? "is" : "is newer than", min_tso, isWaiting ? "from the waiting set" : "when directly schedule it", needed_threads, estimated_thread_usage, thread_hard_limit, active_set.size(), waiting_set.size()); diff --git a/dbms/src/Flash/Mpp/TaskStatus.cpp b/dbms/src/Flash/Mpp/TaskStatus.cpp index 423b768faea..c87ae2b8eb4 100644 --- a/dbms/src/Flash/Mpp/TaskStatus.cpp +++ b/dbms/src/Flash/Mpp/TaskStatus.cpp @@ -29,6 +29,8 @@ StringRef taskStatusToString(const TaskStatus & status) return "FINISHED"; case CANCELLED: return "CANCELLED"; + case FAILED: + return "FAILED"; default: throw Exception("Unknown TaskStatus"); } diff --git a/dbms/src/Flash/Mpp/TaskStatus.h b/dbms/src/Flash/Mpp/TaskStatus.h index 999e30790bf..0997c8adc52 100644 --- a/dbms/src/Flash/Mpp/TaskStatus.h +++ b/dbms/src/Flash/Mpp/TaskStatus.h @@ -24,6 +24,7 @@ enum TaskStatus RUNNING, FINISHED, CANCELLED, + FAILED, }; StringRef taskStatusToString(const TaskStatus & status); diff --git a/dbms/src/Flash/Mpp/Utils.cpp b/dbms/src/Flash/Mpp/Utils.cpp index 477c478eef7..21d89b3cd52 100644 --- a/dbms/src/Flash/Mpp/Utils.cpp +++ b/dbms/src/Flash/Mpp/Utils.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include @@ -27,4 +28,14 @@ mpp::MPPDataPacket getPacketWithError(String reason) return data; } +void trimStackTrace(String & message) +{ + auto stack_trace_pos = message.find("Stack trace"); + if (stack_trace_pos != String::npos) + { + message.resize(stack_trace_pos); + Poco::trimRightInPlace(message); + } +} + } // namespace DB diff --git a/dbms/src/Flash/Mpp/Utils.h b/dbms/src/Flash/Mpp/Utils.h index 67e2dc3f641..021dc4407d5 100644 --- a/dbms/src/Flash/Mpp/Utils.h +++ b/dbms/src/Flash/Mpp/Utils.h @@ -23,5 +23,6 @@ namespace DB { mpp::MPPDataPacket getPacketWithError(String reason); +void trimStackTrace(String & message); } // namespace DB diff --git a/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp b/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp index 47ce2ee6ee6..706c17ed036 100644 --- a/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp +++ b/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp @@ -382,7 +382,7 @@ TEST_F(TestMPPTunnelBase, WriteError) } catch (Exception & e) { - GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, grpc writes failed."); + GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, 0000_0001 meet error: grpc writes failed."); } } @@ -631,7 +631,7 @@ TEST_F(TestMPPTunnelBase, AsyncWriteError) } catch (Exception & e) { - GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, grpc writes failed."); + GTEST_ASSERT_EQ(e.message(), "Consumer exits unexpected, 0000_0001 meet error: grpc writes failed."); } } diff --git a/dbms/src/Flash/tests/WindowTestUtil.h b/dbms/src/Flash/tests/WindowTestUtil.h index 3f4cb7d595f..b7385380419 100644 --- a/dbms/src/Flash/tests/WindowTestUtil.h +++ b/dbms/src/Flash/tests/WindowTestUtil.h @@ -39,9 +39,9 @@ inline void mockExecuteProject(std::shared_ptr & mock_ mock_interpreter->executeProject(pipeline, final_project); } -inline void mockExecuteWindowOrder(std::shared_ptr & 
mock_interpreter, DAGPipeline & pipeline, const tipb::Sort & sort) +inline void mockExecuteWindowOrder(std::shared_ptr & mock_interpreter, DAGPipeline & pipeline, const tipb::Sort & sort, uint64_t fine_grained_shuffle_stream_count) { - mock_interpreter->handleWindowOrder(pipeline, sort); + mock_interpreter->handleWindowOrder(pipeline, sort, ::DB::enableFineGrainedShuffle(fine_grained_shuffle_stream_count)); mock_interpreter->input_streams_vec[0] = pipeline.streams; NamesWithAliases final_project; for (const auto & column : (*mock_interpreter->analyzer).source_columns) @@ -51,16 +51,9 @@ inline void mockExecuteWindowOrder(std::shared_ptr & m mockExecuteProject(mock_interpreter, pipeline, final_project); } -inline void mockExecuteWindowOrder(std::shared_ptr & mock_interpreter, DAGPipeline & pipeline, const String & sort_json) +inline void mockExecuteWindow(std::shared_ptr & mock_interpreter, DAGPipeline & pipeline, const tipb::Window & window, uint64_t fine_grained_shuffle_stream_count) { - tipb::Sort sort; - ::google::protobuf::util::JsonStringToMessage(sort_json, &sort); - mockExecuteWindowOrder(mock_interpreter, pipeline, sort); -} - -inline void mockExecuteWindow(std::shared_ptr & mock_interpreter, DAGPipeline & pipeline, const tipb::Window & window) -{ - mock_interpreter->handleWindow(pipeline, window); + mock_interpreter->handleWindow(pipeline, window, ::DB::enableFineGrainedShuffle(fine_grained_shuffle_stream_count)); mock_interpreter->input_streams_vec[0] = pipeline.streams; NamesWithAliases final_project; for (const auto & column : (*mock_interpreter->analyzer).source_columns) @@ -70,12 +63,5 @@ inline void mockExecuteWindow(std::shared_ptr & mock_i mockExecuteProject(mock_interpreter, pipeline, final_project); } -inline void mockExecuteWindow(std::shared_ptr & mock_interpreter, DAGPipeline & pipeline, std::string window_json_str) -{ - tipb::Window window; - google::protobuf::util::JsonStringToMessage(window_json_str, &window); - mockExecuteWindow(mock_interpreter, pipeline, window); -} - } // namespace tests } // namespace DB diff --git a/dbms/src/Flash/tests/bench_exchange.cpp b/dbms/src/Flash/tests/bench_exchange.cpp index fbb53bfd4a4..d6e3f3e825e 100644 --- a/dbms/src/Flash/tests/bench_exchange.cpp +++ b/dbms/src/Flash/tests/bench_exchange.cpp @@ -47,29 +47,46 @@ MockFixedRowsBlockInputStream::MockFixedRowsBlockInputStream(size_t total_rows_, , blocks(blocks_) {} -Block makeBlock(int row_num) +Block makeBlock(int row_num, bool skew) { - std::mt19937 mt(rd()); - std::uniform_int_distribution int64_dist; - std::uniform_int_distribution len_dist(10, 20); - std::uniform_int_distribution char_dist; - InferredDataVector> int64_vec; InferredDataVector> int64_vec2; - for (int i = 0; i < row_num; ++i) + InferredDataVector> string_vec; + + if (skew) { - int64_vec.emplace_back(int64_dist(mt)); - int64_vec2.emplace_back(int64_dist(mt)); + for (int i = 0; i < row_num; ++i) + { + int64_vec.emplace_back(100); + int64_vec2.emplace_back(100); + } + + for (int i = 0; i < row_num; ++i) + { + string_vec.push_back("abcdefg"); + } } - - InferredDataVector> string_vec; - for (int i = 0; i < row_num; ++i) + else { - int len = len_dist(mt); - String s; - for (int j = 0; j < len; ++j) - s.push_back(char_dist(mt)); - string_vec.push_back(std::move(s)); + std::mt19937 mt(rd()); + std::uniform_int_distribution int64_dist; + std::uniform_int_distribution len_dist(10, 20); + std::uniform_int_distribution char_dist; + + for (int i = 0; i < row_num; ++i) + { + int64_vec.emplace_back(int64_dist(mt)); + 
int64_vec2.emplace_back(int64_dist(mt)); + } + + for (int i = 0; i < row_num; ++i) + { + int len = len_dist(mt); + String s; + for (int j = 0; j < len; ++j) + s.push_back(char_dist(mt)); + string_vec.push_back(std::move(s)); + } } auto int64_data_type = makeDataType>(); @@ -82,11 +99,11 @@ Block makeBlock(int row_num) return Block({int64_column, string_column, int64_column2}); } -std::vector makeBlocks(int block_num, int row_num) +std::vector makeBlocks(int block_num, int row_num, bool skew) { std::vector blocks; for (int i = 0; i < block_num; ++i) - blocks.push_back(makeBlock(row_num)); + blocks.push_back(makeBlock(row_num, skew)); return blocks; } @@ -139,32 +156,10 @@ void printException(const Exception & e) << e.getStackTrace().toString() << std::endl; } -void sendPacket(const std::vector & packets, const PacketQueuePtr & queue, StopFlag & stop_flag) -{ - std::mt19937 mt(rd()); - std::uniform_int_distribution dist(0, packets.size() - 1); - - while (!stop_flag.load()) - { - int i = dist(mt); - queue->tryPush(packets[i], std::chrono::milliseconds(10)); - } - queue->finish(); -} - -void receivePacket(const PacketQueuePtr & queue) -{ - while (true) - { - PacketPtr packet; - if (!queue->pop(packet)) - break; - } -} - -ReceiverHelper::ReceiverHelper(int concurrency_, int source_num_) +ReceiverHelper::ReceiverHelper(int concurrency_, int source_num_, uint32_t fine_grained_shuffle_stream_count_) : concurrency(concurrency_) , source_num(source_num_) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) { pb_exchange_receiver.set_tp(tipb::Hash); for (int i = 0; i < source_num; ++i) @@ -198,16 +193,21 @@ MockExchangeReceiverPtr ReceiverHelper::buildReceiver() source_num, concurrency, "mock_req_id", - "mock_exchange_receiver_id"); + "mock_exchange_receiver_id", + fine_grained_shuffle_stream_count); } std::vector ReceiverHelper::buildExchangeReceiverStream() { auto receiver = buildReceiver(); std::vector streams(concurrency); + // NOTE: check if need fine_grained_shuffle_stream_count for (int i = 0; i < concurrency; ++i) { - streams[i] = std::make_shared(receiver, "mock_req_id", "mock_executor_id" + std::to_string(i)); + streams[i] = std::make_shared(receiver, + "mock_req_id", + "mock_executor_id" + std::to_string(i), + /*stream_id=*/enableFineGrainedShuffle(fine_grained_shuffle_stream_count) ? 
i : 0); } return streams; } @@ -215,7 +215,7 @@ std::vector ReceiverHelper::buildExchangeReceiverStream() BlockInputStreamPtr ReceiverHelper::buildUnionStream() { auto streams = buildExchangeReceiverStream(); - return std::make_shared>(streams, nullptr, concurrency, /*req_id=*/""); + return std::make_shared>(streams, BlockInputStreams{}, concurrency, /*req_id=*/""); } void ReceiverHelper::finish() @@ -230,10 +230,14 @@ void ReceiverHelper::finish() SenderHelper::SenderHelper( int source_num_, int concurrency_, + uint32_t fine_grained_shuffle_stream_count_, + int64_t fine_grained_shuffle_batch_size_, const std::vector & queues_, const std::vector & fields) : source_num(source_num_) , concurrency(concurrency_) + , fine_grained_shuffle_stream_count(fine_grained_shuffle_stream_count_) + , fine_grained_shuffle_batch_size(fine_grained_shuffle_batch_size_) , queues(queues_) { mpp::TaskMeta task_meta; @@ -277,20 +281,41 @@ BlockInputStreamPtr SenderHelper::buildUnionStream( for (int i = 0; i < concurrency; ++i) { BlockInputStreamPtr stream = std::make_shared(blocks, stop_flag); - std::unique_ptr response_writer( - new StreamingDAGResponseWriter( - tunnel_set, - {0, 1, 2}, - TiDB::TiDBCollators(3), - tipb::Hash, - -1, - -1, - true, - *dag_context)); - send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count)) + { + std::unique_ptr response_writer( + new StreamingDAGResponseWriter( + tunnel_set, + {0, 1, 2}, + TiDB::TiDBCollators(3), + tipb::Hash, + -1, + -1, + true, + *dag_context, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size)); + send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + } + else + { + std::unique_ptr response_writer( + new StreamingDAGResponseWriter( + tunnel_set, + {0, 1, 2}, + TiDB::TiDBCollators(3), + tipb::Hash, + -1, + -1, + true, + *dag_context, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size)); + send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + } } - return std::make_shared>(send_streams, nullptr, concurrency, /*req_id=*/""); + return std::make_shared>(send_streams, BlockInputStreams{}, concurrency, /*req_id=*/""); } BlockInputStreamPtr SenderHelper::buildUnionStream(size_t total_rows, const std::vector & blocks) @@ -299,20 +324,41 @@ BlockInputStreamPtr SenderHelper::buildUnionStream(size_t total_rows, const std: for (int i = 0; i < concurrency; ++i) { BlockInputStreamPtr stream = std::make_shared(total_rows / concurrency, blocks); - std::unique_ptr response_writer( - new StreamingDAGResponseWriter( - tunnel_set, - {0, 1, 2}, - TiDB::TiDBCollators(3), - tipb::Hash, - -1, - -1, - true, - *dag_context)); - send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + if (enableFineGrainedShuffle(fine_grained_shuffle_stream_count)) + { + std::unique_ptr response_writer( + new StreamingDAGResponseWriter( + tunnel_set, + {0, 1, 2}, + TiDB::TiDBCollators(3), + tipb::Hash, + -1, + -1, + true, + *dag_context, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size)); + send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + } + else + { + std::unique_ptr response_writer( + new StreamingDAGResponseWriter( + tunnel_set, + {0, 1, 2}, + TiDB::TiDBCollators(3), + tipb::Hash, + -1, + -1, + true, + *dag_context, + fine_grained_shuffle_stream_count, + 
fine_grained_shuffle_batch_size)); + send_streams.push_back(std::make_shared(stream, std::move(response_writer), /*req_id=*/"")); + } } - return std::make_shared>(send_streams, nullptr, concurrency, /*req_id=*/""); + return std::make_shared>(send_streams, BlockInputStreams{}, concurrency, /*req_id=*/""); } void SenderHelper::finish() @@ -327,13 +373,12 @@ void SenderHelper::finish() void ExchangeBench::SetUp(const benchmark::State &) { - Poco::Logger::root().setLevel("error"); - DynamicThreadPool::global_instance = std::make_unique( /*fixed_thread_num=*/300, std::chrono::milliseconds(100000)); - input_blocks = makeBlocks(/*block_num=*/100, /*row_num=*/1024); + uniform_blocks = makeBlocks(/*block_num=*/100, /*row_num=*/1024); + skew_blocks = makeBlocks(/*block_num=*/100, /*row_num=*/1024, /*skew=*/true); try { @@ -348,7 +393,8 @@ void ExchangeBench::SetUp(const benchmark::State &) void ExchangeBench::TearDown(const benchmark::State &) { - input_blocks.clear(); + uniform_blocks.clear(); + skew_blocks.clear(); // NOTE: Must reset here, otherwise DynamicThreadPool::fixedWork() may core because metrics already destroyed. DynamicThreadPool::global_instance.reset(); } @@ -383,25 +429,38 @@ try const int concurrency = state.range(0); const int source_num = state.range(1); const int total_rows = state.range(2); + const int fine_grained_shuffle_stream_count = state.range(3); + const int fine_grained_shuffle_batch_size = state.range(4); Context context = TiFlashTestEnv::getContext(); for (auto _ : state) { - std::shared_ptr receiver_helper = std::make_shared(concurrency, source_num); + std::shared_ptr receiver_helper = std::make_shared(concurrency, source_num, fine_grained_shuffle_stream_count); BlockInputStreamPtr receiver_stream = receiver_helper->buildUnionStream(); std::shared_ptr sender_helper = std::make_shared(source_num, concurrency, + fine_grained_shuffle_stream_count, + fine_grained_shuffle_batch_size, receiver_helper->queues, receiver_helper->fields); - BlockInputStreamPtr sender_stream = sender_helper->buildUnionStream(total_rows, input_blocks); + BlockInputStreamPtr sender_stream = sender_helper->buildUnionStream(total_rows, uniform_blocks); runAndWait(receiver_helper, receiver_stream, sender_helper, sender_stream); } } CATCH BENCHMARK_REGISTER_F(ExchangeBench, basic_send_receive) - ->Args({8, 1, 1024 * 1000}); + ->Args({8, 1, 1024 * 1000, 0, 4096}) + ->Args({8, 1, 1024 * 1000, 4, 4096}) + ->Args({8, 1, 1024 * 1000, 8, 4096}) + ->Args({8, 1, 1024 * 1000, 16, 4096}) + ->Args({8, 1, 1024 * 1000, 32, 4096}) + ->Args({8, 1, 1024 * 1000, 8, 1}) + ->Args({8, 1, 1024 * 1000, 8, 1000}) + ->Args({8, 1, 1024 * 1000, 8, 10000}) + ->Args({8, 1, 1024 * 1000, 8, 100000}); + } // namespace tests } // namespace DB diff --git a/dbms/src/Flash/tests/bench_exchange.h b/dbms/src/Flash/tests/bench_exchange.h index 6b09e319613..d8300d45740 100644 --- a/dbms/src/Flash/tests/bench_exchange.h +++ b/dbms/src/Flash/tests/bench_exchange.h @@ -69,7 +69,9 @@ struct MockReceiverContext : queue(queue_) {} - void initialize() const {} + void initialize() const + { + } bool read(PacketPtr & packet [[maybe_unused]]) const { @@ -105,7 +107,8 @@ struct MockReceiverContext const std::vector & field_types_) : queues(queues_) , field_types(field_types_) - {} + { + } void fillSchema(DAGSchema & schema) const { @@ -220,8 +223,8 @@ struct MockFixedRowsBlockInputStream : public IProfilingBlockInputStream } }; -Block makeBlock(int row_num); -std::vector makeBlocks(int block_num, int row_num); +Block makeBlock(int row_num, bool skew 
= false); +std::vector makeBlocks(int block_num, int row_num, bool skew = false); mpp::MPPDataPacket makePacket(ChunkCodecStream & codec, int row_num); std::vector makePackets(ChunkCodecStream & codec, int packet_num, int row_num); std::vector makePacketQueues(int source_num, int queue_size); @@ -234,17 +237,17 @@ struct ReceiverHelper { const int concurrency; const int source_num; + const uint32_t fine_grained_shuffle_stream_count; tipb::ExchangeReceiver pb_exchange_receiver; std::vector fields; mpp::TaskMeta task_meta; std::vector queues; std::shared_ptr join_ptr; - explicit ReceiverHelper(int concurrency_, int source_num_); + explicit ReceiverHelper(int concurrency_, int source_num_, uint32_t fine_grained_shuffle_stream_count_); MockExchangeReceiverPtr buildReceiver(); std::vector buildExchangeReceiverStream(); BlockInputStreamPtr buildUnionStream(); - BlockInputStreamPtr buildUnionStreamWithHashJoinBuildStream(); void finish(); }; @@ -252,6 +255,8 @@ struct SenderHelper { const int source_num; const int concurrency; + const uint32_t fine_grained_shuffle_stream_count; + const int64_t fine_grained_shuffle_batch_size; std::vector queues; std::vector mock_writers; @@ -262,6 +267,8 @@ struct SenderHelper SenderHelper( int source_num_, int concurrency_, + uint32_t fine_grained_shuffle_stream_count_, + int64_t fine_grained_shuffle_batch_size_, const std::vector & queues_, const std::vector & fields); @@ -283,7 +290,8 @@ class ExchangeBench : public benchmark::Fixture std::shared_ptr & sender_helper, BlockInputStreamPtr sender_stream); - std::vector input_blocks; + std::vector uniform_blocks; + std::vector skew_blocks; }; diff --git a/dbms/src/Flash/tests/bench_window.cpp b/dbms/src/Flash/tests/bench_window.cpp index da9df20fdf3..75dc53b065b 100644 --- a/dbms/src/Flash/tests/bench_window.cpp +++ b/dbms/src/Flash/tests/bench_window.cpp @@ -24,10 +24,14 @@ class WindowFunctionBench : public ExchangeBench public: void SetUp(const benchmark::State & state) override { - // build tipb::Window and tipb::Sort. + // Using DAGRequestBuilder to build tipb::Window and tipb::Sort. 
// select row_number() over w1 from t1 window w1 as (partition by c1, c2, c3 order by c1, c2, c3); ExchangeBench::SetUp(state); - MockColumnInfos columns{ + } + + static void setupPB(uint64_t fine_grained_shuffle_stream_count, tipb::Window & window, tipb::Sort & sort) + { + MockColumnInfoVec columns{ {"c1", TiDB::TP::TypeLongLong}, {"c2", TiDB::TP::TypeString}, {"c3", TiDB::TP::TypeLongLong}, @@ -36,11 +40,12 @@ class WindowFunctionBench : public ExchangeBench DAGRequestBuilder builder(executor_index); builder .mockTable("test", "t1", columns) - .sort({{"c1", false}, {"c2", false}, {"c3", false}}, true) + .sort({{"c1", false}, {"c2", false}, {"c3", false}}, true, fine_grained_shuffle_stream_count) .window(RowNumber(), {{"c1", false}, {"c2", false}, {"c3", false}}, {{"c1", false}, {"c2", false}, {"c3", false}}, - buildDefaultRowsFrame()); + buildDefaultRowsFrame(), + fine_grained_shuffle_stream_count); tipb::DAGRequest req; MPPInfo mpp_info(0, -1, -1, {}, std::unordered_map>{}); builder.getRoot()->toTiPBExecutor(req.mutable_root_executor(), /*collator_id=*/0, mpp_info, TiFlashTestEnv::getContext()); @@ -50,13 +55,17 @@ class WindowFunctionBench : public ExchangeBench sort = window.child().sort(); } - void prepareWindowStream(Context & context, int concurrency, int source_num, int total_rows, const std::vector & blocks, BlockInputStreamPtr & sender_stream, BlockInputStreamPtr & receiver_stream, std::shared_ptr & sender_helper, std::shared_ptr & receiver_helper) const + static void prepareWindowStream(Context & context, int concurrency, int source_num, int total_rows, uint32_t fine_grained_shuffle_stream_count, uint64_t fine_grained_shuffle_batch_size, const std::vector & blocks, BlockInputStreamPtr & sender_stream, BlockInputStreamPtr & receiver_stream, std::shared_ptr & sender_helper, std::shared_ptr & receiver_helper, bool build_window = true) { + tipb::Window window; + tipb::Sort sort; + setupPB(fine_grained_shuffle_stream_count, window, sort); + DAGPipeline pipeline; - receiver_helper = std::make_shared(concurrency, source_num); + receiver_helper = std::make_shared(concurrency, source_num, fine_grained_shuffle_stream_count); pipeline.streams = receiver_helper->buildExchangeReceiverStream(); - sender_helper = std::make_shared(source_num, concurrency, receiver_helper->queues, receiver_helper->fields); + sender_helper = std::make_shared(source_num, concurrency, fine_grained_shuffle_stream_count, fine_grained_shuffle_batch_size, receiver_helper->queues, receiver_helper->fields); sender_stream = sender_helper->buildUnionStream(total_rows, blocks); context.setDAGContext(sender_helper->dag_context.get()); @@ -66,16 +75,16 @@ class WindowFunctionBench : public ExchangeBench NameAndTypePair("c3", makeNullable(std::make_shared()))}; auto mock_interpreter = mockInterpreter(context, source_columns, concurrency); mock_interpreter->input_streams_vec.push_back(pipeline.streams); - mockExecuteWindowOrder(mock_interpreter, pipeline, sort); - mockExecuteWindow(mock_interpreter, pipeline, window); + mockExecuteWindowOrder(mock_interpreter, pipeline, sort, fine_grained_shuffle_stream_count); + if (build_window) + { + mockExecuteWindow(mock_interpreter, pipeline, window, fine_grained_shuffle_stream_count); + } pipeline.transform([&](auto & stream) { stream = std::make_shared(stream, 8192, 0, "mock_executor_id_squashing"); }); - receiver_stream = std::make_shared>(pipeline.streams, nullptr, concurrency, /*req_id=*/""); + receiver_stream = std::make_shared>(pipeline.streams, BlockInputStreams{}, concurrency, 
/*req_id=*/""); } - - tipb::Window window; - tipb::Sort sort; }; BENCHMARK_DEFINE_F(WindowFunctionBench, basic_row_number) @@ -85,8 +94,15 @@ try const int concurrency = state.range(0); const int source_num = state.range(1); const int total_rows = state.range(2); + const int fine_grained_shuffle_stream_count = state.range(3); + const int fine_grained_shuffle_batch_size = state.range(4); + const bool skew = state.range(5); Context context = TiFlashTestEnv::getContext(); + std::vector * blocks = &uniform_blocks; + if (skew) + blocks = &skew_blocks; + for (auto _ : state) { std::shared_ptr sender_helper; @@ -94,14 +110,58 @@ try BlockInputStreamPtr sender_stream; BlockInputStreamPtr receiver_stream; - prepareWindowStream(context, concurrency, source_num, total_rows, input_blocks, sender_stream, receiver_stream, sender_helper, receiver_helper); + prepareWindowStream(context, concurrency, source_num, total_rows, fine_grained_shuffle_stream_count, fine_grained_shuffle_batch_size, *blocks, sender_stream, receiver_stream, sender_helper, receiver_helper); runAndWait(receiver_helper, receiver_stream, sender_helper, sender_stream); } } CATCH BENCHMARK_REGISTER_F(WindowFunctionBench, basic_row_number) - ->Args({8, 1, 1024 * 1000}); + ->Args({8, 1, 1024 * 1000, 0, 4096, false}) // Test fine_grained_shuffle_stream_count. + ->Args({8, 1, 1024 * 1000, 4, 4096, false}) + ->Args({8, 1, 1024 * 1000, 8, 4096, false}) + ->Args({8, 1, 1024 * 1000, 16, 4096, false}) + ->Args({8, 1, 1024 * 1000, 32, 4096, false}) + ->Args({8, 1, 1024 * 1000, 8, 1, false}) // Test fine_grained_shuffle_batch_size. + ->Args({8, 1, 1024 * 1000, 8, 1000, false}) + ->Args({8, 1, 1024 * 1000, 8, 10000, false}) + ->Args({8, 1, 1024 * 1000, 8, 100000, false}) + ->Args({8, 1, 1024 * 1000, 0, 4096, true}) // Test skew dataset. + ->Args({8, 1, 1024 * 1000, 4, 4096, true}) + ->Args({8, 1, 1024 * 1000, 8, 4096, true}) + ->Args({8, 1, 1024 * 1000, 16, 4096, true}); + +BENCHMARK_DEFINE_F(WindowFunctionBench, partial_sort_skew_dataset) +(benchmark::State & state) +try +{ + const int concurrency = state.range(0); + const int source_num = state.range(1); + const int total_rows = state.range(2); + const int fine_grained_shuffle_stream_count = state.range(3); + const int fine_grained_shuffle_batch_size = state.range(4); + Context context = TiFlashTestEnv::getContext(); + std::vector * blocks = &skew_blocks; + + for (auto _ : state) + { + std::shared_ptr sender_helper; + std::shared_ptr receiver_helper; + BlockInputStreamPtr sender_stream; + BlockInputStreamPtr receiver_stream; + + // Only build partial sort. + prepareWindowStream(context, concurrency, source_num, total_rows, fine_grained_shuffle_stream_count, fine_grained_shuffle_batch_size, *blocks, sender_stream, receiver_stream, sender_helper, receiver_helper, /*build_window=*/false); + + runAndWait(receiver_helper, receiver_stream, sender_helper, sender_stream); + } +} +CATCH +BENCHMARK_REGISTER_F(WindowFunctionBench, partial_sort_skew_dataset) + ->Args({1, 1, 1024 * 10000, 0, 4096}) // Test how much multiple-thread improves performance for partial sort. 
+ ->Args({2, 1, 1024 * 10000, 0, 4096}) + ->Args({4, 1, 1024 * 10000, 0, 4096}) + ->Args({8, 1, 1024 * 10000, 0, 4096}); } // namespace tests } // namespace DB diff --git a/dbms/src/Flash/tests/gtest_executor.cpp b/dbms/src/Flash/tests/gtest_executor.cpp index 64c60f14bb6..d0e7b7e6c67 100644 --- a/dbms/src/Flash/tests/gtest_executor.cpp +++ b/dbms/src/Flash/tests/gtest_executor.cpp @@ -69,18 +69,18 @@ try .filter(eq(col("s1"), col("s2"))) .build(context); { - executeStreams(request, - {toNullableVec({"banana"}), - toNullableVec({"banana"})}); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec({"banana"}), + toNullableVec({"banana"})})); } request = context.receive("exchange1") .filter(eq(col("s1"), col("s2"))) .build(context); { - executeStreams(request, - {toNullableVec({"banana"}), - toNullableVec({"banana"})}); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec({"banana"}), + toNullableVec({"banana"})})); } } CATCH @@ -99,25 +99,23 @@ try " table_scan_0 | {<0, String>, <1, String>}\n" " table_scan_1 | {<0, String>, <1, String>}\n"; ASSERT_DAGREQUEST_EQAUL(expected, request); - executeStreams(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 2); - - executeStreams(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 5); - - executeStreams(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + + ASSERT_COLUMNS_EQ_R(executeStreams(request, 5), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); } request = context .scan("test_db", "l_table") @@ -132,10 +130,9 @@ try " table_scan_0 | {<0, String>, <1, String>}\n" " table_scan_1 | {<0, String>, <1, String>}\n"; ASSERT_DAGREQUEST_EQAUL(expected, request); - executeStreams(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 2); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); } request = context @@ -149,18 +146,16 @@ try " table_scan_0 | {<0, String>, <1, String>}\n" " table_scan_1 | {<0, String>, <1, String>}\n"; ASSERT_DAGREQUEST_EQAUL(expected, request); - executeStreams(request, - {toNullableVec({"banana", "banana", "banana", "banana"}), - toNullableVec({"apple", "apple", "apple", "banana"}), - toNullableVec({"banana", "banana", "banana", {}}), - toNullableVec({"apple", "apple", "apple", {}})}, - 2); - executeStreams(request, - {toNullableVec({"banana", "banana", "banana", "banana"}), - toNullableVec({"apple", "apple", "apple", "banana"}), - toNullableVec({"banana", "banana", "banana", {}}), - toNullableVec({"apple", "apple", "apple", {}})}, - 3); + 
ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana", "banana", "banana"}), + toNullableVec({"apple", "apple", "apple", "banana"}), + toNullableVec({"banana", "banana", "banana", {}}), + toNullableVec({"apple", "apple", "apple", {}})})); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 3), + createColumns({toNullableVec({"banana", "banana", "banana", "banana"}), + toNullableVec({"apple", "apple", "apple", "banana"}), + toNullableVec({"banana", "banana", "banana", {}}), + toNullableVec({"apple", "apple", "apple", {}})})); } } CATCH @@ -179,25 +174,23 @@ try " exchange_receiver_0 | type:PassThrough, {<0, String>, <1, String>}\n" " exchange_receiver_1 | type:PassThrough, {<0, String>, <1, String>}\n"; ASSERT_DAGREQUEST_EQAUL(expected, request); - executeStreams(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 2); - - executeStreams(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 5); - - executeStreams(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + + ASSERT_COLUMNS_EQ_R(executeStreams(request, 5), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); + + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); } } CATCH @@ -216,15 +209,14 @@ try " table_scan_0 | {<0, String>, <1, String>}\n" " exchange_receiver_1 | type:PassThrough, {<0, String>, <1, String>}\n"; ASSERT_DAGREQUEST_EQAUL(expected, request); - executeStreams(request, - {toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"}), - toNullableVec({"banana", "banana"}), - toNullableVec({"apple", "banana"})}, - 2); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), + createColumns({toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"}), + toNullableVec({"banana", "banana"}), + toNullableVec({"apple", "banana"})})); } } CATCH } // namespace tests -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Flash/tests/gtest_interpreter.cpp b/dbms/src/Flash/tests/gtest_interpreter.cpp index ba7d8fd15ee..75a0857465e 100644 --- a/dbms/src/Flash/tests/gtest_interpreter.cpp +++ b/dbms/src/Flash/tests/gtest_interpreter.cpp @@ -31,8 +31,8 @@ class InterpreterExecuteTest : public DB::tests::ExecutorTest context.addMockTable({"test_db", "r_table"}, {{"r_a", TiDB::TP::TypeLong}, {"r_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); context.addMockTable({"test_db", "l_table"}, {{"l_a", TiDB::TP::TypeLong}, {"l_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); context.addExchangeRelationSchema("sender_1", {{"s1", TiDB::TP::TypeString}, {"s2", TiDB::TP::TypeString}, {"s3", TiDB::TP::TypeString}}); - context.addExchangeRelationSchema("sender_l", {{"l_a", TiDB::TP::TypeString}, 
{"l_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); - context.addExchangeRelationSchema("sender_r", {{"r_a", TiDB::TP::TypeString}, {"r_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); + context.addExchangeRelationSchema("sender_l", {{"l_a", TiDB::TP::TypeLong}, {"l_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); + context.addExchangeRelationSchema("sender_r", {{"r_a", TiDB::TP::TypeLong}, {"r_b", TiDB::TP::TypeString}, {"join_c", TiDB::TP::TypeString}}); } }; @@ -96,22 +96,19 @@ try auto request = context.scan("test_db", "test_table_1") .project({"s1", "s2", "s3"}) .project({"s1", "s2"}) - .project("s1") + .project({"s1"}) .build(context); { String expected = R"( Union: Expression x 10: Expression: - Expression: - Expression: - Expression: - Expression: + Expression: + Expression: + Expression: + Expression: Expression: - Expression: - Expression: - Expression: - MockTableScan)"; + MockTableScan)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } @@ -125,16 +122,14 @@ Union: Union: Expression x 10: Expression: - Expression: - SharedQuery: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Expression: - Expression: - MockTableScan)"; + SharedQuery: + Expression: + MergeSorting, limit = 10 + Union: + PartialSorting x 10: limit = 10 + Expression: + Expression: + MockTableScan)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } @@ -150,22 +145,18 @@ Union: Union: Expression x 10: Expression: - Expression: - Expression: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: - Expression: - Expression: - SharedQuery: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Expression: - Expression: - MockTableScan)"; + Expression: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + SharedQuery: + Expression: + MergeSorting, limit = 10 + Union: + PartialSorting x 10: limit = 10 + Expression: + Expression: + MockTableScan)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } @@ -188,97 +179,48 @@ Union: Limit x 10, limit = 10 Expression: Expression: - Expression: - Expression: - Expression: - Filter: - Expression: - Expression: - Expression: - SharedQuery: - ParallelAggregating, max_threads: 10, final: true - Expression x 10: - Expression: - Expression: - SharedQuery: - Expression: - MergeSorting, limit = 10 - Union: - PartialSorting x 10: limit = 10 - Expression: - Expression: - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); - } - - // Join Source. 
- DAGRequestBuilder table1 = context.scan("test_db", "r_table"); - DAGRequestBuilder table2 = context.scan("test_db", "l_table"); - DAGRequestBuilder table3 = context.scan("test_db", "r_table"); - DAGRequestBuilder table4 = context.scan("test_db", "l_table"); - - request = table1.join( - table2.join( - table3.join(table4, - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left) - .build(context); - { - String expected = R"( -CreatingSets - Union: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - MockTableScan - Union x 2: - HashJoinBuildBlockInputStream x 10: , join_kind = Left - Expression: - Expression: - Expression: - HashJoinProbe: Expression: - MockTableScan - Union: - Expression x 10: - Expression: - HashJoinProbe: - Expression: - MockTableScan)"; + Expression: + Filter: + Expression: + Expression: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + SharedQuery: + Expression: + MergeSorting, limit = 10 + Union: + PartialSorting x 10: limit = 10 + Expression: + Expression: + MockTableScan)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } request = context.receive("sender_1") .project({"s1", "s2", "s3"}) .project({"s1", "s2"}) - .project("s1") + .project({"s1"}) .build(context); { String expected = R"( Union: Expression x 10: Expression: - Expression: - Expression: - Expression: - Expression: + Expression: + Expression: + Expression: + Expression: Expression: - Expression: - Expression: - Expression: - MockExchangeReceiver)"; + MockExchangeReceiver)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } request = context.receive("sender_1") .project({"s1", "s2", "s3"}) .project({"s1", "s2"}) - .project("s1") + .project({"s1"}) .exchangeSender(tipb::Broadcast) .build(context); { @@ -287,35 +229,230 @@ Union: MockExchangeSender x 10 Expression: Expression: - Expression: - Expression: - Expression: - Expression: + Expression: + Expression: + Expression: + Expression: Expression: + MockExchangeReceiver)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } +} +CATCH + +TEST_F(InterpreterExecuteTest, Window) +try +{ + auto request = context + .scan("test_db", "test_table") + .sort({{"s1", true}, {"s2", false}}, true) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame()) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + request = context.scan("test_db", "test_table") + .sort({{"s1", true}, {"s2", false}}, true) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame()) + .project({"s1", "s2", "RowNumber()"}) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Expression: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + request = context.scan("test_db", "test_table_1") + .sort({{"s1", true}, {"s2", false}}, true) + .project({"s1", "s2", "s3"}) + 
.window(RowNumber(), {"s1", true}, {"s1", false}, buildDefaultRowsFrame()) + .project({"s1", "s2", "s3", "RowNumber()"}) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Expression: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Union: + Expression x 10: Expression: - Expression: + SharedQuery: Expression: - MockExchangeReceiver)"; + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockTableScan)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } +} +CATCH - // only join + ExchangeReceiver - DAGRequestBuilder receiver1 = context.receive("sender_l"); - DAGRequestBuilder receiver2 = context.receive("sender_r"); - DAGRequestBuilder receiver3 = context.receive("sender_l"); - DAGRequestBuilder receiver4 = context.receive("sender_r"); - - request = receiver1.join( - receiver2.join( - receiver3.join(receiver4, - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left) +TEST_F(InterpreterExecuteTest, FineGrainedShuffle) +try +{ + // fine-grained shuffle is enabled. + const uint64_t enable = 8; + const uint64_t disable = 0; + auto request = context + .receive("sender_1", enable) + .sort({{"s1", true}, {"s2", false}}, true, enable) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame(), enable) + .build(context); + { + String expected = R"( +Union: + Expression x 10: + Expression: + Window: , function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting: , limit = 0 + PartialSorting: : limit = 0 + Expression: + MockExchangeReceiver + )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + auto topn_request = context + .receive("sender_1") + .topN("s2", false, 10) + .build(context); + String topn_expected = R"( +Union: + SharedQuery x 10: + Expression: + MergeSorting, limit = 10 + Union: + PartialSorting x 10: limit = 10 + MockExchangeReceiver + )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(topn_expected, topn_request, 10); + + // fine-grained shuffle is disabled. + request = context + .receive("sender_1", disable) + .sort({{"s1", true}, {"s2", false}}, true, disable) + .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame(), disable) .build(context); { + String expected = R"( +Union: + Expression x 10: + SharedQuery: + Expression: + Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + Expression: + MergeSorting, limit = 0 + Union: + PartialSorting x 10: limit = 0 + Expression: + MockExchangeReceiver + )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + topn_request = context + .receive("sender_1") + .topN("s2", false, 10) + .build(context); + ASSERT_BLOCKINPUTSTREAM_EQAUL(topn_expected, topn_request, 10); +} +CATCH + +TEST_F(InterpreterExecuteTest, Join) +try +{ + // TODO: Find a way to write the request easier. + { + // Join Source. 
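In the `FineGrainedShuffle` expectations above, enabling the feature (`enable = 8`) removes the `Union`/`SharedQuery` wrapper around `MergeSorting`: each receiver stream owns a disjoint hash partition of the window's partition key, so it can sort and apply the window function independently. A conceptual sketch of that partitioning step (the `std::hash` choice and names are illustrative; TiFlash uses its own hash and block layout):

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Rows are hashed on the window partition key into stream_count buckets, so
// each downstream stream sorts and windows its bucket on its own.
std::vector<std::vector<std::string>> fineGrainedShuffle(
    const std::vector<std::string> & partition_keys, uint64_t stream_count)
{
    std::vector<std::vector<std::string>> buckets(stream_count);
    for (const auto & key : partition_keys)
        buckets[std::hash<std::string>{}(key) % stream_count].push_back(key);
    return buckets;
}

int main()
{
    auto buckets = fineGrainedShuffle({"s1-a", "s1-b", "s1-a", "s1-c"}, 2);
    for (size_t i = 0; i < buckets.size(); ++i)
        std::cout << "stream " << i << ": " << buckets[i].size() << " rows\n";
}
```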
+ DAGRequestBuilder table1 = context.scan("test_db", "r_table"); + DAGRequestBuilder table2 = context.scan("test_db", "l_table"); + DAGRequestBuilder table3 = context.scan("test_db", "r_table"); + DAGRequestBuilder table4 = context.scan("test_db", "l_table"); + + auto request = table1.join( + table2.join( + table3.join(table4, + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left) + .build(context); + + String expected = R"( +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + MockTableScan + Union x 2: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + Expression: + HashJoinProbe: + Expression: + MockTableScan + Union: + Expression x 10: + Expression: + HashJoinProbe: + Expression: + MockTableScan)"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + } + + { + // only join + ExchangeReceiver + DAGRequestBuilder receiver1 = context.receive("sender_l"); + DAGRequestBuilder receiver2 = context.receive("sender_r"); + DAGRequestBuilder receiver3 = context.receive("sender_l"); + DAGRequestBuilder receiver4 = context.receive("sender_r"); + + auto request = receiver1.join( + receiver2.join( + receiver3.join(receiver4, + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left) + .build(context); + String expected = R"( CreatingSets Union: @@ -340,24 +477,25 @@ CreatingSets ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } - // join + receiver + sender - // TODO: Find a way to write the request easier. - DAGRequestBuilder receiver5 = context.receive("sender_l"); - DAGRequestBuilder receiver6 = context.receive("sender_r"); - DAGRequestBuilder receiver7 = context.receive("sender_l"); - DAGRequestBuilder receiver8 = context.receive("sender_r"); - request = receiver5.join( - receiver6.join( - receiver7.join(receiver8, - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left), - {col("join_c")}, - ASTTableJoin::Kind::Left) - .exchangeSender(tipb::PassThrough) - .build(context); { + // join + receiver + sender + DAGRequestBuilder receiver1 = context.receive("sender_l"); + DAGRequestBuilder receiver2 = context.receive("sender_r"); + DAGRequestBuilder receiver3 = context.receive("sender_l"); + DAGRequestBuilder receiver4 = context.receive("sender_r"); + + auto request = receiver1.join( + receiver2.join( + receiver3.join(receiver4, + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left), + {col("join_c")}, + ASTTableJoin::Kind::Left) + .exchangeSender(tipb::PassThrough) + .build(context); + String expected = R"( CreatingSets Union: @@ -385,85 +523,111 @@ CreatingSets } CATCH -TEST_F(InterpreterExecuteTest, Window) +TEST_F(InterpreterExecuteTest, JoinThenAgg) try { - auto request = context - .scan("test_db", "test_table") - .sort({{"s1", true}, {"s2", false}}, true) - .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame()) - .build(context); { + // Left Join. 
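Every join plan above pairs a `HashJoinBuildBlockInputStream` under `CreatingSets` (build side) with a `HashJoinProbe` over the other input. Stripped of streams and blocks, the build/probe split of a left join reduces to roughly this sketch:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Build phase fills a hash table (HashJoinBuildBlockInputStream); probe phase
// looks keys up (HashJoinProbe); unmatched probe rows survive a left join.
int main()
{
    std::vector<std::pair<std::string, int>> build = {{"k1", 1}, {"k2", 2}};
    std::vector<std::string> probe = {"k1", "k3"};

    std::unordered_multimap<std::string, int> hash_table;
    for (const auto & [k, v] : build) // build phase
        hash_table.emplace(k, v);

    for (const auto & k : probe) // probe phase
    {
        auto [begin, end] = hash_table.equal_range(k);
        if (begin == end)
            std::cout << k << " -> NULL\n"; // left join keeps unmatched rows
        for (auto it = begin; it != end; ++it)
            std::cout << k << " -> " << it->second << '\n';
    }
}
```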
+ DAGRequestBuilder table1 = context.scan("test_db", "r_table"); + DAGRequestBuilder table2 = context.scan("test_db", "l_table"); + + auto request = table1.join( + table2, + {col("join_c")}, + ASTTableJoin::Kind::Left) + .aggregation({Max(col("r_a"))}, {col("join_c")}) + .build(context); String expected = R"( -Union: - Expression x 10: - SharedQuery: - Expression: - Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} - Expression: - MergeSorting, limit = 0 - Union: - PartialSorting x 10: limit = 0 - Expression: - MockTableScan)"; +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Left + Expression: + Expression: + MockTableScan + Union: + Expression x 10: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + HashJoinProbe: + Expression: + MockTableScan)"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } - request = context.scan("test_db", "test_table") - .sort({{"s1", true}, {"s2", false}}, true) - .window(RowNumber(), {"s1", true}, {"s2", false}, buildDefaultRowsFrame()) - .project({"s1", "s2", "RowNumber()"}) - .build(context); { + // Right Join + DAGRequestBuilder table1 = context.scan("test_db", "r_table"); + DAGRequestBuilder table2 = context.scan("test_db", "l_table"); + + auto request = table1.join( + table2, + {col("join_c")}, + ASTTableJoin::Kind::Right) + .aggregation({Max(col("r_a"))}, {col("join_c")}) + .build(context); String expected = R"( -Union: - Expression x 10: - Expression: - Expression: +CreatingSets + Union: + HashJoinBuildBlockInputStream x 10: , join_kind = Right + Expression: Expression: - SharedQuery: - Expression: - Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} + MockTableScan + Union: + Expression x 10: + SharedQuery: + ParallelAggregating, max_threads: 10, final: true + Expression x 10: + HashJoinProbe: + Expression: Expression: - MergeSorting, limit = 0 - Union: - PartialSorting x 10: limit = 0 - Expression: - MockTableScan)"; + MockTableScan + Expression x 10: + NonJoined: )"; ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); } - request = context.scan("test_db", "test_table_1") - .sort({{"s1", true}, {"s2", false}}, true) - .project({"s1", "s2", "s3"}) - .window(RowNumber(), {"s1", true}, {"s1", false}, buildDefaultRowsFrame()) - .project({"s1", "s2", "s3", "RowNumber()"}) - .build(context); { + // Right join + receiver + sender + DAGRequestBuilder receiver1 = context.receive("sender_l"); + DAGRequestBuilder receiver2 = context.receive("sender_r"); + + auto request = receiver1.join( + receiver2, + {col("join_c")}, + ASTTableJoin::Kind::Right) + .aggregation({Sum(col("r_a"))}, {col("join_c")}) + .exchangeSender(tipb::PassThrough) + .limit(10) + .build(context); String expected = R"( -Union: - Expression x 10: - Expression: - Expression: +CreatingSets + Union: + HashJoinBuildBlockInputStream x 20: , join_kind = Right + Expression: Expression: - SharedQuery: - Expression: - Window, function: {row_number}, frame: {type: Rows, boundary_begin: Current, boundary_end: Current} - Union: - Expression x 10: - Expression: - Expression: - SharedQuery: - Expression: - MergeSorting, limit = 0 - Union: - PartialSorting x 10: limit = 0 - Expression: - MockTableScan)"; - ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 10); + MockExchangeReceiver + Union: + MockExchangeSender x 20 + SharedQuery: + Limit, limit = 10 + Union: + Limit x 20, limit = 10 + Expression: + Expression: + SharedQuery: + 
ParallelAggregating, max_threads: 20, final: true + Expression x 20: + HashJoinProbe: + Expression: + Expression: + MockExchangeReceiver + Expression x 20: + NonJoined: )"; + ASSERT_BLOCKINPUTSTREAM_EQAUL(expected, request, 20); } } CATCH } // namespace tests -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Flash/tests/gtest_limit_executor.cpp b/dbms/src/Flash/tests/gtest_limit_executor.cpp new file mode 100644 index 00000000000..47482540b39 --- /dev/null +++ b/dbms/src/Flash/tests/gtest_limit_executor.cpp @@ -0,0 +1,77 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +namespace tests +{ + +class ExecutorLimitTestRunner : public DB::tests::ExecutorTest +{ +public: + using ColDataType = std::optional::FieldType>; + using ColumnWithData = std::vector; + + void initializeContext() override + { + ExecutorTest::initializeContext(); + + context.addMockTable({db_name, table_name}, + {{col_name, TiDB::TP::TypeString}}, + {toNullableVec(col_name, col0)}); + } + + std::shared_ptr buildDAGRequest(size_t limit_num) + { + return context.scan(db_name, table_name).limit(limit_num).build(context); + } + + /// Prepare some names + const String db_name{"test_db"}; + const String table_name{"projection_test_table"}; + const String col_name{"limit_col"}; + const ColumnWithData col0{"col0-0", {}, "col0-2", "col0-3", {}, "col0-5", "col0-6", "col0-7"}; +}; + +TEST_F(ExecutorLimitTestRunner, Limit) +try +{ + std::shared_ptr request; + ColumnsWithTypeAndName expect_cols; + + /// Check limit result with various parameters + const size_t col_data_num = col0.size(); + for (size_t limit_num = 0; limit_num <= col_data_num + 3; ++limit_num) + { + if (limit_num == col_data_num + 3) + limit_num = INT_MAX; + request = buildDAGRequest(limit_num); + + if (limit_num == 0) + expect_cols = {}; + else if (limit_num > col_data_num) + expect_cols = {toNullableVec(col_name, ColumnWithData(col0.begin(), col0.end()))}; + else + expect_cols = {toNullableVec(col_name, ColumnWithData(col0.begin(), col0.begin() + limit_num))}; + + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols); + } +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/tests/gtest_projection_executor.cpp b/dbms/src/Flash/tests/gtest_projection_executor.cpp new file mode 100644 index 00000000000..8443dedeb49 --- /dev/null +++ b/dbms/src/Flash/tests/gtest_projection_executor.cpp @@ -0,0 +1,224 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
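The new `gtest_limit_executor.cpp` sweeps `limit_num` from 0 past the column size (ending at `INT_MAX`) and derives the expected slice straight from the source data: limit 0 yields no rows, a limit at or above the row count yields everything. The rule it encodes, as a standalone check:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Expected result of LIMIT n over a single source block: the first
// min(n, rows) rows, in source order.
std::vector<int> applyLimit(const std::vector<int> & rows, size_t n)
{
    return {rows.begin(), rows.begin() + std::min(n, rows.size())};
}

int main()
{
    std::vector<int> rows = {1, 2, 3};
    assert(applyLimit(rows, 0).empty());       // limit 0 -> no rows
    assert(applyLimit(rows, 2).size() == 2);   // ordinary prefix
    assert(applyLimit(rows, 100).size() == 3); // capped at the row count
}
```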
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +namespace tests +{ + +class ExecutorProjectionTestRunner : public DB::tests::ExecutorTest +{ +public: + using ColDataString = std::vector::FieldType>>; + using ColDataInt32 = std::vector::FieldType>>; + + void initializeContext() override + { + ExecutorTest::initializeContext(); + + context.addMockTable({db_name, table_name}, + {{col_names[0], TiDB::TP::TypeString}, + {col_names[1], TiDB::TP::TypeString}, + {col_names[2], TiDB::TP::TypeString}, + {col_names[3], TiDB::TP::TypeLong}, + {col_names[4], TiDB::TP::TypeLong}}, + {toNullableVec(col_names[0], col0), + toNullableVec(col_names[1], col1), + toNullableVec(col_names[2], col2), + toNullableVec(col_names[3], col3), + toNullableVec(col_names[4], col4)}); + } + + template + std::shared_ptr buildDAGRequest(T param) + { + return context.scan(db_name, table_name).project(param).build(context); + }; + + void executeWithConcurrency(const std::shared_ptr & request, const ColumnsWithTypeAndName & expect_columns) + { + for (size_t i = 1; i < 10; i += 2) + { + ASSERT_COLUMNS_EQ_UR(executeStreams(request, i), expect_columns); + } + } + + /// Prepare column data + const ColDataString col0{"col0-0", "col0-1", "", "col0-2", {}, "col0-3", ""}; + const ColDataString col1{"col1-0", {}, "", "col1-1", "", "col1-2", "col1-3"}; + const ColDataString col2{"", "col2-0", "col2-1", {}, "col2-3", {}, "col2-4"}; + const ColDataInt32 col3{1, {}, 0, -111111, {}, 0, 9999}; + + /** Each value in col4 should be different from each other so that topn + * could sort the columns into an unique result, or multi-results could + * be right. + */ + const ColDataInt32 col4{0, 5, -123, -234, {}, 24353, 9999}; + + /// Results after sorted by col4 + const ColDataString col0_sorted_asc{{}, "col0-2", "", "col0-0", "col0-1", "", "col0-3"}; + const ColDataString col1_sorted_asc{"", "col1-1", "", "col1-0", {}, "col1-3", "col1-2"}; + const ColDataString col2_sorted_asc{"col2-3", {}, "col2-1", "", "col2-0", "col2-4", {}}; + const ColDataInt32 col3_sorted_asc{{}, -111111, 0, 1, {}, 9999, 0}; + const ColDataInt32 col4_sorted_asc{{}, -234, -123, 0, 5, 9999, 24353}; + + /// Prepare some names + std::vector col_names{"col0", "col1", "col2", "col3", "col4"}; + const String db_name{"test_db"}; + const String table_name{"projection_test_table"}; +}; + +TEST_F(ExecutorProjectionTestRunner, Projection) +try +{ + /// Check single column + auto request = buildDAGRequest({col_names[4]}); + executeWithConcurrency(request, {toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Check multi columns + request = buildDAGRequest({col_names[0], col_names[4]}); + executeWithConcurrency(request, + { + toNullableVec(col_names[0], col0_sorted_asc), + toNullableVec(col_names[4], col4_sorted_asc), + }); + + /// Check multi columns + request = buildDAGRequest({col_names[0], col_names[1], col_names[4]}); + executeWithConcurrency(request, + {toNullableVec(col_names[0], col0_sorted_asc), + toNullableVec(col_names[1], col1_sorted_asc), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Check duplicate columns + request = buildDAGRequest({col_names[4], col_names[4], col_names[4]}); + executeWithConcurrency(request, + {toNullableVec(col_names[4], col4_sorted_asc), + toNullableVec(col_names[4], col4_sorted_asc), + toNullableVec(col_names[4], col4_sorted_asc)}); + + { + /// Check large number of columns + const size_t col_num = 100; + MockColumnNameVec 
projection_input; + ColumnsWithTypeAndName columns; + auto expect_column = toNullableVec(col_names[4], col4_sorted_asc); + + for (size_t i = 0; i < col_num; ++i) + { + projection_input.push_back(col_names[4]); + columns.push_back(expect_column); + } + + request = buildDAGRequest(projection_input); + executeWithConcurrency(request, columns); + } +} +CATCH + +TEST_F(ExecutorProjectionTestRunner, ProjectionFunction) +try +{ + std::shared_ptr request; + + /// Test "equal" function + + /// Data type: TypeString + request = buildDAGRequest({eq(col(col_names[0]), col(col_names[0])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 1, 1, 1, 1, 1}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + request = buildDAGRequest({eq(col(col_names[0]), col(col_names[1])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 1, 0, {}, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Data type: TypeLong + request = buildDAGRequest({eq(col(col_names[3]), col(col_names[4])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 0, 0, {}, 1, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + + /// Test "greater" function + + /// Data type: TypeString + request = buildDAGRequest({gt(col(col_names[0]), col(col_names[1])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 0, 0, {}, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + request = buildDAGRequest({gt(col(col_names[1]), col(col_names[0])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 0, 1, {}, 1, 1}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Data type: TypeLong + request = buildDAGRequest({gt(col(col_names[3]), col(col_names[4])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 1, 1, {}, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + request = buildDAGRequest({gt(col(col_names[4]), col(col_names[3])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 0, 0, {}, 0, 1}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + + /// Test "and" function + + /// Data type: TypeString + request = buildDAGRequest({And(col(col_names[0]), col(col_names[0])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 0, 0, 0, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + request = buildDAGRequest({And(col(col_names[0]), col(col_names[1])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({0, 0, 0, 0, 0, 0, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Data type: TypeLong + request = buildDAGRequest({And(col(col_names[3]), col(col_names[4])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 0, 0, {}, 1, 0}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Test "not" function + + /// Data type: TypeString + request = buildDAGRequest({NOT(col(col_names[0])), NOT(col(col_names[1])), NOT(col(col_names[2])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 1, 1, 1, 1, 1, 1}), + toNullableVec({1, 1, 1, 1, {}, 1, 1}), + toNullableVec({1, {}, 1, 1, 1, 1, {}}), + toNullableVec(col_names[4], col4_sorted_asc)}); + + /// Data type: TypeLong + request = buildDAGRequest({NOT(col(col_names[3])), NOT(col(col_names[4])), col(col_names[4])}); + executeWithConcurrency(request, + {toNullableVec({{}, 0, 1, 0, {}, 0, 1}), + toNullableVec({{}, 0, 0, 1, 0, 0, 0}), + 
toNullableVec(col_names[4], col4_sorted_asc)}); + + /// TODO more functions... +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Flash/tests/gtest_topn_executor.cpp b/dbms/src/Flash/tests/gtest_topn_executor.cpp new file mode 100644 index 00000000000..597ac9f279a --- /dev/null +++ b/dbms/src/Flash/tests/gtest_topn_executor.cpp @@ -0,0 +1,221 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +namespace tests +{ + +class ExecutorTopNTestRunner : public DB::tests::ExecutorTest +{ +public: + using ColStringType = std::optional::FieldType>; + using ColInt32Type = std::optional::FieldType>; + using ColumnWithString = std::vector; + using ColumnWithInt32 = std::vector; + + void initializeContext() override + { + ExecutorTest::initializeContext(); + + context.addMockTable({db_name, table_single_name}, + {{single_col_name, TiDB::TP::TypeString}}, + {toNullableVec(single_col_name, col0)}); + + context.addMockTable({db_name, table_name}, + {{col_name[0], TiDB::TP::TypeLong}, + {col_name[1], TiDB::TP::TypeString}, + {col_name[2], TiDB::TP::TypeString}, + {col_name[3], TiDB::TP::TypeLong}}, + {toNullableVec(col_name[0], col_age), + toNullableVec(col_name[1], col_gender), + toNullableVec(col_name[2], col_country), + toNullableVec(col_name[3], c0l_salary)}); + } + + std::shared_ptr buildDAGRequest(const String & table_name, const String & col_name, bool is_desc, int limit_num) + { + return context.scan(db_name, table_name).topN(col_name, is_desc, limit_num).build(context); + } + + std::shared_ptr buildDAGRequest(const String & table_name, MockOrderByItemVec order_by_items, int limit, MockAstVec func_proj_ast = {}, MockColumnNameVec out_proj_ast = {}) + { + if (func_proj_ast.size() == 0) + return context.scan(db_name, table_name).topN(order_by_items, limit).build(context); + else + return context.scan(db_name, table_name).project(func_proj_ast).topN(order_by_items, limit).project(out_proj_ast).build(context); + } + + /// Prepare some names + const String db_name{"test_db"}; + + const String table_single_name{"topn_single_table"}; /// For single column test + const String single_col_name{"single_col"}; + ColumnWithString col0{"col0-0", "col0-1", "col0-2", {}, "col0-4", {}, "col0-6", "col0-7"}; + + const String table_name{"clerk"}; + const std::vector col_name{"age", "gender", "country", "salary"}; + ColumnWithInt32 col_age{{}, 27, 32, 36, {}, 34}; + ColumnWithString col_gender{"female", "female", "male", "female", "male", "male"}; + ColumnWithString col_country{"korea", "usa", "usa", "china", "china", "china"}; + ColumnWithInt32 c0l_salary{1300, 0, {}, 900, {}, -300}; +}; + +TEST_F(ExecutorTopNTestRunner, TopN) +try +{ + std::shared_ptr request; + std::vector expect_cols; + + { + /// Test single column + size_t col_data_num = col0.size(); + for (size_t i = 1; i <= 1; ++i) + { + bool is_desc; + is_desc = static_cast(i); /// Set descent or ascent + if (is_desc) + sort(col0.begin(), col0.end(), 
std::greater()); /// Sort col0 for the following comparison + else + sort(col0.begin(), col0.end()); + + for (size_t limit_num = 0; limit_num <= col_data_num + 5; ++limit_num) + { + request = buildDAGRequest(table_single_name, single_col_name, is_desc, limit_num); + + expect_cols.clear(); + if (limit_num == 0 || limit_num > col_data_num) + expect_cols.push_back({toNullableVec(single_col_name, ColumnWithString(col0.begin(), col0.end()))}); + else + expect_cols.push_back({toNullableVec(single_col_name, ColumnWithString(col0.begin(), col0.begin() + limit_num))}); + + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[0]); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 2), expect_cols[0]); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 4), expect_cols[0]); + ASSERT_COLUMNS_EQ_R(executeStreams(request, 8), expect_cols[0]); + } + } + } + + { + /// Test multi-columns + expect_cols = {{toNullableVec(col_name[0], ColumnWithInt32{36, 34, 32, 27, {}, {}}), + toNullableVec(col_name[1], ColumnWithString{"female", "male", "male", "female", "male", "female"}), + toNullableVec(col_name[2], ColumnWithString{"china", "china", "usa", "usa", "china", "korea"}), + toNullableVec(col_name[3], ColumnWithInt32{900, -300, {}, 0, {}, 1300})}, + {toNullableVec(col_name[0], ColumnWithInt32{32, {}, 34, 27, 36, {}}), + toNullableVec(col_name[1], ColumnWithString{"male", "male", "male", "female", "female", "female"}), + toNullableVec(col_name[2], ColumnWithString{"usa", "china", "china", "usa", "china", "korea"}), + toNullableVec(col_name[3], ColumnWithInt32{{}, {}, -300, 0, 900, 1300})}, + {toNullableVec(col_name[0], ColumnWithInt32{34, {}, 32, 36, {}, 27}), + toNullableVec(col_name[1], ColumnWithString{"male", "male", "male", "female", "female", "female"}), + toNullableVec(col_name[2], ColumnWithString{"china", "china", "usa", "china", "korea", "usa"}), + toNullableVec(col_name[3], ColumnWithInt32{-300, {}, {}, 900, 1300, 0})}}; + + std::vector order_by_items{ + /// select * from clerk order by age DESC, gender DESC; + {MockOrderByItem(col_name[0], true), MockOrderByItem(col_name[1], true)}, + /// select * from clerk order by gender DESC, salary ASC; + {MockOrderByItem(col_name[1], true), MockOrderByItem(col_name[3], false)}, + /// select * from clerk order by gender DESC, country ASC, salary DESC; + {MockOrderByItem(col_name[1], true), MockOrderByItem(col_name[2], false), MockOrderByItem(col_name[3], true)}}; + + size_t test_num = expect_cols.size(); + + for (size_t i = 0; i < test_num; ++i) + { + request = buildDAGRequest(table_name, order_by_items[i], 100); + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[i]); + } + } +} +CATCH + +TEST_F(ExecutorTopNTestRunner, TopNFunction) +try +{ + std::shared_ptr request; + std::vector expect_cols; + MockColumnNameVec output_projection{col_name[0], col_name[1], col_name[2], col_name[3]}; + MockAstVec func_projection; // Do function operation for topn + MockOrderByItemVec order_by_items; + ASTPtr col0_ast = col(col_name[0]); + ASTPtr col1_ast = col(col_name[1]); + ASTPtr col2_ast = col(col_name[2]); + ASTPtr col3_ast = col(col_name[3]); + ASTPtr func_ast; + + { + /// "and" function + expect_cols = {{toNullableVec(col_name[0], ColumnWithInt32{{}, {}, 32, 27, 36, 34}), + toNullableVec(col_name[1], ColumnWithString{"female", "male", "male", "female", "female", "male"}), + toNullableVec(col_name[2], ColumnWithString{"korea", "china", "usa", "usa", "china", "china"}), + toNullableVec(col_name[3], ColumnWithInt32{1300, {}, {}, 0, 900, -300})}}; + + { + /// select * from 
clerk order by age and salary ASC limit 100; + order_by_items = {MockOrderByItem("and(age, salary)", false)}; + func_ast = And(col(col_name[0]), col(col_name[3])); + func_projection = {col0_ast, col1_ast, col2_ast, col3_ast, func_ast}; + + request = buildDAGRequest(table_name, order_by_items, 100, func_projection, output_projection); + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[0]); + } + } + + { + /// "equal" function + expect_cols = {{toNullableVec(col_name[0], ColumnWithInt32{27, 36, 34, 32, {}, {}}), + toNullableVec(col_name[1], ColumnWithString{"female", "female", "male", "male", "female", "male"}), + toNullableVec(col_name[2], ColumnWithString{"usa", "china", "china", "usa", "korea", "china"}), + toNullableVec(col_name[3], ColumnWithInt32{0, 900, -300, {}, 1300, {}})}}; + + { + /// select age, salary from clerk order by age = salary DESC limit 100; + order_by_items = {MockOrderByItem("equals(age, salary)", true)}; + func_ast = eq(col(col_name[0]), col(col_name[3])); + func_projection = {col0_ast, col1_ast, col2_ast, col3_ast, func_ast}; + + request = buildDAGRequest(table_name, order_by_items, 100, func_projection, output_projection); + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[0]); + } + } + + { + /// "greater" function + expect_cols = {{toNullableVec(col_name[0], ColumnWithInt32{{}, 32, {}, 36, 27, 34}), + toNullableVec(col_name[1], ColumnWithString{"female", "male", "male", "female", "female", "male"}), + toNullableVec(col_name[2], ColumnWithString{"korea", "usa", "china", "china", "usa", "china"}), + toNullableVec(col_name[3], ColumnWithInt32{1300, {}, {}, 900, 0, -300})}}; + + { + /// select age, gender, country, salary from clerk order by age > salary ASC limit 100; + order_by_items = {MockOrderByItem("greater(age, salary)", false)}; + func_ast = gt(col(col_name[0]), col(col_name[3])); + func_projection = {col0_ast, col1_ast, col2_ast, col3_ast, func_ast}; + + request = buildDAGRequest(table_name, order_by_items, 100, func_projection, output_projection); + ASSERT_COLUMNS_EQ_R(executeStreams(request), expect_cols[0]); + } + } + + /// TODO more functions... +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Functions/CollationOperatorOptimized.h b/dbms/src/Functions/CollationOperatorOptimized.h new file mode 100644 index 00000000000..395ecc5b9eb --- /dev/null +++ b/dbms/src/Functions/CollationOperatorOptimized.h @@ -0,0 +1,210 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ + +template +ALWAYS_INLINE inline int signum(T val) +{ + return (0 < val) - (val < 0); +} + +// Check equality is much faster than other comparison. +// - check size first +// - return 0 if equal else 1 +__attribute__((flatten, always_inline, pure)) inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) +{ + return StringRef(lhs) == StringRef(rhs) ? 
0 : 1; +} + +// Compare str view by memcmp +__attribute__((flatten, always_inline, pure)) inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) +{ + return signum(v1.compare(v2)); +} + +constexpr char SPACE = ' '; + +// Remove tail space +__attribute__((flatten, always_inline, pure)) inline std::string_view RightTrim(const std::string_view & v) +{ + if (likely(v.empty() || v.back() != SPACE)) + return v; + size_t end = v.find_last_not_of(SPACE); + return end == std::string_view::npos ? std::string_view{} : std::string_view(v.data(), end + 1); +} + +__attribute__((flatten, always_inline, pure)) inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) +{ + return RawStrCompare(RightTrim(va), RightTrim(vb)); +} + +// If true, only need to check equal or not. +template +struct IsEqualRelated +{ + static constexpr const bool value = false; +}; + +// For `EqualsOp` and `NotEqualsOp`, value is true. +template +struct IsEqualRelated> +{ + static constexpr const bool value = true; +}; +template +struct IsEqualRelated> +{ + static constexpr const bool value = true; +}; + +// Loop columns and invoke callback for each pair. +template +__attribute__((flatten, always_inline)) inline void LoopTwoColumns( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + const ColumnString::Chars_t & b_data, + const ColumnString::Offsets & b_offsets, + size_t size, + F && func) +{ + for (size_t i = 0; i < size; ++i) + { + size_t a_size = StringUtil::sizeAt(a_offsets, i) - 1; + size_t b_size = StringUtil::sizeAt(b_offsets, i) - 1; + const auto * a_ptr = reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]); + const auto * b_ptr = reinterpret_cast(&b_data[StringUtil::offsetAt(b_offsets, i)]); + + func({a_ptr, a_size}, {b_ptr, b_size}, i); + } +} + +// Loop one column and invoke callback for each pair. +template +__attribute__((flatten, always_inline)) inline void LoopOneColumn( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + size_t size, + F && func) +{ + for (size_t i = 0; i < size; ++i) + { + size_t a_size = StringUtil::sizeAt(a_offsets, i) - 1; + const auto * a_ptr = reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]); + + func({a_ptr, a_size}, i); + } +} + +// Handle str-column compare str-column. +// - Optimize UTF8_BIN and UTF8MB4_BIN +// - Check if columns do NOT contain tail space +// - If Op is `EqualsOp` or `NotEqualsOp`, optimize comparison by faster way +template +ALWAYS_INLINE inline bool StringVectorStringVector( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + const ColumnString::Chars_t & b_data, + const ColumnString::Offsets & b_offsets, + const TiDB::TiDBCollatorPtr & collator, + Result & c) +{ + bool use_optimized_path = false; + + switch (collator->getCollatorId()) + { + case TiDB::ITiDBCollator::UTF8MB4_BIN: + case TiDB::ITiDBCollator::UTF8_BIN: + { + size_t size = a_offsets.size(); + + LoopTwoColumns(a_data, a_offsets, b_data, b_offsets, size, [&c](const std::string_view & va, const std::string_view & vb, size_t i) { + if constexpr (IsEqualRelated::value) + { + c[i] = Op::apply(RawStrEqualCompare(RightTrim(va), RightTrim(vb)), 0); + } + else + { + c[i] = Op::apply(RtrimStrCompare(va, vb), 0); + } + }); + + use_optimized_path = true; + + break; + } + default: + break; + } + return use_optimized_path; +} + +// Handle str-column compare const-str. 
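The `UTF8_BIN`/`UTF8MB4_BIN` fast path implements PAD SPACE semantics: trailing spaces are insignificant, after which comparison is plain bytewise; equality additionally short-circuits on length, which is why `EqualsOp`/`NotEqualsOp` get the separate `RawStrEqualCompare` route. Distilled to standard C++ (a local `rightTrim` stand-in for the header's `RightTrim`):

```cpp
#include <cassert>
#include <string_view>

// Trailing spaces are insignificant under the *_BIN collations;
// everything else is ordinary bytewise comparison.
std::string_view rightTrim(std::string_view v)
{
    size_t end = v.find_last_not_of(' ');
    return end == std::string_view::npos ? std::string_view{} : v.substr(0, end + 1);
}

int main()
{
    assert(rightTrim("abc  ") == rightTrim("abc")); // 'abc  ' = 'abc'
    assert(rightTrim("ab ") < rightTrim("abc"));    // then plain byte order
}
```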
+// - Optimize UTF8_BIN and UTF8MB4_BIN +// - Right trim const-str first +// - Check if column does NOT contain tail space +// - If Op is `EqualsOp` or `NotEqualsOp`, optimize comparison by faster way +template +ALWAYS_INLINE inline bool StringVectorConstant( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + const std::string_view & b, + const TiDB::TiDBCollatorPtr & collator, + Result & c) +{ + bool use_optimized_path = false; + + switch (collator->getCollatorId()) + { + case TiDB::ITiDBCollator::UTF8MB4_BIN: + case TiDB::ITiDBCollator::UTF8_BIN: + { + size_t size = a_offsets.size(); + + std::string_view tar_str_view = RightTrim(b); // right trim const-str first + + LoopOneColumn(a_data, a_offsets, size, [&c, &tar_str_view](const std::string_view & view, size_t i) { + if constexpr (IsEqualRelated::value) + { + c[i] = Op::apply(RawStrEqualCompare(RightTrim(view), tar_str_view), 0); + } + else + { + c[i] = Op::apply(RawStrCompare(RightTrim(view), tar_str_view), 0); + } + }); + + use_optimized_path = true; + break; + } + default: + break; + } + return use_optimized_path; +} + +} // namespace DB diff --git a/dbms/src/Functions/FunctionsComparison.h b/dbms/src/Functions/FunctionsComparison.h index 1c63a286452..8f7502fba85 100644 --- a/dbms/src/Functions/FunctionsComparison.h +++ b/dbms/src/Functions/FunctionsComparison.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -301,6 +302,12 @@ struct StringComparisonWithCollatorImpl const TiDB::TiDBCollatorPtr & collator, PaddedPODArray & c) { + bool optimized_path = StringVectorStringVector(a_data, a_offsets, b_data, b_offsets, collator, c); + if (optimized_path) + { + return; + } + size_t size = a_offsets.size(); for (size_t i = 0; i < size; ++i) @@ -317,10 +324,17 @@ struct StringComparisonWithCollatorImpl static void NO_INLINE stringVectorConstant( const ColumnString::Chars_t & a_data, const ColumnString::Offsets & a_offsets, - const std::string & b, + const std::string_view & b, const TiDB::TiDBCollatorPtr & collator, PaddedPODArray & c) { + bool optimized_path = StringVectorConstant(a_data, a_offsets, b, collator, c); + + if (optimized_path) + { + return; + } + size_t size = a_offsets.size(); ColumnString::Offset b_size = b.size(); const char * b_data = reinterpret_cast(b.data()); @@ -332,7 +346,7 @@ struct StringComparisonWithCollatorImpl } static void constantStringVector( - const std::string & a, + const std::string_view & a, const ColumnString::Chars_t & b_data, const ColumnString::Offsets & b_offsets, const TiDB::TiDBCollatorPtr & collator, @@ -342,8 +356,8 @@ struct StringComparisonWithCollatorImpl } static void constantConstant( - const std::string & a, - const std::string & b, + const std::string_view & a, + const std::string_view & b, const TiDB::TiDBCollatorPtr & collator, ResultType & c) { @@ -706,6 +720,25 @@ class FunctionComparison : public IFunction } } + static inline std::string_view genConstStrRef(const ColumnConst * c0_const) + { + std::string_view c0_const_str_ref{}; + if (c0_const) + { + if (const auto * c0_const_string = checkAndGetColumn(&c0_const->getDataColumn()); c0_const_string) + { + c0_const_str_ref = std::string_view(c0_const_string->getDataAt(0)); + } + else if (const auto * c0_const_fixed_string = checkAndGetColumn(&c0_const->getDataColumn()); c0_const_fixed_string) + { + c0_const_str_ref = std::string_view(c0_const_fixed_string->getDataAt(0)); + } + else + throw Exception("Logical error: ColumnConst contains not String nor FixedString 
column", ErrorCodes::ILLEGAL_COLUMN); + } + return c0_const_str_ref; + } + template bool executeStringWithCollator( Block & block, @@ -720,10 +753,13 @@ class FunctionComparison : public IFunction using ResultType = typename ResultColumnType::value_type; using StringImpl = StringComparisonWithCollatorImpl, ResultType>; + std::string_view c0_const_str_ref = genConstStrRef(c0_const); + std::string_view c1_const_str_ref = genConstStrRef(c1_const); + if (c0_const && c1_const) { ResultType res = 0; - StringImpl::constantConstant(c0_const->getValue(), c1_const->getValue(), collator, res); + StringImpl::constantConstant(c0_const_str_ref, c1_const_str_ref, collator, res); block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(c0_const->size(), toField(res)); return true; } @@ -745,12 +781,12 @@ class FunctionComparison : public IFunction StringImpl::stringVectorConstant( c0_string->getChars(), c0_string->getOffsets(), - c1_const->getValue(), + c1_const_str_ref, collator, c_res->getData()); else if (c0_const && c1_string) StringImpl::constantStringVector( - c0_const->getValue(), + c0_const_str_ref, c1_string->getChars(), c1_string->getOffsets(), collator, @@ -770,8 +806,8 @@ class FunctionComparison : public IFunction template bool executeString(Block & block, size_t result, const IColumn * c0, const IColumn * c1) const { - const ColumnString * c0_string = checkAndGetColumn(c0); - const ColumnString * c1_string = checkAndGetColumn(c1); + const auto * c0_string = checkAndGetColumn(c0); + const auto * c1_string = checkAndGetColumn(c1); const ColumnConst * c0_const = checkAndGetColumnConstStringOrFixedString(c0); const ColumnConst * c1_const = checkAndGetColumnConstStringOrFixedString(c1); diff --git a/dbms/src/Functions/FunctionsConversion.cpp b/dbms/src/Functions/FunctionsConversion.cpp index 118574ed33d..0446f76bd51 100644 --- a/dbms/src/Functions/FunctionsConversion.cpp +++ b/dbms/src/Functions/FunctionsConversion.cpp @@ -240,6 +240,7 @@ void registerFunctionsConversion(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); factory.registerFunction>(); factory.registerFunction>(); factory.registerFunction>(); diff --git a/dbms/src/Functions/FunctionsConversion.h b/dbms/src/Functions/FunctionsConversion.h index ddf64a70ca1..e8333ceeeea 100644 --- a/dbms/src/Functions/FunctionsConversion.h +++ b/dbms/src/Functions/FunctionsConversion.h @@ -1751,6 +1751,120 @@ class FunctionDateFormat : public IFunction } }; +class FunctionGetFormat : public IFunction +{ +private: + static String get_format(const StringRef & time_type, const StringRef & location) + { + if (time_type == "DATE") + { + if (location == "USA") + return "%m.%d.%Y"; + else if (location == "JIS") + return "%Y-%m-%d"; + else if (location == "ISO") + return "%Y-%m-%d"; + else if (location == "EUR") + return "%d.%m.%Y"; + else if (location == "INTERNAL") + return "%Y%m%d"; + } + else if (time_type == "DATETIME" || time_type == "TIMESTAMP") + { + if (location == "USA") + return "%Y-%m-%d %H.%i.%s"; + else if (location == "JIS") + return "%Y-%m-%d %H:%i:%s"; + else if (location == "ISO") + return "%Y-%m-%d %H:%i:%s"; + else if (location == "EUR") + return "%Y-%m-%d %H.%i.%s"; + else if (location == "INTERNAL") + return "%Y%m%d%H%i%s"; + } + else if (time_type == "TIME") + { + if (location == "USA") + return "%h:%i:%s %p"; + else if (location == "JIS") + return "%H:%i:%s"; + else if (location == "ISO") + return "%H:%i:%s"; + else if (location == "EUR") + 
return "%H.%i.%s"; + else if (location == "INTERNAL") + return "%H%i%s"; + } + return ""; + } + +public: + static constexpr auto name = "getFormat"; + static FunctionPtr create(const Context &) { return std::make_shared(); }; + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (!arguments[0].type->isString()) + throw Exception("First argument for function " + getName() + " must be String", ErrorCodes::ILLEGAL_COLUMN); + if (!arguments[1].type->isString()) + throw Exception("Second argument for function " + getName() + " must be String", ErrorCodes::ILLEGAL_COLUMN); + + return std::make_shared(); + } + + bool useDefaultImplementationForConstants() const override { return true; } + + /** + * @brief The first argument is designed as a MySQL reserved word. You would encounter a syntax error when wrap it around with quote in SQL. + * For example, select GET_FORMAT("DATE", "USA") will fail. Removing the quote can solve the problem. + * Thus the first argument should always be a ColumnConst. See details in the link below: + * https://dev.mysql.com/doc/refman/5.7/en/date-and-time-functions.html#function_get-format + * + * @return ColumnNumbers + */ + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; } + + void executeImpl(Block & block, const ColumnNumbers & arguments, const size_t result) const override + { + const auto * location_col = checkAndGetColumn(block.getByPosition(arguments[1]).column.get()); + assert(location_col); + size_t size = location_col->size(); + const auto & time_type_col = block.getByPosition(arguments[0]).column; + auto col_to = ColumnString::create(); + + if (time_type_col->isColumnConst()) + { + const auto & time_type_col_const = checkAndGetColumnConst(time_type_col.get()); + const auto & time_type = time_type_col_const->getValue(); + + ColumnString::Chars_t & data_to = col_to->getChars(); + ColumnString::Offsets & offsets_to = col_to->getOffsets(); + auto max_length = 18; + data_to.resize(size * max_length); + offsets_to.resize(size); + WriteBufferFromVector write_buffer(data_to); + for (size_t i = 0; i < size; ++i) + { + const auto & location = location_col->getDataAt(i); + const auto & result = get_format(StringRef(time_type), location); + write_buffer.write(result.c_str(), result.size()); + writeChar(0, write_buffer); + offsets_to[i] = write_buffer.count(); + } + data_to.resize(write_buffer.count()); + block.getByPosition(result).column = std::move(col_to); + } + else + { + throw Exception("First argument for function " + getName() + " must be String constant", ErrorCodes::ILLEGAL_COLUMN); + } + } +}; + struct NameStrToDateDate { static constexpr auto name = "strToDateDate"; diff --git a/dbms/src/Functions/FunctionsDuration.cpp b/dbms/src/Functions/FunctionsDuration.cpp index ea7b86ac670..9ccafd2794d 100644 --- a/dbms/src/Functions/FunctionsDuration.cpp +++ b/dbms/src/Functions/FunctionsDuration.cpp @@ -97,6 +97,57 @@ void FunctionDurationSplit::executeImpl(Block & block, const ColumnNumbers ErrorCodes::ILLEGAL_COLUMN); }; +template +DataTypePtr FunctionMyDurationToSec::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const +{ + if (!arguments[0].type->isMyTime()) + { + throw Exception( + fmt::format("Illegal type {} of the first argument of function {}", arguments[0].type->getName(), getName()), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + return std::make_shared(); 
+} + +template +void FunctionMyDurationToSec::executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const +{ + const auto * from_type = checkAndGetDataType(block.getByPosition(arguments[0]).type.get()); + if (from_type == nullptr) + { + throw Exception( + fmt::format( + "Illegal column {} of the first argument of function {}", + block.getByPosition(arguments[0]).column->getName(), + name), + ErrorCodes::ILLEGAL_COLUMN); + } + + using FromFieldType = typename DataTypeMyDuration::FieldType; + const auto * col_from = checkAndGetColumn>(block.getByPosition(arguments[0]).column.get()); + if (col_from != nullptr) + { + const typename ColumnVector::Container & vec_from = col_from->getData(); + const size_t size = vec_from.size(); + auto col_to = ColumnVector::create(size); + typename ColumnVector::Container & vec_to = col_to->getData(); + + for (size_t i = 0; i < size; ++i) + { + MyDuration val(vec_from[i], from_type->getFsp()); + vec_to[i] = Impl::apply(val); + } + block.getByPosition(result).column = std::move(col_to); + } + else + throw Exception( + fmt::format( + "Illegal column {} of the first argument of function {}", + block.getByPosition(arguments[0]).column->getName(), + name), + ErrorCodes::ILLEGAL_COLUMN); +} + struct DurationSplitHourImpl { static constexpr auto name = "hour"; @@ -133,11 +184,27 @@ struct DurationSplitMicroSecondImpl } }; +struct TiDBTimeToSecTransformerImpl +{ + static constexpr auto name = "tidbTimeToSec"; + static Int64 apply(const MyDuration & val) + { + Int64 sign = 1; + if (val.isNeg()) + { + sign = -1; + } + return sign * (val.hours() * 3600 + val.minutes() * 60 + val.seconds()); + } +}; + using FunctionDurationHour = FunctionDurationSplit; using FunctionDurationMinute = FunctionDurationSplit; using FunctionDurationSecond = FunctionDurationSplit; using FunctionDurationMicroSecond = FunctionDurationSplit; +using FunctionToTiDBTimeToSec = FunctionMyDurationToSec; + void registerFunctionsDuration(FunctionFactory & factory) { factory.registerFunction(); @@ -146,5 +213,7 @@ void registerFunctionsDuration(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + + factory.registerFunction(); } } // namespace DB diff --git a/dbms/src/Functions/FunctionsDuration.h b/dbms/src/Functions/FunctionsDuration.h index 4247cde03ff..5bc54d425f4 100644 --- a/dbms/src/Functions/FunctionsDuration.h +++ b/dbms/src/Functions/FunctionsDuration.h @@ -69,4 +69,23 @@ class FunctionDurationSplit : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override; }; +template +class FunctionMyDurationToSec : public IFunction +{ +public: + static constexpr auto name = Impl::name; + + static FunctionPtr create(const Context &) { return std::make_shared(); }; + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override; + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override; +}; + } // namespace DB \ No newline at end of file diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index b9f20e45134..76022b983ad 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -992,7 +992,7 @@ class FunctionStringOrArrayToT : public 
IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const ColumnPtr column = block.getByPosition(arguments[0]).column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnVector::create(); @@ -1002,7 +1002,7 @@ class FunctionStringOrArrayToT : public IFunction block.getByPosition(result).column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column.get())) + else if (const auto * col = checkAndGetColumn(column.get())) { if (Impl::is_fixed_to_constant) { @@ -1022,7 +1022,7 @@ class FunctionStringOrArrayToT : public IFunction block.getByPosition(result).column = std::move(col_res); } } - else if (const ColumnArray * col = checkAndGetColumn(column.get())) + else if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnVector::create(); @@ -1081,13 +1081,13 @@ class FunctionReverse : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const ColumnPtr column = block.getByPosition(arguments[0]).column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnString::create(); ReverseImpl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets()); block.getByPosition(result).column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column.get())) + else if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnFixedString::create(col->getN()); ReverseImpl::vectorFixed(col->getChars(), col->getN(), col_res->getChars()); @@ -1131,7 +1131,7 @@ class FunctionJsonLength : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const ColumnPtr column = block.getByPosition(arguments[0]).column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnUInt64::create(); typename ColumnUInt64::Container & vec_col_res = col_res->getData(); @@ -1232,8 +1232,8 @@ class ConcatImpl : public IFunction const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); const IColumn * c1 = block.getByPosition(arguments[1]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); - const ColumnString * c1_string = checkAndGetColumn(c1); + const auto * c0_string = checkAndGetColumn(c0); + const auto * c1_string = checkAndGetColumn(c1); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); const ColumnConst * c1_const_string = checkAndGetColumnConst(c1); @@ -1552,7 +1552,7 @@ class FunctionSubstring : public IFunction if (number_of_arguments == 3) column_length = block.getByPosition(arguments[2]).column; - const ColumnConst * column_start_const = checkAndGetColumn(column_start.get()); + const auto * column_start_const = checkAndGetColumn(column_start.get()); const ColumnConst * column_length_const = nullptr; if (number_of_arguments == 3) @@ -1572,9 +1572,9 @@ class FunctionSubstring : public IFunction throw Exception("Third argument provided for function substring could not be negative.", ErrorCodes::ARGUMENT_OUT_OF_BOUND); } - if (const ColumnString * col = checkAndGetColumn(column_string.get())) + if (const auto * col = checkAndGetColumn(column_string.get())) executeForSource(column_start, column_length, 
column_start_const, column_length_const, start_value, length_value, block, result, StringSource(*col)); - else if (const ColumnFixedString * col = checkAndGetColumn(column_string.get())) + else if (const auto * col = checkAndGetColumn(column_string.get())) executeForSource(column_start, column_length, column_start_const, column_length_const, start_value, length_value, block, result, FixedStringSource(*col)); else if (const ColumnConst * col = checkAndGetColumnConst(column_string.get())) executeForSource(column_start, column_length, column_start_const, column_length_const, start_value, length_value, block, result, ConstSource(*col)); @@ -1676,7 +1676,7 @@ class FunctionSubstringUTF8 : public IFunction return true; } - const ColumnString * col = checkAndGetColumn(column_string.get()); + const auto * col = checkAndGetColumn(column_string.get()); assert(col); auto col_res = ColumnString::create(); getVectorConstConstFunc(implicit_length, is_positive)(col->getChars(), col->getOffsets(), start_abs, length, col_res->getChars(), col_res->getOffsets()); @@ -1732,7 +1732,7 @@ class FunctionSubstringUTF8 : public IFunction // convert to vector if string is const. ColumnPtr full_column_string = column_string->isColumnConst() ? column_string->convertToFullColumnIfConst() : column_string; - const ColumnString * col = checkAndGetColumn(full_column_string.get()); + const auto * col = checkAndGetColumn(full_column_string.get()); assert(col); auto col_res = ColumnString::create(); if (implicit_length) @@ -1869,7 +1869,7 @@ class FunctionRightUTF8 : public IFunction using LengthFieldType = typename LengthType::FieldType; auto col_res = ColumnString::create(); - if (const ColumnString * col_string = checkAndGetColumn(column_string.get())) + if (const auto * col_string = checkAndGetColumn(column_string.get())) { if (column_length->isColumnConst()) { @@ -1897,7 +1897,7 @@ class FunctionRightUTF8 : public IFunction else if (const ColumnConst * col_const_string = checkAndGetColumnConst(column_string.get())) { // const vector - const ColumnString * col_string_from_const = checkAndGetColumn(col_const_string->getDataColumnPtr().get()); + const auto * col_string_from_const = checkAndGetColumn(col_const_string->getDataColumnPtr().get()); assert(col_string_from_const); // When useDefaultImplementationForConstants is true, string and length are not both constants assert(!column_length->isColumnConst()); @@ -1993,7 +1993,7 @@ class FunctionAppendTrailingCharIfAbsent : public IFunction if (!checkColumnConst(column_char.get())) throw Exception(fmt::format("Second argument of function {} must be a constant string", getName()), ErrorCodes::ILLEGAL_COLUMN); - String trailing_char_str = static_cast(*column_char).getValue(); + auto trailing_char_str = static_cast(*column_char).getValue(); if (trailing_char_str.size() != 1) throw Exception(fmt::format("Second argument of function {} must be a one-character string", getName()), ErrorCodes::BAD_ARGUMENTS); @@ -2101,7 +2101,7 @@ class TrimImpl : public IFunction void executeTrim(Block & block, const ColumnNumbers & arguments, const size_t result) const { const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); + const auto * c0_string = checkAndGetColumn(c0); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); auto c_res = ColumnString::create(); @@ -2121,8 +2121,8 @@ class TrimImpl : public IFunction const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); const IColumn * c1 = 
block.getByPosition(arguments[1]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); - const ColumnString * c1_string = checkAndGetColumn(c1); + const auto * c0_string = checkAndGetColumn(c0); + const auto * c1_string = checkAndGetColumn(c1); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); const ColumnConst * c1_const_string = checkAndGetColumnConst(c1); @@ -2202,7 +2202,7 @@ class TrimUTF8Impl : public IFunction void executeTrim(Block & block, const ColumnNumbers & arguments, const size_t result) const { const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); + const auto * c0_string = checkAndGetColumn(c0); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); auto c_res = ColumnString::create(); @@ -2225,7 +2225,7 @@ class TrimUTF8Impl : public IFunction const IColumn * c0 = block.getByPosition(arguments[0]).column.get(); const IColumn * c1 = block.getByPosition(arguments[1]).column.get(); - const ColumnString * c0_string = checkAndGetColumn(c0); + const auto * c0_string = checkAndGetColumn(c0); const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); const ColumnConst * c1_const_string = checkAndGetColumnConst(c1); const auto * column_trim_string = checkAndGetColumn(c1_const_string->getDataColumnPtr().get()); @@ -2716,7 +2716,7 @@ class FunctionTiDBTrim : public IFunction ColumnPtr & column_data = block.getByPosition(arguments[0]).column; auto res_col = ColumnString::create(); - const ColumnString * data_col = checkAndGetColumn(column_data.get()); + const auto * data_col = checkAndGetColumn(column_data.get()); static constexpr std::string_view default_rem = " "; static const auto * remstr_ptr = reinterpret_cast(default_rem.data()); @@ -2738,25 +2738,25 @@ class FunctionTiDBTrim : public IFunction if (data_const && !remstr_const) { const ColumnConst * data_col = checkAndGetColumnConst(column_data.get()); - const ColumnString * remstr_col = checkAndGetColumn(column_remstr.get()); + const auto * remstr_col = checkAndGetColumn(column_remstr.get()); - const std::string data = data_col->getValue(); + const auto data = data_col->getValue(); const auto * data_ptr = reinterpret_cast(data.c_str()); constVector(is_ltrim, is_rtrim, data_ptr, data.size() + 1, remstr_col->getChars(), remstr_col->getOffsets(), res_col->getChars(), res_col->getOffsets()); } else if (remstr_const && !data_const) { const ColumnConst * remstr_col = checkAndGetColumnConst(column_remstr.get()); - const ColumnString * data_col = checkAndGetColumn(column_data.get()); + const auto * data_col = checkAndGetColumn(column_data.get()); - const std::string remstr = remstr_col->getValue(); + const auto remstr = remstr_col->getValue(); const auto * remstr_ptr = reinterpret_cast(remstr.c_str()); vectorConst(is_ltrim, is_rtrim, data_col->getChars(), data_col->getOffsets(), remstr_ptr, remstr.size() + 1, res_col->getChars(), res_col->getOffsets()); } else { - const ColumnString * data_col = checkAndGetColumn(column_data.get()); - const ColumnString * remstr_col = checkAndGetColumn(column_remstr.get()); + const auto * data_col = checkAndGetColumn(column_data.get()); + const auto * remstr_col = checkAndGetColumn(column_remstr.get()); vectorVector(is_ltrim, is_rtrim, data_col->getChars(), data_col->getOffsets(), remstr_col->getChars(), remstr_col->getOffsets(), res_col->getChars(), res_col->getOffsets()); } @@ -2769,7 +2769,7 @@ class FunctionTiDBTrim : public IFunction ColumnPtr & column_direction = 
block.getByPosition(arguments[2]).column; if (!column_direction->isColumnConst()) throw Exception(fmt::format("3nd argument of function {} must be constant.", getName())); - const ColumnConst * direction_col = checkAndGetColumn(column_direction.get()); + const auto * direction_col = checkAndGetColumn(column_direction.get()); static constexpr Int64 trim_both_default = 0; // trims from both direction by default static constexpr Int64 trim_both = 1; // trims from both direction with explicit notation @@ -2989,7 +2989,7 @@ class TidbPadImpl { continue; } - int32_t len = static_cast(column_length->getInt(i)); + auto len = static_cast(column_length->getInt(i)); if (len <= 0) { len = 0; @@ -3051,7 +3051,7 @@ class TidbPadImpl } else { - const ColumnString * column_string = checkAndGetColumn(column_string_ptr.get()); + const auto * column_string = checkAndGetColumn(column_string_ptr.get()); const ColumnString::Offsets & string_offsets = column_string->getOffsets(); const ColumnString::Chars_t & string_data = column_string->getChars(); @@ -3233,7 +3233,7 @@ class TidbPadImpl return true; } - ColumnString::Offset tmp_target_len = static_cast(target_len); + auto tmp_target_len = static_cast(target_len); ColumnString::Offset per_pad_offset = 0; ColumnString::Offset pad_bytes = 0; ColumnString::Offset left = 0; @@ -3300,7 +3300,7 @@ class TidbPadImpl return true; } - ColumnString::Offset tmp_target_len = static_cast(target_len); + auto tmp_target_len = static_cast(target_len); if (data_len < tmp_target_len) { ColumnString::Offset left = tmp_target_len - data_len; @@ -3421,7 +3421,7 @@ class PadImpl : public IFunction ColumnPtr column_length = block.getByPosition(arguments[1]).column; ColumnPtr column_padding = block.getByPosition(arguments[2]).column; - const ColumnConst * column_length_const = checkAndGetColumn(column_length.get()); + const auto * column_length_const = checkAndGetColumn(column_length.get()); const ColumnConst * column_padding_const = checkAndGetColumnConst(column_padding.get()); Int64 length_value = 0; @@ -3441,7 +3441,7 @@ class PadImpl : public IFunction auto c_res = ColumnString::create(); - if (const ColumnString * col = checkAndGetColumn(column_string.get())) + if (const auto * col = checkAndGetColumn(column_string.get())) pad, StringSink>( StringSource(*col), ConstSource(*column_padding_const), @@ -3548,7 +3548,7 @@ class PadUTF8Impl : public IFunction ColumnPtr column_length = block.getByPosition(arguments[1]).column; ColumnPtr column_padding = block.getByPosition(arguments[2]).column; - const ColumnConst * column_length_const = checkAndGetColumn(column_length.get()); + const auto * column_length_const = checkAndGetColumn(column_length.get()); const ColumnConst * column_padding_const = checkAndGetColumnConst(column_padding.get()); Int64 length_value = 0; @@ -3568,7 +3568,7 @@ class PadUTF8Impl : public IFunction auto c_res = ColumnString::create(); const auto * column_padding_string = checkAndGetColumn(column_padding_const->getDataColumnPtr().get()); - if (const ColumnString * col = checkAndGetColumn(column_string.get())) + if (const auto * col = checkAndGetColumn(column_string.get())) vector(col->getChars(), col->getOffsets(), length_value, column_padding_string->getChars(), column_padding_string->getOffsets(), c_res->getChars(), c_res->getOffsets()); else if (const ColumnConst * col = checkAndGetColumnConst(column_string.get())) { @@ -4114,8 +4114,8 @@ class FunctionASCII : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const 
override { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); - const ColumnConst * c0_const = checkAndGetColumn(c0_col); - const ColumnString * c0_string = checkAndGetColumn(c0_col); + const auto * c0_const = checkAndGetColumn(c0_col); + const auto * c0_string = checkAndGetColumn(c0_col); Field res_field; int val_num = c0_col->size(); @@ -4165,8 +4165,8 @@ class FunctionLength : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); - const ColumnConst * c0_const = checkAndGetColumn(c0_col); - const ColumnString * c0_string = checkAndGetColumn(c0_col); + const auto * c0_const = checkAndGetColumn(c0_col); + const auto * c0_string = checkAndGetColumn(c0_col); Field res_field; int val_num = c0_col->size(); @@ -4215,13 +4215,13 @@ class FunctionPosition : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); - const ColumnConst * c0_const = checkAndGetColumn(c0_col); - const ColumnString * c0_string = checkAndGetColumn(c0_col); + const auto * c0_const = checkAndGetColumn(c0_col); + const auto * c0_string = checkAndGetColumn(c0_col); Field c0_field; const IColumn * c1_col = block.getByPosition(arguments[1]).column.get(); - const ColumnConst * c1_const = checkAndGetColumn(c1_col); - const ColumnString * c1_string = checkAndGetColumn(c1_col); + const auto * c1_const = checkAndGetColumn(c1_col); + const auto * c1_string = checkAndGetColumn(c1_col); Field c1_field; if ((c0_const == nullptr && c0_string == nullptr) || (c1_const == nullptr && c1_string == nullptr)) @@ -4331,7 +4331,7 @@ class FunctionSubStringIndex : public IFunction column_str = column_str->isColumnConst() ? column_str->convertToFullColumnIfConst() : column_str; if (delim_const && count_const) { - const ColumnString * str_col = checkAndGetColumn(column_str.get()); + const auto * str_col = checkAndGetColumn(column_str.get()); const ColumnConst * delim_col = checkAndGetColumnConst(column_delim.get()); const ColumnConst * count_col = checkAndGetColumnConst>(column_count.get()); if (str_col == nullptr || delim_col == nullptr || count_col == nullptr) @@ -4339,7 +4339,7 @@ class FunctionSubStringIndex : public IFunction return false; } auto col_res = ColumnString::create(); - IntType count = count_col->getValue(); + auto count = count_col->getValue(); vectorConstConst( str_col->getChars(), str_col->getOffsets(), @@ -4353,9 +4353,9 @@ class FunctionSubStringIndex : public IFunction { column_delim = column_delim->isColumnConst() ? column_delim->convertToFullColumnIfConst() : column_delim; column_count = column_count->isColumnConst() ? 
column_count->convertToFullColumnIfConst() : column_count; - const ColumnString * str_col = checkAndGetColumn(column_str.get()); - const ColumnString * delim_col = checkAndGetColumn(column_delim.get()); - const ColumnVector * count_col = checkAndGetColumn>(column_count.get()); + const auto * str_col = checkAndGetColumn(column_str.get()); + const auto * delim_col = checkAndGetColumn(column_delim.get()); + const auto * count_col = checkAndGetColumn>(column_count.get()); if (str_col == nullptr || delim_col == nullptr || count_col == nullptr) { return false; @@ -4573,7 +4573,9 @@ class FormatImpl : public IFunction using NumberFieldType = typename NumberType::FieldType; using NumberColVec = std::conditional_t, ColumnDecimal, ColumnVector>; const auto * number_raw = block.getByPosition(arguments[0]).column.get(); + TiDBDecimalRoundInfo info{number_type, number_type}; + info.output_prec = info.output_prec < 65 ? info.output_prec + 1 : 65; return getPrecisionType(precision_base_type, [&](const auto & precision_type, bool) { using PrecisionType = std::decay_t; @@ -4723,10 +4725,11 @@ class FormatImpl : public IFunction static void format( T number, size_t max_num_decimals, - const TiDBDecimalRoundInfo & info, + TiDBDecimalRoundInfo & info, ColumnString::Chars_t & res_data, ColumnString::Offsets & res_offsets) { + info.output_scale = std::min(max_num_decimals, static_cast(info.input_scale)); auto round_number = round(number, max_num_decimals, info); std::string round_number_str = number2Str(round_number, info); std::string buffer = Format::apply(round_number_str, max_num_decimals); @@ -4870,7 +4873,7 @@ class FunctionFormatWithLocale : public IFunction } else { - const String value = locale_const->getValue(); + const auto value = locale_const->getValue(); if (!boost::iequals(value, supported_locale)) { const auto & msg = genWarningMsg(value); diff --git a/dbms/src/Functions/bitShiftRight.cpp b/dbms/src/Functions/bitShiftRight.cpp index 961f7459f68..90b365771de 100644 --- a/dbms/src/Functions/bitShiftRight.cpp +++ b/dbms/src/Functions/bitShiftRight.cpp @@ -13,6 +13,9 @@ // limitations under the License. #include +#include + +#include namespace DB { @@ -29,7 +32,18 @@ struct BitShiftRightImpl template static Result apply(A a, B b) { - return static_cast(a) >> static_cast(b); + // It is an undefined behavior for shift operation in c++ that the right operand is negative or greater than + // or equal to the number of digits of the bits in the (promoted) left operand. + // See https://en.cppreference.com/w/cpp/language/operator_arithmetic for details. + if (static_cast(b) >= std::numeric_limits(a))>::digits) + { + return static_cast(0); + } + // Note that we do not consider the case that the right operand is negative, + // since other types will all be cast to uint64 before shift operation + // according to DAGExpressionAnalyzerHelper::buildBitwiseFunction. + // Therefore, we simply suppress clang-tidy checking here. 
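+            // Illustration, assuming the operands have been cast to UInt64 as described above:
+            // std::numeric_limits<UInt64>::digits is 64, so any shift amount b >= 64 takes this
+            // early-return branch and yields 0 (the TiDB-compatible result) instead of C++ UB.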
+ return static_cast(a) >> static_cast(b); // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult) } template static Result apply(A, B, UInt8 &) @@ -87,4 +101,4 @@ void registerFunctionBitShiftRight(FunctionFactory & factory) factory.registerFunction(); } -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Functions/tests/gtest_bitshiftright.cpp b/dbms/src/Functions/tests/gtest_bitshiftright.cpp new file mode 100644 index 00000000000..a4af6336099 --- /dev/null +++ b/dbms/src/Functions/tests/gtest_bitshiftright.cpp @@ -0,0 +1,273 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +namespace tests +{ +class TestFunctionBitShiftRight : public DB::tests::FunctionTest +{ +}; + +#define ASSERT_BITSHIFTRIGHT(t1, t2, result) \ + ASSERT_COLUMN_EQ(result, executeFunction("bitShiftRight", {t1, t2})) + +TEST_F(TestFunctionBitShiftRight, Simple) +try +{ + ASSERT_BITSHIFTRIGHT(createColumn>({8}), + createColumn>({2}), + createColumn>({2})); +} +CATCH + +/// Note: Only IntX and UIntX will be received by BitShiftRight, others will be casted by TiDB planner. +/// Note: BitShiftRight will further cast other types to UInt64 before doing shift. +TEST_F(TestFunctionBitShiftRight, TypePromotion) +try +{ + // Type Promotion + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({1}), createColumn>({9223372036854775807ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({1}), createColumn>({9223372036854775807ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({1}), createColumn>({9223372036854775807ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({1}), createColumn>({9223372036854775807ull})); + + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + + // Type Promotion across signed/unsigned + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({0}), createColumn>({18446744073709551615ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({-1}), createColumn>({0}), createColumn>({18446744073709551615ull})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn>({0}), createColumn>({1})); +} +CATCH + +TEST_F(TestFunctionBitShiftRight, Nullable) +try +{ + // Non Nullable + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + 
ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn({0}), createColumn({1})); + + // Across Nullable and non-Nullable + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn({1}), createColumn>({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); + ASSERT_BITSHIFTRIGHT(createColumn>({1}), createColumn({0}), createColumn>({1})); +} +CATCH + +TEST_F(TestFunctionBitShiftRight, TypeCastWithConst) +try +{ + /// need test these kinds of columns: + /// 1. ColumnVector + /// 2. ColumnVector + /// 3. ColumnConst + /// 4. ColumnConst, value != null + /// 5. 
ColumnConst, value = null + + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createColumn({0, 1, 0, 1}), createColumn({0, 0, 1, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({0, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createConstColumn(4, 0), createColumn({0, 0, 1, 1})); + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createConstColumn>(4, 0), createColumn({0, 0, 1, 1})); + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1}), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); // become const in wrapInNullable + + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn({0, 1, 0, 1}), createColumn>({0, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({0, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createConstColumn(4, 0), createColumn>({0, 1, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createConstColumn(4, 0), createColumn>({0, 1, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createColumn>({0, 1, std::nullopt, std::nullopt}), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); + + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createColumn({0, 1, 0, 1}), createColumn({1, 0, 1, 0})); + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({1, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createConstColumn(4, 0), createConstColumn(4, 1)); + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createConstColumn>(4, 0), createConstColumn(4, 1)); + ASSERT_BITSHIFTRIGHT(createConstColumn(4, 1), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); + + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createColumn({0, 1, 0, 1}), createColumn({1, 0, 1, 0})); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createColumn>({0, 1, std::nullopt, std::nullopt}), createColumn>({1, 0, std::nullopt, std::nullopt})); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createConstColumn(4, 0), createConstColumn(4, 1)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createConstColumn>(4, 0), createConstColumn(4, 1)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, 1), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); + + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createColumn({0, 1, 0, 1}), createConstColumn>(4, std::nullopt)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createColumn>({0, 1, std::nullopt, std::nullopt}), createConstColumn>(4, std::nullopt)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createConstColumn(4, 0), createConstColumn>(4, std::nullopt)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createConstColumn(4, 0), createConstColumn>(4, std::nullopt)); + ASSERT_BITSHIFTRIGHT(createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt), createConstColumn>(4, std::nullopt)); +} +CATCH + +TEST_F(TestFunctionBitShiftRight, Boundary) +try +{ + ASSERT_BITSHIFTRIGHT(createColumn({127, 127, -128, -128}), createColumn({0, 7, 0, 7}), createColumn({127, 0, 18446744073709551488ull, 144115188075855871ull})); + ASSERT_BITSHIFTRIGHT(createColumn({127, 127, -128, 
-128}), createColumn({0, 7, 0, 7}), createColumn({127, 0, 18446744073709551488ull, 144115188075855871ull})); + ASSERT_BITSHIFTRIGHT(createColumn({32767, 32767, -32768, -32768}), createColumn({0, 15, 0, 15}), createColumn({32767, 0, 18446744073709518848ull, 562949953421311ull})); + + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1, -1, -1, INT64_MAX, INT64_MAX, INT64_MIN, INT64_MIN}), + createColumn({0, 63, 0, 63, 0, 63, 0, 63, 0, 63}), + createColumn({0, 0, 1, 0, 18446744073709551615ull, 1, INT64_MAX, 0, 9223372036854775808ull, 1})); +} +CATCH + +TEST_F(TestFunctionBitShiftRight, UINT64) +try +{ + ASSERT_BITSHIFTRIGHT(createColumn({0, UINT64_MAX}), + createColumn({63, 63}), + createColumn({0, 1})); + + ASSERT_BITSHIFTRIGHT(createColumn>({0, UINT64_MAX, std::nullopt}), + createColumn>({63, 63, 63}), + createColumn>({0, 1, std::nullopt})); + + ASSERT_BITSHIFTRIGHT(createColumn>({0, UINT64_MAX, std::nullopt}), + createColumn({63, 63, 63}), + createColumn>({0, 1, std::nullopt})); + + ASSERT_BITSHIFTRIGHT(createColumn({0, UINT64_MAX}), + createColumn>({63, 63}), + createColumn>({0, 1})); + + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, 1, 1, -1, -1, INT64_MAX, INT64_MAX, INT64_MIN, INT64_MIN}), + createColumn({0, UINT64_MAX, 0, UINT64_MAX, 0, UINT64_MAX, 0, UINT64_MAX, 0, UINT64_MAX}), + createColumn({0, 0, 1, 0, 18446744073709551615ull, 0, INT64_MAX, 0, 9223372036854775808ull, 0})); + + + ASSERT_BITSHIFTRIGHT(createColumn({0, 0, UINT64_MAX, UINT64_MAX}), + createColumn({0, UINT64_MAX, 0, UINT64_MAX}), + createColumn({0, 0, UINT64_MAX, 0})); + + ASSERT_BITSHIFTRIGHT(createColumn>({0, 0, UINT64_MAX, UINT64_MAX, 0, std::nullopt}), + createColumn>({0, UINT64_MAX, 0, UINT64_MAX, std::nullopt, 0}), + createColumn>({0, 0, UINT64_MAX, 0, std::nullopt, std::nullopt})); + + ASSERT_BITSHIFTRIGHT(createColumn>({0, 0, UINT64_MAX, UINT64_MAX, std::nullopt}), + createColumn({0, UINT64_MAX, 0, UINT64_MAX, 0}), + createColumn>({0, 0, UINT64_MAX, 0, std::nullopt})); + + ASSERT_BITSHIFTRIGHT(createColumn({0, UINT64_MAX, 0, UINT64_MAX, 0}), + createColumn>({0, 0, UINT64_MAX, UINT64_MAX, std::nullopt}), + createColumn>({0, UINT64_MAX, 0, 0, std::nullopt})); + + /* + std::mt19937 gen(std::random_device{}()); + std::uniform_int_distribution dis( + std::numeric_limits::min(), + std::numeric_limits::max() + ); + size_t count = 100; + std::vector v1(count), v2(count), res(count); + for (size_t i=0; i> v2[i]; + } + */ + // clang-format off + 
ASSERT_BITSHIFTRIGHT(createColumn({4286230172992429668ull,11550684080080434735ull,775195682263841867ull,18390588538388462661ull,15578761645824658314ull,20662948907547635ull,8403266546632871011ull,10316916867086714284ull,14494183568060929367ull,11741337603037632348ull,10803264694948981380ull,2181969932373516503ull,9673801579564730047ull,12998855911221966916ull,13852157931865274857ull,9203926828777338586ull,8903261359104369984ull,3296258311466476456ull,14658801806079697908ull,7542518003247963618ull,7751150277360944372ull,12225694156629117269ull,3173837214287201256ull,10555082060194839563ull,14202570947308501213ull,13841194359225980123ull,9085267378073816945ull,15975493157631073381ull,1890233386459299033ull,2368634323417847398ull,691423931511513606ull,986000479038857169ull,6676906740954304741ull,2841686799872009560ull,6483676442160212821ull,12550114481083571140ull,1973026146580965947ull,15006687639313690830ull,6443617813685195609ull,13648732879238232658ull,173820604016606515ull,2669428687588070677ull,15361476519767969236ull,8957522718906827285ull,10484385204137290737ull,12390466571993898199ull,13655746682011856065ull,4183302523705398003ull,9898692767945122925ull,16701902679050716746ull,15003324714492513897ull,15554724240808081962ull,7754458312088240871ull,16060968032680196798ull,12619581440986221928ull,15462661961676206824ull,2991773628650321635ull,16341599119345297909ull,14943939970889580769ull,17589764776976679210ull,15274914527536421890ull,16268454608136611433ull,14617646699124891378ull,466927094873143934ull,10558583305251737283ull,255559140356160501ull,5962789691899784330ull,8004603198837555992ull,1881892337023478820ull,6549167700870881840ull,17551996157828573642ull,3349744237253314638ull,2876698686583880568ull,16792783373922568330ull,16231348759981899800ull,17731631990557975899ull,1305376485657663531ull,3568754485566225727ull,10076204423028931225ull,1206238310176455071ull,4297062324543635867ull,5116785256928623516ull,4216305034157620433ull,412817651268481791ull,11256299741838589766ull,10786197076871163667ull,8588357635228913652ull,6361409982074778071ull,4750871994764527580ull,12851835128796581697ull,13871712051825681122ull,12445309465661589227ull,1668617678034382020ull,10152918068481134781ull,16242941973571224246ull,12988338226657152812ull,2352083670492692674ull,10735026236980245779ull,14986388012066843516ull,17651064432466444102ull}), + createColumn({0,58,55,24,5,35,34,54,43,45,17,36,51,54,19,55,55,8,37,49,15,11,36,0,5,41,46,54,2,59,11,25,43,29,31,8,59,2,11,19,56,35,57,13,2,35,6,54,17,0,49,5,15,3,60,44,16,6,57,44,58,54,26,23,58,23,26,29,56,40,45,2,21,9,57,40,4,46,17,15,62,21,5,54,22,47,10,24,53,61,43,52,23,10,61,43,26,31,38,2}), + 
createColumn({4286230172992429668ull,40,21,1096164497041ull,486836301432020572ull,601370,489134489,572,1647797,333708,82422368583289ull,31751841,4296,721,26420894492846ull,255,247,12876009029165923ull,106656820,13398,236546334147978ull,5969577224916561ull,46185410,10555082060194839563ull,443830342103390662ull,6294246,129109,886,472558346614824758ull,4,337609341558356ull,29385104150ull,759076,5293054133ull,3019197118ull,49023884691732699ull,3,3751671909828422707ull,3146297760588474ull,26032891996838ull,2,77690599,106,1093447597522806ull,2621096301034322684ull,360610038,213371041906435251ull,232,75521032470284ull,16701902679050716746ull,26651,486085132525252561ull,236647287356208ull,2007621004085024599ull,10,878950,45650842722325ull,255337486239770279ull,103,999862,52,903,217819909738ull,55662047251ull,36,30465023560ull,88852490364ull,14909735319ull,26,5956433,498857,837436059313328659ull,1371716826717ull,32798405027192516ull,112,16126825,81586030353603970ull,50715,76875338920813ull,36811471868177ull,0,2439873341049ull,131759532317425638ull,22,2683710990390ull,76640,8387068003153235ull,379169582252ull,527,5,1577031,2763,198914727930ull,9914959051251108ull,7,1476603,35048777915ull,4998886136ull,54520161,4412766108116611025ull})); + // clang-format on +} +CATCH + +TEST_F(TestFunctionBitShiftRight, UB) +try +{ + ASSERT_BITSHIFTRIGHT(createColumn({127, -128}), createColumn({64, 64}), createColumn({0, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({127, -128}), createColumn({64, 64}), createColumn({0, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({32767, -32768}), createColumn({64, 64}), createColumn({0, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({INT32_MAX, INT32_MIN}), createColumn({64, 64}), createColumn({0, 0})); + ASSERT_BITSHIFTRIGHT(createColumn({INT64_MAX, INT64_MIN}), createColumn({64, 64}), createColumn({0, 0})); + + ASSERT_BITSHIFTRIGHT(createColumn({255}), createColumn({64}), createColumn({0})); + ASSERT_BITSHIFTRIGHT(createColumn({255}), createColumn({64}), createColumn({0})); + ASSERT_BITSHIFTRIGHT(createColumn({65535}), createColumn({64}), createColumn({0})); + ASSERT_BITSHIFTRIGHT(createColumn({UINT32_MAX}), createColumn({64}), createColumn({0})); + ASSERT_BITSHIFTRIGHT(createColumn({UINT64_MAX}), createColumn({64}), createColumn({0})); + + /* + std::mt19937 gen(std::random_device{}()); + std::uniform_int_distribution dis1( + std::numeric_limits::min(), + std::numeric_limits::max() + ); + std::uniform_int_distribution dis2( + 64, + std::numeric_limits::max() + ); + size_t count = 100; + std::vector v1(count), v2(count), res(count); + for (size_t i=0; 
i({17563387625296433369ull,5842891814427459261ull,15074502074821508463ull,386435802999553003ull,5487893274931198395ull,8125923807366590570ull,13340330062727071249ull,14908193031091561411ull,296805448857369387ull,8684453485792353774ull,13117933444495098288ull,3225762988982100714ull,11290506757949810556ull,14617912756126856962ull,9479575714707174581ull,11720728318194739598ull,14410575429605211363ull,12068356718035872518ull,80682389916710599ull,11003236134534292734ull,4412447398096224810ull,5331184707993902906ull,13827083432789678788ull,958142831027309576ull,16716461997317184701ull,17128750834581527743ull,11590434571174666313ull,10204342520615148287ull,11067791415848657283ull,17583875436196878829ull,186304014359496415ull,9381729025189804702ull,11502205568225715300ull,16472133582690439104ull,3743303387826342067ull,12860029445868505658ull,2244056593742923769ull,3275687468466891223ull,1545828456957460699ull,14187252460708728077ull,7551907967738536187ull,9754400233340010491ull,16293183350230169116ull,6298812696728711031ull,5915538565572009956ull,2284684518775825662ull,1130711226902262476ull,17158957721471765323ull,4220824385439711070ull,16559772875254313109ull,15397179690017513678ull,6300413832999049491ull,13787530251307637715ull,10132349060092695582ull,10446586881482901699ull,15759779838283537085ull,14402587207027333363ull,5546051719872960161ull,6545031029710296628ull,17407295406267098658ull,4259019625544816073ull,791895457880289787ull,8549227257401578066ull,15246278171168501125ull,1674668228908076954ull,849762797502000057ull,13302651500925764574ull,12438174880334092333ull,17701249772557033303ull,10742459186038873636ull,15671491258945407856ull,9352557101631889001ull,8914093883925002585ull,17935292744735591949ull,606989231583658922ull,6528503454270721815ull,14980539549624989095ull,13765196438235456668ull,3058323869228644592ull,14346577759191739044ull,1543206286382906519ull,1025562312317433790ull,17052896445025268012ull,18349597294988935754ull,17174604730104962524ull,11924965352621110201ull,502032511104181724ull,13845633389643139332ull,15436039204445155412ull,17809579006694175565ull,15166364145138562881ull,14062748599121933798ull,1777457178576774356ull,4985224560472716170ull,3881603168175384251ull,11555031280550342082ull,1252677486917153396ull,8744807353133366467ull,2048964426549800495ull,11945831330508218140ull}), + 
createColumn({7570379165150948640ull,2086259313016069849ull,3606689596671293211ull,14039117280692395662ull,13678665403528829741ull,16069000531561010558ull,18229345530821449414ull,433464578739092378ull,6298872104011095934ull,4518228872693063137ull,14988726875963869472ull,9568218424260764817ull,5383191468426384555ull,8698762658876708752ull,9487599666567205013ull,14370091126330876161ull,10702068376663045773ull,8045701071228357739ull,10878469353312437370ull,3183167829827610494ull,5928881618833110378ull,10410530709181481816ull,249988564503361262ull,13482614555530280987ull,5522946068620734806ull,12797173590813112894ull,14133419908717831141ull,10825732602137508628ull,13271177233899692778ull,1157753039017783757ull,3370600557036147696ull,2957689395775524062ull,11963898745206689513ull,4828931188614542720ull,15157289330857160797ull,369467010700905309ull,6278071805692607460ull,17817858137511910604ull,17789013631125929528ull,2861684947245777353ull,2583152408663154190ull,7935135702156687355ull,3033127046167579202ull,14224256960933395097ull,10838403249753694181ull,2154089102842257532ull,7860358918492191001ull,2982010253383852617ull,16385171982396620123ull,12241857497176342828ull,2080931105225959532ull,1046322072991155713ull,6146917059052005252ull,17411786298437646544ull,5497869583209795613ull,11701448129764809247ull,12642962700918363620ull,15936842187305218463ull,7811510447588439153ull,3558405966224377785ull,977960926168429540ull,9505800334935014018ull,12114068456102275321ull,5141880021314950000ull,6719615890604904521ull,1341445859098821585ull,3883912906202435997ull,2107770591867486616ull,2657186337437393032ull,2640917573672927653ull,3746140861437224253ull,15057648507099656234ull,12051189681068107042ull,2259769676757597701ull,2935229535510718769ull,6368233316971463582ull,14384644474340782197ull,2553547617837260603ull,14238122466576902747ull,9555765226032904481ull,15522640015319979866ull,10274396157562093026ull,5996101113505388770ull,16915812546351047056ull,4956089714130804219ull,17126605744801075545ull,12036643325202409080ull,11257234688654558199ull,375338337104024778ull,11152980243617851986ull,12325805905403174063ull,8653948654121626815ull,15348912598299408338ull,6883296938248095081ull,6484642948886870833ull,16936141613107270500ull,17012171815528507292ull,2574129622316042070ull,17178726110735453748ull,16578303277501346489ull}), + createColumn({0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0})); + // clang-format on +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Functions/tests/gtest_duration_pushdown.cpp b/dbms/src/Functions/tests/gtest_duration_pushdown.cpp index 4501a4c9fae..106f3d84642 100644 --- a/dbms/src/Functions/tests/gtest_duration_pushdown.cpp +++ b/dbms/src/Functions/tests/gtest_duration_pushdown.cpp @@ -166,5 +166,85 @@ try ASSERT_COLUMN_EQ(microSecond_out, executeFunction("microSecond", input4)); } CATCH + +TEST_F(DurationPushDown, timeToSecPushDownTest) +try +{ + ColumnWithTypeAndName input( + createColumn>({(838 * 3600 + 59 * 60 + 59) * 1000000000L + 999999000L, + -(838 * 3600 + 59 * 60 + 59) * 1000000000L - 123456000L, + 0, + (1 * 3600 + 2 * 60 + 3) * 1000000000L + 4000L}) + .column, + makeNullable(std::make_shared(6)), + "input"); + auto second_output = createColumn>({3020399, -3020399, 0, 3723}); + ASSERT_COLUMN_EQ(second_output, executeFunction("tidbTimeToSec", input)); + + // Test Overflow + 
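+    // 838:59:59.999999 is the MyDuration maximum: (838 * 3600 + 59 * 60 + 59) = 3020399 seconds,
+    // and 3020399 * 10^9 + 999999000 = 3020399999999000 nanoseconds, matching the bound in the
+    // expected exception message; one more microsecond exceeds it and must throw.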
ColumnWithTypeAndName input2( + createColumn>({(838 * 3600 + 59 * 60 + 59) * 1000000000L + 999999000L + 1000L}).column, + makeNullable(std::make_shared(6)), + "result"); + try + { + auto result = executeFunction("tidbTimeToSec", input2); + FAIL() << "Expected overflow"; + } + catch (DB::Exception & e) + { + ASSERT_EQ(e.message(), std::string("nanos must >= -3020399999999000 and <= 3020399999999000")); + } + catch (...) + { + FAIL() << "Expected overflow"; + }; + + ColumnWithTypeAndName input3( + createColumn>({-(838 * 3600 + 59 * 60 + 59) * 1000000000L - 999999000L - 1000L}).column, + makeNullable(std::make_shared(6)), + "result"); + try + { + auto result = executeFunction("tidbTimeToSec", input3); + FAIL() << "Expected overflow"; + } + catch (DB::Exception & e) + { + ASSERT_EQ(e.message(), std::string("nanos must >= -3020399999999000 and <= 3020399999999000")); + } + catch (...) + { + FAIL() << "Expected overflow"; + }; + + // Random Test + constexpr int rowNum = 1000; + auto dur_column = ColumnVector::create(); + auto & dur_data = dur_column->getData(); + auto second_column = ColumnVector::create(); + auto & second_data = second_column->getData(); + dur_data.resize(rowNum); + second_data.resize(rowNum); + + std::random_device rd; + std::default_random_engine gen = std::default_random_engine(rd()); + std::uniform_int_distribution sign_dis(0, 1), hour_dis(0, 838), minute_dis(0, 59), second_dis(0, 59), microSecond_dis(0, 999999); + for (int i = 0; i < rowNum; ++i) + { + auto sign = (sign_dis(gen) == 0) ? 1 : -1; + auto hour = hour_dis(gen); + auto minute = minute_dis(gen); + auto second = second_dis(gen); + auto microSecond = microSecond_dis(gen); + dur_data[i] = sign * ((hour * 3600 + minute * 60 + second) * 1000000000L + microSecond * 1000L); + second_data[i] = sign * (hour * 3600 + minute * 60 + second); + } + + ColumnWithTypeAndName input4(std::move(dur_column), std::make_shared(6), "duration"); + ColumnWithTypeAndName second_out(std::move(second_column), std::make_shared(), "time_to_sec"); + ASSERT_COLUMN_EQ(second_out, executeFunction("tidbTimeToSec", input4)); +} +CATCH } // namespace tests } // namespace DB \ No newline at end of file diff --git a/dbms/src/Functions/tests/gtest_get_format.cpp b/dbms/src/Functions/tests/gtest_get_format.cpp new file mode 100644 index 00000000000..61a8d80e7b4 --- /dev/null +++ b/dbms/src/Functions/tests/gtest_get_format.cpp @@ -0,0 +1,153 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
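+
+// Unit tests for the getFormat function (MySQL GET_FORMAT pushdown): boundary cases for
+// const/vector argument combinations, plus every supported time_type/location pair.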
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#include + +#pragma GCC diagnostic pop + +namespace DB::tests +{ +class GetFormatTest : public DB::tests::FunctionTest +{ +public: + static constexpr auto funcName = "getFormat"; +}; + +TEST_F(GetFormatTest, testBoundary) +try +{ + // const(non-null), vector + // time_type is a const with non null value + // location is a vector containing null + ASSERT_COLUMN_EQ( + createColumn>({"%m.%d.%Y", {}}), + executeFunction( + funcName, + createConstColumn>(2, "DATE"), + createColumn>({"USA", {}}))); + + // const(null), vector + // time_type is a const with null value + // location is a vector containing null + ASSERT_COLUMN_EQ( + createConstColumn>(2, {}), + executeFunction( + funcName, + createConstColumn>(2, {}), + createColumn>({"USA", {}}))); + + // const(non-null), const(non-null) + // time_type is a const with non null value + // location is a const with non null value + ASSERT_COLUMN_EQ( + createConstColumn(2, "%m.%d.%Y"), + executeFunction( + funcName, + createConstColumn>(2, "DATE"), + createConstColumn>(2, "USA"))); + + // const(non-null), const(null) + // time_type is a const with non null value + // location is a const with null value + ASSERT_COLUMN_EQ( + createConstColumn>(2, {}), + executeFunction( + funcName, + createConstColumn>(2, "DATE"), + createConstColumn>(2, {}))); + + // The time_type is a system pre_defined macro, thus assume time_type column is const + // Throw an exception is time_type is not ColumnConst + ASSERT_THROW( + executeFunction( + funcName, + createColumn>({"DATE", "TIME"}), + createColumn>({"USA", {}})), + DB::Exception); +} +CATCH + +TEST_F(GetFormatTest, testMoreCases) +try +{ + // time_type: DATE + // all locations + ASSERT_COLUMN_EQ( + createColumn>({"%m.%d.%Y", "%Y-%m-%d", "%Y-%m-%d", "%d.%m.%Y", "%Y%m%d"}), + executeFunction( + funcName, + createConstColumn>(5, "DATE"), + createColumn>({"USA", "JIS", "ISO", "EUR", "INTERNAL"}))); + + // time_type: DATETIME + // all locations + ASSERT_COLUMN_EQ( + createColumn>({"%Y-%m-%d %H.%i.%s", "%Y-%m-%d %H:%i:%s", "%Y-%m-%d %H:%i:%s", "%Y-%m-%d %H.%i.%s", "%Y%m%d%H%i%s"}), + executeFunction( + funcName, + createConstColumn>(5, "DATETIME"), + createColumn>({"USA", "JIS", "ISO", "EUR", "INTERNAL"}))); + + // time_type: TIMESTAMP + // all locations + ASSERT_COLUMN_EQ( + createColumn>({"%Y-%m-%d %H.%i.%s", "%Y-%m-%d %H:%i:%s", "%Y-%m-%d %H:%i:%s", "%Y-%m-%d %H.%i.%s", "%Y%m%d%H%i%s"}), + executeFunction( + funcName, + createConstColumn>(5, "TIMESTAMP"), + createColumn>({"USA", "JIS", "ISO", "EUR", "INTERNAL"}))); + + // time_type: TIME + // all locations + ASSERT_COLUMN_EQ( + createColumn>({"%h:%i:%s %p", "%H:%i:%s", "%H:%i:%s", "%H.%i.%s", "%H%i%s"}), + executeFunction( + funcName, + createConstColumn>(5, "TIME"), + createColumn>({"USA", "JIS", "ISO", "EUR", "INTERNAL"}))); + + // the location is not in ("USA", "JIS", "ISO", "EUR", "INTERNAL") + ASSERT_COLUMN_EQ( + createColumn>({"", ""}), + executeFunction( + funcName, + createConstColumn>(2, "TIME"), + createColumn>({"CAN", ""}))); + + // the time_type is not in ("DATE", "DATETIME", "TIMESTAMP", "TIME") + ASSERT_COLUMN_EQ( + createColumn>({"", ""}), + executeFunction( + funcName, + createConstColumn>(2, "TIMEINUTC"), + createColumn>({"USA", "ISO"}))); +} +CATCH + +} // namespace DB::tests diff --git a/dbms/src/Functions/tests/gtest_strings_format.cpp 
b/dbms/src/Functions/tests/gtest_strings_format.cpp index 2d571a9bb1b..8f3b899316e 100644 --- a/dbms/src/Functions/tests/gtest_strings_format.cpp +++ b/dbms/src/Functions/tests/gtest_strings_format.cpp @@ -34,7 +34,7 @@ class StringFormat : public DB::tests::FunctionTest using FieldType = DecimalField; using NullableDecimal = Nullable; ASSERT_COLUMN_EQ( - createColumn>({"0.0000", "-0.0120", "0.0120", "12,332.1000", "12,332", "12,332", "12,332.300000000000000000000000000000", "-12,332.30000", "-1,000.0", "-333.33", {}}), + createColumn>({"0.0000", "-0.0120", "0.0120", "12,332.1000", "12,332", "12,332", "12,332.300000000000000000000000000000", "-12,332.30000", "-1,000.0", "-333.33", {}, "99,999.9999000000", "100,000.000", "100,000"}), executeFunction( func_name, createColumn( @@ -49,8 +49,11 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-123323000), 4), FieldType(static_cast(-9999999), 4), FieldType(static_cast(-3333330), 4), - FieldType(static_cast(0), 0)}), - createColumn>({4, 4, 4, 4, 0, -1, 31, 5, 1, 2, {}}))); + FieldType(static_cast(0), 0), + FieldType(static_cast(999999999), 4), + FieldType(static_cast(999999999), 4), + FieldType(static_cast(999999999), 4)}), + createColumn>({4, 4, 4, 4, 0, -1, 31, 5, 1, 2, {}, 10, 3, -5}))); ASSERT_COLUMN_EQ( createColumn>({"12,332.100", "-12,332.300", "-1,000.000", "-333.333"}), executeFunction( @@ -62,8 +65,6 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-9999999), 4), FieldType(static_cast(-3333330), 4)}), createConstColumn>(4, 3))); - /// known issue https://github.com/pingcap/tiflash/issues/4891 - /* ASSERT_COLUMN_EQ( createColumn>({"-999.9999", "-1,000", "-1,000", "-999.999900000000000000000000000000", "-999.99990", "-1,000.0", "-1,000.00"}), executeFunction( @@ -74,7 +75,7 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-9999999), 4)), createColumn>({4, 0, -1, 31, 5, 1, 2}))); ASSERT_COLUMN_EQ( - createConstColumn>(1, "-1,000.000"), + createConstColumn(1, "-1,000.000"), executeFunction( func_name, createConstColumn( @@ -82,7 +83,6 @@ class StringFormat : public DB::tests::FunctionTest 1, FieldType(static_cast(-9999999), 4)), createConstColumn>(1, 3))); - */ ASSERT_COLUMN_EQ( createColumn>({"12,332.1000", "12,332", "12,332.300000000000000000000000000000", "-12,332.30000", "-1,000.0", "-333.33", {}}), executeFunction( @@ -108,8 +108,6 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-9999999), 4), FieldType(static_cast(-3333330), 4)}), createConstColumn>(4, 3))); - /// known issue https://github.com/pingcap/tiflash/issues/4891 - /* ASSERT_COLUMN_EQ( createColumn>({"-999.9999", "-1,000", "-999.999900000000000000000000000000", "-999.99990", "-1,000.0", "-1,000.00"}), executeFunction( @@ -120,7 +118,7 @@ class StringFormat : public DB::tests::FunctionTest FieldType(static_cast(-9999999), 4)), createColumn>({4, 0, 31, 5, 1, 2}))); ASSERT_COLUMN_EQ( - createConstColumn>(1, "-1,000.000"), + createConstColumn(1, "-1,000.000"), executeFunction( func_name, createConstColumn( @@ -128,7 +126,6 @@ class StringFormat : public DB::tests::FunctionTest 1, FieldType(static_cast(-9999999), 4)), createConstColumn>(1, 3))); - */ } template diff --git a/dbms/src/Functions/tests/gtest_strings_reverse.cpp b/dbms/src/Functions/tests/gtest_strings_reverse.cpp new file mode 100644 index 00000000000..304a403db83 --- /dev/null +++ b/dbms/src/Functions/tests/gtest_strings_reverse.cpp @@ -0,0 +1,120 @@ +// Copyright 2022 PingCAP, Ltd. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include + +#pragma GCC diagnostic pop + +namespace DB::tests +{ +class StringReverse : public DB::tests::FunctionTest +{ +protected: + static ColumnWithTypeAndName toVec(const std::vector & v) + { + return createColumn(v); + } + + static ColumnWithTypeAndName toNullableVec(const std::vector> & v) + { + return createColumn>(v); + } + + static ColumnWithTypeAndName toConst(const String & s) + { + return createConstColumn(1, s); + } +}; +// test reverse +TEST_F(StringReverse, stringReverseTest) +try +{ + std::vector candidate_strings = {"one week's time test", "abcdef", "abcabc", "moc.pacgnip"}; + std::vector reversed_strings = {"tset emit s'keew eno", "fedcba", "cbacba", "pingcap.com"}; + + // test vector + ASSERT_COLUMN_EQ( + toVec(reversed_strings), + executeFunction( + "reverse", + toVec(candidate_strings))); + + // test nullable + ASSERT_COLUMN_EQ( + toNullableVec({"", " ", {}, "pacgnip"}), + executeFunction( + "reverse", + toNullableVec({"", " ", {}, "pingcap"}))); + + // test const + ASSERT_COLUMN_EQ( + toConst("pacgnip"), + executeFunction( + "reverse", + toConst("pingcap"))); + + // test null + ASSERT_COLUMN_EQ( + toConst({}), + executeFunction( + "reverse", + toConst({}))); +} +CATCH + +// test reverseUTF8 +TEST_F(StringReverse, stringReverseUTF8Test) +try +{ + std::vector candidate_strings = {"one week's time test", "abc测试def", "abcテストabc", "ѐёђѓєѕіїјљњћќѝўџ", "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^", "αβγδεζηθικλμνξοπρστυφχψωσ", "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}; + std::vector reversed_strings = {"tset emit s'keew eno", "fed试测cba", "cbaトステcba", "џўѝќћњљјїіѕєѓђёѐ", "^!џ@ў#ѝ¥ќ)ћ(њ&љ……ј%ї@$і#ѕ@є!ѓ/ђ*ё-ѐ+", "σωψχφυτσρποξνμλκιθηζεδγβα", "✕σ★ω♘ψχ♖φυ♥τσ℉ρπ✚οξ✓νμ♫λκ€ιθ✂ηζ☎εδ➨γβ▼α▲", "շմնբվցղզխլկյհգֆդսապօիւըտռեոքծժճչրջձփթ"}; + + // test vector + ASSERT_COLUMN_EQ( + toVec(reversed_strings), + executeFunction( + "reverseUTF8", + toVec(candidate_strings))); + + // test nullable + ASSERT_COLUMN_EQ( + toNullableVec({"", " ", {}, "pacgnip"}), + executeFunction( + "reverseUTF8", + toNullableVec({"", " ", {}, "pingcap"}))); + + // test const + ASSERT_COLUMN_EQ( + toConst("pacgnip"), + executeFunction( + "reverseUTF8", + toConst("pingcap"))); + + // test null + ASSERT_COLUMN_EQ( + toConst({}), + executeFunction( + "reverseUTF8", + toConst({}))); +} +CATCH + +} // namespace DB::tests \ No newline at end of file diff --git a/dbms/src/IO/WriteBuffer.h b/dbms/src/IO/WriteBuffer.h index 361081d1176..0c0fa2cb545 100644 --- a/dbms/src/IO/WriteBuffer.h +++ b/dbms/src/IO/WriteBuffer.h @@ -96,6 +96,24 @@ class WriteBuffer : public BufferBase } } + template + __attribute__((always_inline)) void writeFixed(const T * __restrict from) + { + if (likely(working_buffer.end() - pos >= static_cast(sizeof(T)))) + { + tiflash_compiler_builtin_memcpy(pos, from, sizeof(T)); + pos += sizeof(T); + } + else + { + [&]() __attribute__((noinline)) + { + 
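+            // Slow path, deliberately kept out of line via the noinline lambda: fewer than
+            // sizeof(T) bytes remain in the working buffer, so fall back to the generic
+            // write(), which can cross the buffer boundary.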
write(reinterpret_cast(from), sizeof(T)); + } + (); + } + } + inline void write(char x) { diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 6a39bc333a8..6cb947a1bfa 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,11 @@ extern const int CANNOT_MERGE_DIFFERENT_AGGREGATED_DATA_VARIANTS; extern const int LOGICAL_ERROR; } // namespace ErrorCodes +namespace FailPoints +{ +extern const char random_aggregate_create_state_failpoint[]; +extern const char random_aggregate_merge_failpoint[]; +} // namespace FailPoints AggregatedDataVariants::~AggregatedDataVariants() { @@ -317,6 +323,7 @@ void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const * In order that then everything is properly destroyed, we "roll back" some of the created states. * The code is not very convenient. */ + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_aggregate_create_state_failpoint); aggregate_functions[j]->create(aggregate_data + offsets_of_aggregate_states[j]); } catch (...) @@ -1504,6 +1511,8 @@ class MergingAndConvertingBlockInputStream : public IProfilingBlockInputStream if (current_bucket_num >= NUM_BUCKETS) return {}; + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_aggregate_merge_failpoint); + AggregatedDataVariantsPtr & first = data[0]; if (current_bucket_num == -1) diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index 44699a324f4..7cd0cb5ad53 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -68,9 +68,9 @@ #include #include -#include #include #include +#include namespace ProfileEvents @@ -78,8 +78,6 @@ namespace ProfileEvents extern const Event ContextLock; } -#include - namespace CurrentMetrics { extern const Metric GlobalStorageRunMode; @@ -1440,20 +1438,32 @@ void Context::dropCaches() const shared->mark_cache->reset(); } -BackgroundProcessingPool & Context::getBackgroundPool() +BackgroundProcessingPool & Context::initializeBackgroundPool(UInt16 pool_size) { auto lock = getLock(); if (!shared->background_pool) - shared->background_pool = std::make_shared(settings.background_pool_size); + shared->background_pool = std::make_shared(pool_size); return *shared->background_pool; } -BackgroundProcessingPool & Context::getBlockableBackgroundPool() +BackgroundProcessingPool & Context::getBackgroundPool() +{ + auto lock = getLock(); + return *shared->background_pool; +} + +BackgroundProcessingPool & Context::initializeBlockableBackgroundPool(UInt16 pool_size) { - // TODO: choose a better thread pool size and maybe a better name for the pool auto lock = getLock(); if (!shared->blockable_background_pool) - shared->blockable_background_pool = std::make_shared(settings.background_pool_size); + shared->blockable_background_pool = std::make_shared(pool_size); + return *shared->blockable_background_pool; +} + +BackgroundProcessingPool & Context::getBlockableBackgroundPool() +{ + // TODO: maybe a better name for the pool + auto lock = getLock(); return *shared->blockable_background_pool; } diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h index b6e759e364b..7663b40f612 100644 --- a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -379,7 +379,9 @@ class Context void setUseL0Opt(bool use_l0_opt); bool useL0Opt() const; + BackgroundProcessingPool & initializeBackgroundPool(UInt16 pool_size); 
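For reference, the `writeFixed` helper added to WriteBuffer.h above is a textbook fast-path/slow-path split: the in-bounds case is forced inline and compiles down to a constant-size copy, while the overflow case is pushed into a `noinline` lambda so it does not bloat every call site. A minimal standalone sketch of the same pattern, assuming a plain fixed-size buffer and `__builtin_memcpy` in place of `tiflash_compiler_builtin_memcpy`:

```cpp
#include <cstddef>
#include <cstring>

struct TinyWriteBuffer
{
    char data[256];
    char * pos = data;
    char * end = data + sizeof(data);

    // Stand-in for the real out-of-line write(): it would flush `data` and retry.
    void writeSlow(const char * from, size_t size)
    {
        (void)from;
        (void)size; // elided in this sketch
    }

    template <typename T>
    __attribute__((always_inline)) void writeFixed(const T * __restrict from)
    {
        if (__builtin_expect(end - pos >= static_cast<std::ptrdiff_t>(sizeof(T)), 1))
        {
            // Hot path: sizeof(T) is a compile-time constant, so the copy
            // lowers to a handful of register moves.
            __builtin_memcpy(pos, from, sizeof(T));
            pos += sizeof(T);
        }
        else
        {
            // Cold path: the noinline lambda keeps this branch out of the
            // inlined fast path.
            [&]() __attribute__((noinline)) {
                writeSlow(reinterpret_cast<const char *>(from), sizeof(T));
            }();
        }
    }
};
```

The `noinline` on the fallback matters precisely because `writeFixed` itself is `always_inline`: without it, the rarely taken slow path would be copied into every caller as well.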
BackgroundProcessingPool & getBackgroundPool(); + BackgroundProcessingPool & initializeBlockableBackgroundPool(UInt16 pool_size); BackgroundProcessingPool & getBlockableBackgroundPool(); void createTMTContext(const TiFlashRaftConfig & raft_config, pingcap::ClusterConfig && cluster_config); @@ -505,7 +507,7 @@ class DDLGuard class SessionCleaner { public: - SessionCleaner(Context & context_) + explicit SessionCleaner(Context & context_) : context{context_} {} ~SessionCleaner(); diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index 01e8625f943..3514f915626 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -93,6 +94,12 @@ extern const int SCHEMA_VERSION_ERROR; extern const int UNKNOWN_EXCEPTION; } // namespace ErrorCodes + +namespace FailPoints +{ +extern const char pause_query_init[]; +} // namespace FailPoints + InterpreterSelectQuery::InterpreterSelectQuery( const ASTPtr & query_ptr_, const Context & context_, @@ -131,6 +138,15 @@ InterpreterSelectQuery::~InterpreterSelectQuery() = default; void InterpreterSelectQuery::init(const Names & required_result_column_names) { + /// The failpoint pause_query_init should be used together with the failpoint unblock_query_init_after_write, + /// so that the select query is blocked before its init stage until the write action has finished. + /// In tests, we need to enable unblock_query_init_after_write first, + /// and enable pause_query_init before each write statement takes effect. + /// When the write action finishes, pause_query_init is disabled automatically, + /// and then the select query can continue. + /// You can refer to multi_alter_with_write.test for an example. + FAIL_POINT_PAUSE(FailPoints::pause_query_init); + if (!context.hasQueryContext()) context.setQueryContext(context); @@ -496,13 +512,13 @@ void InterpreterSelectQuery::executeImpl(Pipeline & pipeline, const BlockInputSt { const auto & join = static_cast(*query.join()->table_join); if (join.kind == ASTTableJoin::Kind::Full || join.kind == ASTTableJoin::Kind::Right) - pipeline.stream_with_non_joined_data = expressions.before_join->createStreamWithNonJoinedDataIfFullOrRightJoin( + pipeline.streams_with_non_joined_data.push_back(expressions.before_join->createStreamWithNonJoinedDataIfFullOrRightJoin( pipeline.firstStream()->getHeader(), 0, 1, - settings.max_block_size); + settings.max_block_size)); - for (auto & stream : pipeline.streams) /// Applies to all sources except stream_with_non_joined_data. + for (auto & stream : pipeline.streams) /// Applies to all sources except streams_with_non_joined_data. stream = std::make_shared(stream, expressions.before_join, /*req_id=*/""); } @@ -587,7 +603,7 @@ void InterpreterSelectQuery::executeImpl(Pipeline & pipeline, const BlockInputSt if (need_second_distinct_pass || query.limit_length || query.limit_by_expression_list - || pipeline.stream_with_non_joined_data) + || !pipeline.streams_with_non_joined_data.empty()) { need_merge_streams = true; } @@ -971,11 +987,11 @@ void InterpreterSelectQuery::executeAggregation(Pipeline & pipeline, const Expre Aggregator::Params params(header, keys, aggregates, overflow_row, settings.max_rows_to_group_by, settings.group_by_overflow_mode, allow_to_use_two_level_group_by ?
settings.group_by_two_level_threshold : SettingUInt64(0), allow_to_use_two_level_group_by ? settings.group_by_two_level_threshold_bytes : SettingUInt64(0), settings.max_bytes_before_external_group_by, settings.empty_result_for_aggregation_by_empty_set, context.getTemporaryPath()); /// If there are several sources, then we perform parallel aggregation - if (pipeline.streams.size() > 1) + if (pipeline.streams.size() > 1 || pipeline.streams_with_non_joined_data.size() > 1) { - pipeline.firstStream() = std::make_shared( + auto stream = std::make_shared( pipeline.streams, - pipeline.stream_with_non_joined_data, + pipeline.streams_with_non_joined_data, params, file_provider, final, @@ -985,19 +1001,21 @@ void InterpreterSelectQuery::executeAggregation(Pipeline & pipeline, const Expre : static_cast(settings.max_threads), /*req_id=*/""); - pipeline.stream_with_non_joined_data = nullptr; pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::move(stream); } else { BlockInputStreams inputs; if (!pipeline.streams.empty()) inputs.push_back(pipeline.firstStream()); - else - pipeline.streams.resize(1); - if (pipeline.stream_with_non_joined_data) - inputs.push_back(pipeline.stream_with_non_joined_data); + if (!pipeline.streams_with_non_joined_data.empty()) + inputs.push_back(pipeline.streams_with_non_joined_data.at(0)); + + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); pipeline.firstStream() = std::make_shared( std::make_shared(inputs, /*req_id=*/""), params, file_provider, final, /*req_id=*/""); - - pipeline.stream_with_non_joined_data = nullptr; } } @@ -1228,21 +1244,32 @@ void InterpreterSelectQuery::executeDistinct(Pipeline & pipeline, bool before_or void InterpreterSelectQuery::executeUnion(Pipeline & pipeline) { - /// If there are still several streams, then we combine them into one - if (pipeline.hasMoreThanOneStream()) + switch (pipeline.streams.size() + pipeline.streams_with_non_joined_data.size()) { - pipeline.firstStream() = std::make_shared>( + case 0: + break; + case 1: + { + if (pipeline.streams.size() == 1) + break; + // streams_with_non_joined_data's size is 1. + pipeline.streams.push_back(pipeline.streams_with_non_joined_data.at(0)); + pipeline.streams_with_non_joined_data.clear(); + break; + } + default: + { + BlockInputStreamPtr stream = std::make_shared>( pipeline.streams, - pipeline.stream_with_non_joined_data, + pipeline.streams_with_non_joined_data, max_streams, /*req_id=*/""); - pipeline.stream_with_non_joined_data = nullptr; + pipeline.streams.resize(1); + pipeline.streams_with_non_joined_data.clear(); + pipeline.firstStream() = std::move(stream); + break; } - else if (pipeline.stream_with_non_joined_data) - { - pipeline.streams.push_back(pipeline.stream_with_non_joined_data); - pipeline.stream_with_non_joined_data = nullptr; } } diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.h b/dbms/src/Interpreters/InterpreterSelectQuery.h index 474ace7ee84..d1bcec2a3dd 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.h +++ b/dbms/src/Interpreters/InterpreterSelectQuery.h @@ -95,7 +95,7 @@ class InterpreterSelectQuery * It has a special meaning, since reading from it should be done after reading from the main streams. * It is appended to the main streams in UnionBlockInputStream or ParallelAggregatingBlockInputStream.
*/ - BlockInputStreamPtr stream_with_non_joined_data; + BlockInputStreams streams_with_non_joined_data; BlockInputStreamPtr & firstStream() { return streams.at(0); } @@ -105,13 +105,13 @@ class InterpreterSelectQuery : public IInterpreter for (auto & stream : streams) transform(stream); - if (stream_with_non_joined_data) - transform(stream_with_non_joined_data); + for (auto & stream : streams_with_non_joined_data) + transform(stream); } bool hasMoreThanOneStream() const { - return streams.size() + (stream_with_non_joined_data ? 1 : 0) > 1; + return streams.size() + streams_with_non_joined_data.size() > 1; } }; diff --git a/dbms/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/dbms/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index 5e73b1e5f3e..076c290cc9d 100644 --- a/dbms/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -224,7 +224,7 @@ BlockIO InterpreterSelectWithUnionQuery::execute() } else { - result_stream = std::make_shared>(nested_streams, nullptr, settings.max_threads, /*req_id=*/""); + result_stream = std::make_shared>(nested_streams, BlockInputStreams{}, settings.max_threads, /*req_id=*/""); nested_streams.clear(); } diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index 820618a6e8b..181ebcaaa64 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -26,9 +27,17 @@ #include #include #include +#include + namespace DB { +namespace FailPoints +{ +extern const char random_join_build_failpoint[]; +extern const char random_join_prob_failpoint[]; +} // namespace FailPoints + namespace ErrorCodes { extern const int UNKNOWN_SET_DATA_VARIANT; @@ -621,6 +630,7 @@ void NO_INLINE insertFromBlockImplTypeCaseWithLock( } for (size_t insert_index = 0; insert_index < segment_index_info.size(); insert_index++) { + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_join_build_failpoint); size_t segment_index = (insert_index + stream_index) % segment_index_info.size(); if (segment_index == segment_size) { @@ -1513,7 +1523,7 @@ void Join::joinBlockImpl(Block & block, const Maps & maps) const default: throw Exception("Unknown JOIN keys variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT); } - + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_join_prob_failpoint); for (size_t i = 0; i < num_columns_to_add; ++i) { const ColumnWithTypeAndName & sample_col = sample_block_with_columns_to_add.getByPosition(i); diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index 9361e0525d2..add761c581d 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -209,7 +209,7 @@ struct Settings * Basically, limits are checked for each block (not every row). That is, the limits can be slightly violated. \ * Almost all limits apply only to SELECTs. \ * Almost all limits apply to each stream individually. \ - */ \ + */ \ \ M(SettingUInt64, max_rows_to_read, 0, "Limit on read rows from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it " \ "is only checked on a remote server.") \ @@ -272,7 +272,7 @@ struct Settings M(SettingUInt64, dt_segment_delta_small_column_file_size, 8388608, "Determine whether a column file in delta is small or not. 
8MB by default.") \ M(SettingUInt64, dt_segment_stable_pack_rows, DEFAULT_MERGE_BLOCK_SIZE, "Expected stable pack rows in DeltaTree Engine.") \ M(SettingFloat, dt_segment_wait_duration_factor, 1, "The factor of wait duration in a write stall.") \ - M(SettingUInt64, dt_bg_gc_check_interval, 60, "Background gc thread check interval, the unit is second.") \ + M(SettingUInt64, dt_bg_gc_check_interval, 60, "Background gc thread check interval, the unit is second.") \ M(SettingInt64, dt_bg_gc_max_segments_to_check_every_round, 100, "Max segments to check in every gc round, value less than or equal to 0 means gc no segments.") \ M(SettingFloat, dt_bg_gc_ratio_threhold_to_trigger_gc, 1.2, "Trigger segment's gc when the ratio of invalid version exceed this threhold. Values smaller than or equal to 1.0 means gc all " \ "segments") \ @@ -355,15 +355,15 @@ struct Settings M(SettingUInt64, elastic_threadpool_init_cap, 400, "The size of elastic thread pool.") \ M(SettingUInt64, elastic_threadpool_shrink_period_ms, 300000, "The shrink period(ms) of elastic thread pool.") \ M(SettingBool, enable_local_tunnel, true, "Enable local data transfer between local MPP tasks.") \ - M(SettingBool, enable_async_grpc_client, true, "Enable async grpc in MPP.") \ - M(SettingUInt64, grpc_completion_queue_pool_size, 0, "The size of gRPC completion queue pool. 0 means using hardware_concurrency.")\ + M(SettingBool, enable_async_grpc_client, true, "Enable async grpc in MPP.") \ + M(SettingUInt64, grpc_completion_queue_pool_size, 0, "The size of gRPC completion queue pool. 0 means using hardware_concurrency.") \ M(SettingBool, enable_async_server, true, "Enable async rpc server.") \ M(SettingUInt64, async_pollers_per_cq, 200, "grpc async pollers per cqs") \ M(SettingUInt64, async_cqs, 1, "grpc async cqs") \ M(SettingUInt64, preallocated_request_count_per_poller, 20, "grpc preallocated_request_count_per_poller") \ \ M(SettingUInt64, manual_compact_pool_size, 1, "The number of worker threads to handle manual compact requests.") \ - M(SettingUInt64, manual_compact_max_concurrency, 10, "Max concurrent tasks. It should be larger than pool size.") \ + M(SettingUInt64, manual_compact_max_concurrency, 10, "Max concurrent tasks. It should be larger than pool size.") \ M(SettingUInt64, manual_compact_more_until_ms, 60000, "Continuously compact more segments until reaching specified elapsed time. If 0 is specified, only one segment will be compacted each round.") // clang-format on diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp index 96cfc0a58ae..78ad4b41ce6 100644 --- a/dbms/src/Interpreters/executeQuery.cpp +++ b/dbms/src/Interpreters/executeQuery.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
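The failpoints wired into Aggregator.cpp and Join.cpp above, and into executeQueryImpl below, all share one mechanism: a named switch that is looked up at a hand-picked call site and throws when a test has enabled it. The project builds this on libfiu through the `FAIL_POINT_TRIGGER_EXCEPTION` / `FAIL_POINT_PAUSE` macro family; the sketch below is a simplified, hypothetical registry that only shows the shape of the technique, not the real implementation:

```cpp
#include <stdexcept>
#include <string>
#include <unordered_set>

// Hypothetical stand-in for FailPointHelper + libfiu.
class FailPointRegistry
{
public:
    static FailPointRegistry & instance()
    {
        static FailPointRegistry registry;
        return registry;
    }
    void enable(const std::string & name) { enabled.insert(name); }
    void disable(const std::string & name) { enabled.erase(name); }
    bool isEnabled(const std::string & name) const { return enabled.count(name) > 0; }

private:
    std::unordered_set<std::string> enabled;
};

// Throws at the instrumented call site when the named failpoint is on.
#define SKETCH_FAIL_POINT_TRIGGER_EXCEPTION(name)                                         \
    do                                                                                    \
    {                                                                                     \
        if (FailPointRegistry::instance().isEnabled(name))                                \
            throw std::runtime_error(std::string("Fail point ") + (name) + " triggered"); \
    } while (false)

void mergeAggregatedData()
{
    SKETCH_FAIL_POINT_TRIGGER_EXCEPTION("random_aggregate_merge_failpoint");
    // ... the actual merge work would run here ...
}
```

Enabling the `random_*` failpoints from the server config (see `FailPointHelper::initRandomFailPoints` in Server.cpp below) lets CI inject failures into aggregation, join build, and interpreter creation without recompiling.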
+#include #include #include #include @@ -53,7 +54,10 @@ extern const int LOGICAL_ERROR; extern const int QUERY_IS_TOO_LARGE; extern const int INTO_OUTFILE_NOT_ALLOWED; } // namespace ErrorCodes - +namespace FailPoints +{ +extern const char random_interpreter_failpoint[]; +} // namespace FailPoints namespace { void checkASTSizeLimits(const IAST & ast, const Settings & settings) @@ -226,6 +230,7 @@ std::tuple executeQueryImpl( context.setProcessListElement(&process_list_entry->get()); } + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_interpreter_failpoint); auto interpreter = query_src.interpreter(context, stage); res = interpreter->execute(); diff --git a/dbms/src/Server/CLIService.h b/dbms/src/Server/CLIService.h index 9078fa991f3..0acffebb577 100644 --- a/dbms/src/Server/CLIService.h +++ b/dbms/src/Server/CLIService.h @@ -126,6 +126,8 @@ CLIService::TiFlashProxyConfig::TiFlashProxyConfig(Poco::Util::Layer args.push_back(v.first.data()); args.push_back(v.second.data()); } + // Start the decryption service without starting the raftstore service + args.push_back("--only-decryption"); is_proxy_runnable = true; } template diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 1bb35e51866..aabca11cf9c 100644 --- a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -53,10 +54,15 @@ #include #include #include +#include +#include +#include #include #include #include +#include #include +#include #include #include #include @@ -81,12 +87,6 @@ #include #include -#include "HTTPHandlerFactory.h" -#include "MetricsPrometheus.h" -#include "MetricsTransmitter.h" -#include "StatusFile.h" -#include "TCPHandlerFactory.h" - #if Poco_NetSSL_FOUND #include #include @@ -151,6 +151,7 @@ void loadMiConfig(Logger * log) } #undef TRY_LOAD_CONF #endif + namespace { [[maybe_unused]] void tryLoadBoolConfigFromEnv(Poco::Logger * log, bool & target, const char * name) @@ -176,6 +177,12 @@ namespace } } // namespace +namespace CurrentMetrics +{ +extern const Metric LogicalCPUCores; +extern const Metric MemoryCapacity; +} // namespace CurrentMetrics + namespace DB { namespace ErrorCodes @@ -184,6 +191,7 @@ extern const int NO_ELEMENTS_IN_CONFIG; extern const int SUPPORT_IS_DISABLED; extern const int ARGUMENT_OUT_OF_BOUND; extern const int INVALID_CONFIG_PARAMETER; +extern const int IP_ADDRESS_NOT_ALLOWED; } // namespace ErrorCodes namespace Debug @@ -621,6 +629,10 @@ class Server::FlashGrpcServerHolder } } flash_grpc_server = builder.BuildAndStart(); + if (!flash_grpc_server) + { + throw Exception("Failed to start the gRPC server, flash.service_addr may be invalid, flash.service_addr is " + raft_config.flash_server_addr, ErrorCodes::IP_ADDRESS_NOT_ALLOWED); + } LOG_FMT_INFO(log, "Flash grpc server listening on [{}]", raft_config.flash_server_addr); Debug::setServiceAddr(raft_config.flash_server_addr); if (enable_async_server) @@ -961,7 +973,10 @@ class Server::TcpHttpServersHolder LOG_DEBUG(log, debug_msg); } - const std::vector> & getServers() const { return servers; } + const std::vector> & getServers() const + { + return servers; + } private: Server & server; @@ -977,6 +992,7 @@ int Server::main(const std::vector & /*args*/) Poco::Logger * log = &logger(); #ifdef FIU_ENABLE fiu_init(0); // init failpoint + FailPointHelper::initRandomFailPoints(config(), log); #endif UpdateMallocConfig(log); @@ -996,7 +1012,6 @@ int Server::main(const std::vector & /*args*/) #ifdef TIFLASH_ENABLE_SVE_SUPPORT
tryLoadBoolConfigFromEnv(log, simd_option::ENABLE_SVE, "TIFLASH_ENABLE_SVE"); #endif - registerFunctions(); registerAggregateFunctions(); registerWindowFunctions(); @@ -1128,6 +1143,12 @@ int Server::main(const std::vector & /*args*/) global_context->getPathCapacity(), global_context->getFileProvider()); + /// Initialize the background & blockable background thread pool. + Settings & settings = global_context->getSettingsRef(); + LOG_FMT_INFO(log, "Background & Blockable Background pool size: {}", settings.background_pool_size); + auto & bg_pool = global_context->initializeBackgroundPool(settings.background_pool_size); + auto & blockable_bg_pool = global_context->initializeBlockableBackgroundPool(settings.background_pool_size); + global_context->initializePageStorageMode(global_context->getPathPool(), STORAGE_FORMAT_CURRENT.page); global_context->initializeGlobalStoragePoolIfNeed(global_context->getPathPool()); LOG_FMT_INFO(log, "Global PageStorage run mode is {}", static_cast(global_context->getPageStorageRunMode())); @@ -1244,13 +1265,6 @@ int Server::main(const std::vector & /*args*/) /// Load global settings from default_profile and system_profile. /// It internally depends on UserConfig::parseSettings. global_context->setDefaultProfiles(config()); - Settings & settings = global_context->getSettingsRef(); - - /// Initialize the background thread pool. - /// It internally depends on settings.background_pool_size, - /// so must be called after settings has been load. - auto & bg_pool = global_context->getBackgroundPool(); - auto & blockable_bg_pool = global_context->getBlockableBackgroundPool(); /// Initialize RateLimiter. global_context->initializeRateLimiter(config(), bg_pool, blockable_bg_pool); @@ -1417,6 +1431,8 @@ int Server::main(const std::vector & /*args*/) { // on ARM processors it can show only enabled at current moment cores + CurrentMetrics::set(CurrentMetrics::LogicalCPUCores, server_info.cpu_info.logical_cores); + CurrentMetrics::set(CurrentMetrics::MemoryCapacity, server_info.memory_info.capacity); LOG_FMT_INFO( log, "Available RAM = {}; physical cores = {}; logical cores = {}.", diff --git a/dbms/src/Storages/BackgroundProcessingPool.h b/dbms/src/Storages/BackgroundProcessingPool.h index 1ba6c4efcf8..49a01b3a397 100644 --- a/dbms/src/Storages/BackgroundProcessingPool.h +++ b/dbms/src/Storages/BackgroundProcessingPool.h @@ -81,7 +81,7 @@ class BackgroundProcessingPool using TaskHandle = std::shared_ptr; - BackgroundProcessingPool(int size_); + explicit BackgroundProcessingPool(int size_); size_t getNumberOfThreads() const { return size; } @@ -96,7 +96,7 @@ class BackgroundProcessingPool /// 2. thread B also get the same task /// 3. thread A finish the execution of the task quickly, release the task and try to update the next schedule time of the task /// 4. 
thread B find the task is not occupied and execute the task again almost immediately - TaskHandle addTask(const Task & task, const bool multi = true, const size_t interval_ms = 0); + TaskHandle addTask(const Task & task, bool multi = true, size_t interval_ms = 0); void removeTask(const TaskHandle & task); ~BackgroundProcessingPool(); diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp index 132732d6989..8a69b7573e2 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp @@ -141,6 +141,19 @@ bool DeltaValueSpace::ingestColumnFiles(DMContext & /*context*/, const RowKeyRan bool DeltaValueSpace::flush(DMContext & context) { + bool v = false; + if (!is_flushing.compare_exchange_strong(v, true)) + { + // Another thread is flushing, just return. + LOG_FMT_DEBUG(log, "{}, Flush stop because other thread is flushing", simpleInfo()); + return false; + } + SCOPE_EXIT({ + bool v = true; + if (!is_flushing.compare_exchange_strong(v, false)) + throw Exception(simpleInfo() + " is expected to be flushing", ErrorCodes::LOGICAL_ERROR); + }); + LOG_FMT_DEBUG(log, "{}, Flush start", info()); /// We have two types of data needed to flush to disk: diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h index 8f14682caa8..04fb97b3004 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h @@ -77,6 +77,11 @@ class DeltaValueSpace /// Note that those things can not be done at the same time. std::atomic_bool is_updating = false; + /// Note that it's safe to run multiple flushes concurrently, but only one of them can succeed + /// and the other threads' work is just a waste of resources. + /// So we only allow one flush task to run at any time, to avoid wasting resources. + std::atomic_bool is_flushing = false; + std::atomic last_try_flush_rows = 0; std::atomic last_try_flush_bytes = 0; std::atomic last_try_compact_column_files = 0; @@ -159,6 +164,8 @@ class DeltaValueSpace size_t getTotalCacheBytes() const; size_t getValidCacheRows() const; + bool isFlushing() const { return is_flushing; } + bool isUpdating() const { return is_updating; } bool tryLockUpdating() diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index 195ed5c53c2..935a4ac111c 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -100,6 +100,9 @@ extern const char exception_after_drop_segment[]; namespace DM { +// It is used to prevent hash conflict of file caches. +static std::atomic DELTA_MERGE_STORE_HASH_SALT{0}; + // ================================================ // MergeDeltaTaskPool // ================================================ @@ -980,14 +983,14 @@ void DeltaMergeStore::deleteRange(const Context & db_context, const DB::Settings checkSegmentUpdate(dm_context, segment, ThreadType::Write); } -void DeltaMergeStore::flushCache(const DMContextPtr & dm_context, const RowKeyRange & range) +bool DeltaMergeStore::flushCache(const DMContextPtr & dm_context, const RowKeyRange & range, bool try_until_succeed) { RowKeyRange cur_range = range; while (!cur_range.none()) { RowKeyRange segment_range; - // Keep trying until succeeded. + // Keep trying until succeeded if needed.
while (true) { SegmentPtr segment; @@ -1010,10 +1013,15 @@ void DeltaMergeStore::flushCache(const DMContextPtr & dm_context, const RowKeyRa { break; } + else if (!try_until_succeed) + { + return false; + } } cur_range.setStart(segment_range.end); } + return true; } void DeltaMergeStore::mergeDeltaAll(const Context & context) @@ -1055,6 +1063,13 @@ std::optional DeltaMergeStore::mergeDeltaBySegment(const Contex segment = segment_it->second; } + if (!segment->flushCache(*dm_context)) + { + // If the flush failed, it means there are parallel updates to the segment in the background. + // In this case, we try again. + continue; + } + const auto new_segment = segmentMergeDelta(*dm_context, segment, run_thread); if (new_segment) { @@ -1347,6 +1362,12 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const && (delta_rows - delta_last_try_flush_rows >= delta_cache_limit_rows || delta_bytes - delta_last_try_flush_bytes >= delta_cache_limit_bytes); bool should_foreground_flush = unsaved_rows >= delta_cache_limit_rows * 3 || unsaved_bytes >= delta_cache_limit_bytes * 3; + /// For the write thread, we want to avoid foreground flushes blocking the processing of raft commands. + /// So we increase the foreground flush threshold for the write thread. + if (thread_type == ThreadType::Write) + { + should_foreground_flush = unsaved_rows >= delta_cache_limit_rows * 10 || unsaved_bytes >= delta_cache_limit_bytes * 10; + } bool should_background_merge_delta = ((delta_check_rows >= delta_limit_rows || delta_check_bytes >= delta_limit_bytes) // && (delta_rows - delta_last_try_merge_delta_rows >= delta_cache_limit_rows @@ -1404,9 +1425,16 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const } else if (should_background_flush) { - delta_last_try_flush_rows = delta_rows; - delta_last_try_flush_bytes = delta_bytes; - try_add_background_task(BackgroundTask{TaskType::Flush, dm_context, segment, {}}); + /// It's meaningless to add more flush tasks if the segment is already flushing, + /// because only one flush task can proceed at any time. + /// After the current flush task finishes, + /// it will call `checkSegmentUpdate` again to check whether there are more flush tasks to do. + if (!segment->isFlushing()) + { + delta_last_try_flush_rows = delta_rows; + delta_last_try_flush_bytes = delta_bytes; + try_add_background_task(BackgroundTask{TaskType::Flush, dm_context, segment, {}}); + } } } @@ -1502,7 +1530,12 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const return false; }; auto try_bg_compact = [&]() { - if (should_compact) + /// The compact task should be a really low priority task, + /// and if the segment is flushing, + /// we should avoid adding a background compact task, to reduce lock contention on the segment and save disk throughput. + /// After the current flush task completes, + /// it will call `checkSegmentUpdate` again to check whether there are other kinds of tasks to do.
+ if (should_compact && !segment->isFlushing()) { delta_last_try_compact_column_files = column_file_count; try_add_background_task(BackgroundTask{TaskType::Compact, dm_context, segment, {}}); diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h index 705481ca107..36a72d3cda5 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h @@ -51,7 +51,7 @@ inline static const PageId DELTA_MERGE_FIRST_SEGMENT_ID = 1; struct SegmentStat { - UInt64 segment_id; + UInt64 segment_id = 0; RowKeyRange range; UInt64 rows = 0; @@ -145,9 +145,6 @@ struct DeltaMergeStoreStat UInt64 background_tasks_length = 0; }; -// It is used to prevent hash conflict of file caches. -static std::atomic DELTA_MERGE_STORE_HASH_SALT{0}; - class DeltaMergeStore : private boost::noncopyable { public: @@ -367,14 +364,14 @@ class DeltaMergeStore : private boost::noncopyable const SegmentIdSet & read_segments = {}, size_t extra_table_id_index = InvalidColumnID); - /// Force flush all data to disk. - void flushCache(const Context & context, const RowKeyRange & range) + /// Try to flush all data in `range` to disk and return whether the task succeeds. + bool flushCache(const Context & context, const RowKeyRange & range, bool try_until_succeed = true) { auto dm_context = newDMContext(context, context.getSettingsRef()); - flushCache(dm_context, range); + return flushCache(dm_context, range, try_until_succeed); } - void flushCache(const DMContextPtr & dm_context, const RowKeyRange & range); + bool flushCache(const DMContextPtr & dm_context, const RowKeyRange & range, bool try_until_succeed = true); /// Merge delta into the stable layer for all segments. /// @@ -386,7 +383,7 @@ class DeltaMergeStore : private boost::noncopyable /// If there is no segment found by the start key, nullopt is returned. /// /// This function is called when using `ALTER TABLE [TABLE] COMPACT ...` from TiDB. - std::optional mergeDeltaBySegment(const Context & context, const DM::RowKeyValue & start_key, const TaskRunThread run_thread); + std::optional mergeDeltaBySegment(const Context & context, const DM::RowKeyValue & start_key, TaskRunThread run_thread); /// Compact the delta layer, merging multiple fragmented delta files into larger ones. /// This is a minor compaction as it does not merge the delta into stable layer.
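The `is_flushing` flag introduced in DeltaValueSpace above is a classic single-flight guard: the first thread to win the compare-and-swap performs the flush, every other thread returns immediately, and the flag is restored on scope exit. The scheduling code in DeltaMergeStore.cpp then consults the same flag through `segment->isFlushing()` to avoid queueing redundant flush and compact tasks. A condensed sketch of the guard, assuming a bare `std::atomic<bool>` and a small RAII helper in place of `SCOPE_EXIT`:

```cpp
#include <atomic>
#include <cstdio>

std::atomic<bool> is_flushing{false};

bool flushOnce()
{
    bool expected = false;
    // Only one thread can switch the flag from false to true.
    if (!is_flushing.compare_exchange_strong(expected, true))
        return false; // another thread is flushing; our work would be wasted

    // RAII reset so the flag is cleared on every exit path, like SCOPE_EXIT.
    struct Reset
    {
        std::atomic<bool> & flag;
        ~Reset() { flag.store(false); }
    } reset{is_flushing};

    std::puts("performing the actual flush");
    return true;
}
```

Because losing threads return `false` instead of blocking, `flushCache` can report failure when `try_until_succeed` is false, and callers such as `mergeDeltaBySegment` simply retry in their own loop.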
diff --git a/dbms/src/Storages/DeltaMerge/DeltaTree.h b/dbms/src/Storages/DeltaMerge/DeltaTree.h index 47674ab2cfc..29e127fe35f 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaTree.h +++ b/dbms/src/Storages/DeltaMerge/DeltaTree.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -810,6 +811,20 @@ class DeltaTree template InternPtr afterNodeUpdated(T * node); +#ifdef __x86_64__ + template + InternPtr afterNodeUpdatedGeneric(T * node); + + template + InternPtr afterNodeUpdatedAVX512(T * node); + + template + InternPtr afterNodeUpdatedAVX(T * node); + + template + InternPtr afterNodeUpdatedSSE4(T * node); +#endif + inline void afterLeafUpdated(LeafPtr leaf) { if (leaf->count == 0 && isRootOnly()) @@ -1348,158 +1363,86 @@ typename DT_CLASS::InterAndSid DT_CLASS::submitMinSid(T * node, UInt64 subtree_m } } -DT_TEMPLATE -template -typename DT_CLASS::InternPtr DT_CLASS::afterNodeUpdated(T * node) +#ifndef __x86_64__ +#define TIFLASH_DT_IMPL_NAME afterNodeUpdated +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME +#else + +// generic implementation +#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedGeneric +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME + +// avx512 implementation +TIFLASH_BEGIN_AVX512_SPECIFIC_CODE +#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedAVX512 +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME +TIFLASH_END_TARGET_SPECIFIC_CODE + +// avx implementation +TIFLASH_BEGIN_AVX_SPECIFIC_CODE +#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedAVX +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME +TIFLASH_END_TARGET_SPECIFIC_CODE + +// sse4 implementation +TIFLASH_BEGIN_SSE4_SPECIFIC_CODE +#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedSSE4 +#include "DeltaTree.ipp" +#undef TIFLASH_DT_IMPL_NAME +TIFLASH_END_TARGET_SPECIFIC_CODE + +namespace Impl { - if (!node) - return {}; - - constexpr bool is_leaf = std::is_same::value; +enum class DeltaTreeVariant +{ + Generic, + SSE4, + AVX, + AVX512 +}; - if (root == asNode(node) && !isLeaf(root) && node->count == 1) +static inline DeltaTreeVariant resolveDeltaTreeVariant() +{ + if (DB::TargetSpecific::AVX512Checker::runtimeSupport()) { - /// Decrease tree height. - root = as(Intern, root)->children[0]; - - --(node->count); - freeNode(node); - - if (isLeaf(root)) - as(Leaf, root)->parent = nullptr; - else - as(Intern, root)->parent = nullptr; - --height; - - LOG_FMT_TRACE(log, "height {} -> {}", (height + 1), height); - - return {}; + return DeltaTreeVariant::AVX512; } - - auto parent = node->parent; - bool parent_updated = false; - - if (T::overflow(node->count)) // split + if (DB::TargetSpecific::AVXChecker::runtimeSupport()) { - if (!parent) - { - /// Increase tree height. 
- parent = createNode(); - root = asNode(parent); - - parent->deltas[0] = checkDelta(node->getDelta()); - parent->children[0] = asNode(node); - ++(parent->count); - parent->refreshChildParent(); - - ++height; - - LOG_FMT_TRACE(log, "height {} -> {}", (height - 1), height); - } - - auto pos = parent->searchChild(asNode(node)); - - T * next_n = createNode(); - - UInt64 sep_sid = node->split(next_n); - - // handle parent update - parent->shiftEntries(pos + 1, 1); - // for current node - parent->deltas[pos] = checkDelta(node->getDelta()); - // for next node - parent->sids[pos] = sep_sid; - parent->deltas[pos + 1] = checkDelta(next_n->getDelta()); - parent->children[pos + 1] = asNode(next_n); - - ++(parent->count); - - if constexpr (is_leaf) - { - if (as(Leaf, node) == right_leaf) - right_leaf = as(Leaf, next_n); - } - - parent_updated = true; + return DeltaTreeVariant::AVX; } - else if (T::underflow(node->count) && root != asNode(node)) // adopt or merge + if (DB::TargetSpecific::SSE4Checker::runtimeSupport()) { - auto pos = parent->searchChild(asNode(node)); - - // currently we always adopt from the right one if possible - bool is_sibling_left; - size_t sibling_pos; - T * sibling; - - if (unlikely(parent->count <= 1)) - throw Exception("Unexpected parent entry count: " + DB::toString(parent->count)); - - if (pos == parent->count - 1) - { - is_sibling_left = true; - sibling_pos = pos - 1; - sibling = as(T, parent->children[sibling_pos]); - } - else - { - is_sibling_left = false; - sibling_pos = pos + 1; - sibling = as(T, parent->children[sibling_pos]); - } - - if (unlikely(sibling->parent != node->parent)) - throw Exception("parent not the same"); - - auto after_adopt = (node->count + sibling->count) / 2; - if (T::underflow(after_adopt)) - { - // Do merge. - // adoption won't work because the sibling doesn't have enough entries. - - node->merge(sibling, is_sibling_left, pos); - freeNode(sibling); - - pos = std::min(pos, sibling_pos); - parent->deltas[pos] = checkDelta(node->getDelta()); - parent->children[pos] = asNode(node); - parent->shiftEntries(pos + 2, -1); - - if constexpr (is_leaf) - { - if (is_sibling_left && (as(Leaf, sibling) == left_leaf)) - left_leaf = as(Leaf, node); - else if (!is_sibling_left && as(Leaf, sibling) == right_leaf) - right_leaf = as(Leaf, node); - } - --(parent->count); - } - else - { - // Do adoption. 
- - auto adopt_count = after_adopt - node->count; - auto new_sep_sid = node->adopt(sibling, is_sibling_left, adopt_count, pos); + return DeltaTreeVariant::SSE4; + } + return DeltaTreeVariant::Generic; +} - parent->sids[std::min(pos, sibling_pos)] = new_sep_sid; - parent->deltas[pos] = checkDelta(node->getDelta()); - parent->deltas[sibling_pos] = checkDelta(sibling->getDelta()); - } +static inline DeltaTreeVariant DELTA_TREE_VARIANT = resolveDeltaTreeVariant(); +} // namespace Impl - parent_updated = true; - } - else if (parent) +DT_TEMPLATE +template +typename DT_CLASS::InternPtr DT_CLASS::afterNodeUpdated(T * node) +{ + switch (Impl::DELTA_TREE_VARIANT) { - auto pos = parent->searchChild(asNode(node)); - auto delta = node->getDelta(); - parent_updated = parent->deltas[pos] != delta; - parent->deltas[pos] = checkDelta(delta); + case Impl::DeltaTreeVariant::Generic: + return afterNodeUpdatedGeneric(node); + case Impl::DeltaTreeVariant::SSE4: + return afterNodeUpdatedSSE4(node); + case Impl::DeltaTreeVariant::AVX: + return afterNodeUpdatedAVX(node); + case Impl::DeltaTreeVariant::AVX512: + return afterNodeUpdatedAVX512(node); } - - if (parent_updated) - return parent; - else - return {}; } +#endif + #undef as #undef asNode diff --git a/dbms/src/Storages/DeltaMerge/DeltaTree.ipp b/dbms/src/Storages/DeltaMerge/DeltaTree.ipp new file mode 100644 index 00000000000..27b8a3b96f1 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/DeltaTree.ipp @@ -0,0 +1,165 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +DT_TEMPLATE +template +__attribute__((noinline, flatten)) typename DT_CLASS::InternPtr DT_CLASS::TIFLASH_DT_IMPL_NAME(T * node) +{ + if (!node) + return {}; + + constexpr bool is_leaf = std::is_same::value; + + if (root == asNode(node) && !isLeaf(root) && node->count == 1) + { + /// Decrease tree height. + root = as(Intern, root)->children[0]; + + --(node->count); + freeNode(node); + + if (isLeaf(root)) + as(Leaf, root)->parent = nullptr; + else + as(Intern, root)->parent = nullptr; + --height; + + LOG_FMT_TRACE(log, "height {} -> {}", (height + 1), height); + + return {}; + } + + auto parent = node->parent; + bool parent_updated = false; + + if (T::overflow(node->count)) // split + { + if (!parent) + { + /// Increase tree height. 
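The dispatch scheme above compiles one shared implementation file (`DeltaTree.ipp`) several times under different target-specific attributes, probes the CPU once at startup, and afterwards pays only a switch on a cached enum per call. A condensed sketch of the same runtime-dispatch idea, using GCC/Clang's `__builtin_cpu_supports` in place of the `TargetSpecific` runtime checkers (function names here are illustrative, not the project's):

```cpp
enum class Variant
{
    Generic,
    AVX2
};

static Variant resolveVariant()
{
    // Probe the CPU once; the result is cached in the static below.
    return __builtin_cpu_supports("avx2") ? Variant::AVX2 : Variant::Generic;
}

static const Variant VARIANT = resolveVariant();

// Same body, compiled with AVX2 enabled so the loop may auto-vectorize.
__attribute__((target("avx2"))) static long sumAVX2(const int * p, int n)
{
    long s = 0;
    for (int i = 0; i < n; ++i)
        s += p[i];
    return s;
}

static long sumGeneric(const int * p, int n)
{
    long s = 0;
    for (int i = 0; i < n; ++i)
        s += p[i];
    return s;
}

long sum(const int * p, int n)
{
    switch (VARIANT)
    {
    case Variant::AVX2:
        return sumAVX2(p, n);
    default:
        return sumGeneric(p, n);
    }
}
```

Resolving the variant once avoids re-probing CPU features on every call, at the price of a single highly predictable branch.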
+ parent = createNode(); + root = asNode(parent); + + parent->deltas[0] = checkDelta(node->getDelta()); + parent->children[0] = asNode(node); + ++(parent->count); + parent->refreshChildParent(); + + ++height; + + LOG_FMT_TRACE(log, "height {} -> {}", (height - 1), height); + } + + auto pos = parent->searchChild(asNode(node)); + + T * next_n = createNode(); + + UInt64 sep_sid = node->split(next_n); + + // handle parent update + parent->shiftEntries(pos + 1, 1); + // for current node + parent->deltas[pos] = checkDelta(node->getDelta()); + // for next node + parent->sids[pos] = sep_sid; + parent->deltas[pos + 1] = checkDelta(next_n->getDelta()); + parent->children[pos + 1] = asNode(next_n); + + ++(parent->count); + + if constexpr (is_leaf) + { + if (as(Leaf, node) == right_leaf) + right_leaf = as(Leaf, next_n); + } + + parent_updated = true; + } + else if (T::underflow(node->count) && root != asNode(node)) // adopt or merge + { + auto pos = parent->searchChild(asNode(node)); + + // currently we always adopt from the right one if possible + bool is_sibling_left; + size_t sibling_pos; + T * sibling; + + if (unlikely(parent->count <= 1)) + throw Exception("Unexpected parent entry count: " + DB::toString(parent->count)); + + if (pos == parent->count - 1) + { + is_sibling_left = true; + sibling_pos = pos - 1; + sibling = as(T, parent->children[sibling_pos]); + } + else + { + is_sibling_left = false; + sibling_pos = pos + 1; + sibling = as(T, parent->children[sibling_pos]); + } + + if (unlikely(sibling->parent != node->parent)) + throw Exception("parent not the same"); + + auto after_adopt = (node->count + sibling->count) / 2; + if (T::underflow(after_adopt)) + { + // Do merge. + // adoption won't work because the sibling doesn't have enough entries. + + node->merge(sibling, is_sibling_left, pos); + freeNode(sibling); + + pos = std::min(pos, sibling_pos); + parent->deltas[pos] = checkDelta(node->getDelta()); + parent->children[pos] = asNode(node); + parent->shiftEntries(pos + 2, -1); + + if constexpr (is_leaf) + { + if (is_sibling_left && (as(Leaf, sibling) == left_leaf)) + left_leaf = as(Leaf, node); + else if (!is_sibling_left && as(Leaf, sibling) == right_leaf) + right_leaf = as(Leaf, node); + } + --(parent->count); + } + else + { + // Do adoption. 
+ + auto adopt_count = after_adopt - node->count; + auto new_sep_sid = node->adopt(sibling, is_sibling_left, adopt_count, pos); + + parent->sids[std::min(pos, sibling_pos)] = new_sep_sid; + parent->deltas[pos] = checkDelta(node->getDelta()); + parent->deltas[sibling_pos] = checkDelta(sibling->getDelta()); + } + + parent_updated = true; + } + else if (parent) + { + auto pos = parent->searchChild(asNode(node)); + auto delta = node->getDelta(); + parent_updated = parent->deltas[pos] != delta; + parent->deltas[pos] = checkDelta(delta); + } + + if (parent_updated) + return parent; + else + return {}; +} \ No newline at end of file diff --git a/dbms/src/Storages/DeltaMerge/Segment.h b/dbms/src/Storages/DeltaMerge/Segment.h index cccfc5091b9..8058329ae91 100644 --- a/dbms/src/Storages/DeltaMerge/Segment.h +++ b/dbms/src/Storages/DeltaMerge/Segment.h @@ -300,6 +300,8 @@ class Segment : private boost::noncopyable void drop(const FileProviderPtr & file_provider, WriteBatches & wbs); + bool isFlushing() const { return delta->isFlushing(); } + RowsAndBytes getRowsAndBytesInRange( DMContext & dm_context, const SegmentSnapshotPtr & segment_snap, diff --git a/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h b/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h index b35dae0cbe2..84fafbc46ef 100644 --- a/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h +++ b/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h @@ -273,7 +273,8 @@ class DMTestEnv DataTypePtr pk_type = EXTRA_HANDLE_COLUMN_INT_TYPE, bool is_common_handle = false, size_t rowkey_column_size = 1, - bool with_internal_columns = true) + bool with_internal_columns = true, + bool is_deleted = false) { Block block; const size_t num_rows = (end - beg); @@ -324,7 +325,7 @@ class DMTestEnv VERSION_COLUMN_ID)); // tag_col block.insert(DB::tests::createColumn( - std::vector(num_rows, 0), + std::vector(num_rows, is_deleted), TAG_COLUMN_NAME, TAG_COLUMN_ID)); } diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp index d46e1b7aa36..b7913c44a2c 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp @@ -3762,6 +3762,55 @@ try CATCH +// Verify that unflushed data will also be compacted. +TEST_P(DeltaMergeStoreMergeDeltaBySegmentTest, Flush) +try +{ + { + // Write data to first 3 segments and flush. + auto newly_written_rows = helper->rows_by_segments[0] + helper->rows_by_segments[1] + helper->rows_by_segments[2]; + Block block = DMTestEnv::prepareSimpleWriteBlock(0, newly_written_rows, false, pk_type, 5 /* new tso */); + store->write(*db_context, db_context->getSettingsRef(), block); + store->flushCache(dm_context, RowKeyRange::newAll(store->isCommonHandle(), store->getRowKeyColumnSize())); + + helper->expected_delta_rows[0] += helper->rows_by_segments[0]; + helper->expected_delta_rows[1] += helper->rows_by_segments[1]; + helper->expected_delta_rows[2] += helper->rows_by_segments[2]; + helper->verifyExpectedRowsForAllSegments(); + + auto segment1 = std::next(store->segments.begin())->second; + ASSERT_EQ(segment1->getDelta()->getUnsavedRows(), 0); + } + { + // Write new data to segment[1] without flush. 
+ auto newly_written_rows = helper->rows_by_segments[1]; + Block block = DMTestEnv::prepareSimpleWriteBlock(helper->rows_by_segments[0], helper->rows_by_segments[0] + newly_written_rows, false, pk_type, 10 /* new tso */); + store->write(*db_context, db_context->getSettingsRef(), block); + + helper->expected_delta_rows[1] += helper->rows_by_segments[1]; + helper->verifyExpectedRowsForAllSegments(); + + auto segment1 = std::next(store->segments.begin())->second; + ASSERT_GT(segment1->getDelta()->getUnsavedRows(), 0); + } + { + auto segment1 = std::next(store->segments.begin())->second; + auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::TaskRunThread::Foreground); + ASSERT_NE(result, std::nullopt); + + segment1 = std::next(store->segments.begin())->second; + ASSERT_EQ(*result, segment1->getRowKeyRange()); + + helper->expected_stable_rows[1] += helper->expected_delta_rows[1]; + helper->expected_delta_rows[1] = 0; + helper->verifyExpectedRowsForAllSegments(); + + ASSERT_EQ(segment1->getDelta()->getUnsavedRows(), 0); + } +} +CATCH + + } // namespace tests } // namespace DM } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp new file mode 100644 index 00000000000..dc43ef3713b --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp @@ -0,0 +1,100 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace DM +{ +namespace tests +{ +class SegmentOperationTest : public SegmentTestBasic +{ +protected: + static void SetUpTestCase() {} +}; + +TEST_F(SegmentOperationTest, Issue4956) +try +{ + SegmentTestOptions options; + reloadWithOptions(options); + + // flush data so that the segment can be split.
+ writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); + // write data to cache, reproduce the https://github.com/pingcap/tiflash/issues/4956 + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + deleteRangeSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + auto segment_id = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + ASSERT_TRUE(segment_id.has_value()); + + mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id); +} +CATCH + +TEST_F(SegmentOperationTest, TestSegment) +try +{ + SegmentTestOptions options; + reloadWithOptions(options); + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); + mergeSegmentDelta(DELTA_MERGE_FIRST_SEGMENT_ID); + auto segment_id = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + ASSERT_TRUE(segment_id.has_value()); + + size_t origin_rows = getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID); + + writeSegment(*segment_id); + flushSegmentCache(*segment_id); + deleteRangeSegment(*segment_id); + writeSegmentWithDeletedPack(*segment_id); + mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id); + + EXPECT_EQ(getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID), origin_rows); +} +CATCH + +TEST_F(SegmentOperationTest, TestSegmentRandom) +try +{ + srand(time(nullptr)); + SegmentTestOptions options; + options.is_common_handle = true; + reloadWithOptions(options); + randomSegmentTest(100); +} +CATCH + +// run in CI weekly +TEST_F(SegmentOperationTest, DISABLED_TestSegmentRandomForCI) +try +{ + srand(time(nullptr)); + SegmentTestOptions options; + options.is_common_handle = true; + reloadWithOptions(options); + randomSegmentTest(10000); +} +CATCH + +} // namespace tests +} // namespace DM +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp new file mode 100644 index 00000000000..c676f2e08d5 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -0,0 +1,430 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
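`TestSegmentRandom` above drives the harness with `randomSegmentTest(100)`, which randomly composes the segment operations defined in gtest_segment_test_basic.cpp below. A hypothetical sketch of how such a driver loop is typically structured (the real operation list and selection logic live in `SegmentTestBasic` and are not shown in full here):

```cpp
#include <cstdlib>
#include <functional>
#include <vector>

// Hypothetical driver: each entry mutates the segment set and asserts its own
// invariants (row counts, key ranges) before returning.
struct RandomOpDriver
{
    std::vector<std::function<void()>> ops;

    void run(size_t rounds)
    {
        for (size_t i = 0; i < rounds; ++i)
            ops[static_cast<size_t>(std::rand()) % ops.size()]();
    }
};
```

Seeding with `srand(time(nullptr))`, as the tests above do, trades reproducibility for coverage; the weekly `DISABLED_TestSegmentRandomForCI` variant simply raises the round count to 10000.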
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace DM +{ +namespace tests +{ +void SegmentTestBasic::reloadWithOptions(SegmentTestOptions config) +{ + TiFlashStorageTestBasic::SetUp(); + options = config; + table_columns = std::make_shared(); + + root_segment = reload(config.is_common_handle); + ASSERT_EQ(root_segment->segmentId(), DELTA_MERGE_FIRST_SEGMENT_ID); + segments.clear(); + segments[DELTA_MERGE_FIRST_SEGMENT_ID] = root_segment; +} + +PageId SegmentTestBasic::createNewSegmentWithSomeData() +{ + SegmentPtr new_segment; + std::tie(root_segment, new_segment) = root_segment->split(dmContext(), tableColumns()); + + const size_t num_rows_write_per_batch = 100; + { + // write to segment and flush + Block block = DMTestEnv::prepareSimpleWriteBlock(0, num_rows_write_per_batch, false); + new_segment->write(dmContext(), std::move(block), true); + } + { + // write to segment and don't flush + Block block = DMTestEnv::prepareSimpleWriteBlock(num_rows_write_per_batch, 2 * num_rows_write_per_batch, false); + new_segment->write(dmContext(), std::move(block), false); + } + return new_segment->segmentId(); +} + +size_t SegmentTestBasic::getSegmentRowNumWithoutMVCC(PageId segment_id) +{ + auto segment = segments[segment_id]; + auto in = segment->getInputStreamRaw(dmContext(), *tableColumns()); + + size_t num_rows_read = 0; + in->readPrefix(); + while (Block block = in->read()) + { + num_rows_read += block.rows(); + } + in->readSuffix(); + return num_rows_read; +} + +size_t SegmentTestBasic::getSegmentRowNum(PageId segment_id) +{ + auto segment = segments[segment_id]; + auto in = segment->getInputStream(dmContext(), *tableColumns(), {segment->getRowKeyRange()}); + + size_t num_rows_read = 0; + in->readPrefix(); + while (Block block = in->read()) + { + num_rows_read += block.rows(); + } + in->readSuffix(); + return num_rows_read; +} + +void SegmentTestBasic::checkSegmentRow(PageId segment_id, size_t expected_row_num) +{ + auto segment = segments[segment_id]; + // read written data + auto in = segment->getInputStream(dmContext(), *tableColumns(), {segment->getRowKeyRange()}); + + size_t num_rows_read = 0; + in->readPrefix(); + while (Block block = in->read()) + { + num_rows_read += block.rows(); + } + in->readSuffix(); + ASSERT_EQ(num_rows_read, expected_row_num); +} + +std::optional SegmentTestBasic::splitSegment(PageId segment_id) +{ + auto origin_segment = segments[segment_id]; + size_t origin_segment_row_num = getSegmentRowNum(segment_id); + SegmentPtr segment, new_segment; + std::tie(segment, new_segment) = origin_segment->split(dmContext(), tableColumns()); + if (new_segment) + { + segments[new_segment->segmentId()] = new_segment; + segments[segment_id] = segment; + + EXPECT_EQ(origin_segment_row_num, getSegmentRowNum(segment_id) + getSegmentRowNum(new_segment->segmentId())); + return new_segment->segmentId(); + } + return std::nullopt; +} + +void SegmentTestBasic::mergeSegment(PageId left_segment_id, PageId right_segment_id) +{ + auto left_segment = segments[left_segment_id]; + auto right_segment = segments[right_segment_id]; + + size_t left_segment_row_num = getSegmentRowNum(left_segment_id); + size_t right_segment_row_num = getSegmentRowNum(right_segment_id); + LOG_FMT_TRACE(&Poco::Logger::root(), "merge in segment:{}:{} and {}:{}", left_segment->segmentId(), left_segment_row_num, right_segment->segmentId(), right_segment_row_num); + + SegmentPtr merged_segment = Segment::merge(dmContext(), tableColumns(), 
left_segment, right_segment); + segments[merged_segment->segmentId()] = merged_segment; + auto it = segments.find(right_segment->segmentId()); + if (it != segments.end()) + { + segments.erase(it); + } + EXPECT_EQ(getSegmentRowNum(merged_segment->segmentId()), left_segment_row_num + right_segment_row_num); +} + +void SegmentTestBasic::mergeSegmentDelta(PageId segment_id) +{ + auto segment = segments[segment_id]; + size_t segment_row_num = getSegmentRowNum(segment_id); + SegmentPtr merged_segment = segment->mergeDelta(dmContext(), tableColumns()); + segments[merged_segment->segmentId()] = merged_segment; + EXPECT_EQ(getSegmentRowNum(merged_segment->segmentId()), segment_row_num); +} + +void SegmentTestBasic::flushSegmentCache(PageId segment_id) +{ + auto segment = segments[segment_id]; + size_t segment_row_num = getSegmentRowNum(segment_id); + segment->flushCache(dmContext()); + EXPECT_EQ(getSegmentRowNum(segment_id), segment_row_num); +} + +std::pair SegmentTestBasic::getSegmentKeyRange(SegmentPtr segment) +{ + Int64 start_key, end_key; + if (!options.is_common_handle) + { + start_key = segment->getRowKeyRange().getStart().int_value; + end_key = segment->getRowKeyRange().getEnd().int_value; + return {start_key, end_key}; + } + EXPECT_EQ(segment->getRowKeyRange().getStart().data[0], TiDB::CodecFlagInt); + EXPECT_EQ(segment->getRowKeyRange().getEnd().data[0], TiDB::CodecFlagInt); + { + size_t cursor = 1; + start_key = DecodeInt64(cursor, String(segment->getRowKeyRange().getStart().data, segment->getRowKeyRange().getStart().size)); + } + { + size_t cursor = 1; + end_key = DecodeInt64(cursor, String(segment->getRowKeyRange().getEnd().data, segment->getRowKeyRange().getEnd().size)); + } + return {start_key, end_key}; +} + +void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) +{ + if (write_rows == 0) + { + return; + } + auto segment = segments[segment_id]; + size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); + std::pair keys = getSegmentKeyRange(segment); + Int64 start_key = keys.first; + Int64 end_key = keys.second; + UInt64 remain_row_num = 0; + if (static_cast(end_key - start_key) > write_rows) + { + end_key = start_key + write_rows; + } + else + { + remain_row_num = write_rows - static_cast(end_key - start_key); + } + { + // write to segment and not flush + Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, end_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle); + segment->write(dmContext(), std::move(block), false); + LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, end_key); + version++; + } + while (remain_row_num > 0) + { + UInt64 write_num = std::min(remain_row_num, static_cast(end_key - start_key)); + Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? 
+
+void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id)
+{
+    UInt64 write_rows = DEFAULT_MERGE_BLOCK_SIZE;
+    auto segment = segments[segment_id];
+    size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id);
+    std::pair<Int64, Int64> keys = getSegmentKeyRange(segment);
+    Int64 start_key = keys.first;
+    Int64 end_key = keys.second;
+    UInt64 remain_row_num = 0;
+    if (static_cast<UInt64>(end_key - start_key) > write_rows)
+    {
+        end_key = start_key + write_rows;
+    }
+    else
+    {
+        remain_row_num = write_rows - static_cast<UInt64>(end_key - start_key);
+    }
+    {
+        // write to segment and flush
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, end_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle, 1, true, true);
+        segment->write(dmContext(), std::move(block), true);
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, end_key);
+        version++;
+    }
+    while (remain_row_num > 0)
+    {
+        UInt64 write_num = std::min(remain_row_num, static_cast<UInt64>(end_key - start_key));
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle, 1, true, true);
+        segment->write(dmContext(), std::move(block), true);
+        remain_row_num -= write_num;
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, write_num + start_key);
+        version++;
+    }
+    EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows);
+}
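Both write helpers above cap each batch at the segment's key span and then wrap around, bumping `version` so later passes overwrite the same handles. The chunking arithmetic in isolation, as a standalone sketch (hypothetical names, not the patch's code; it assumes `end_key > start_key`):

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Sketch: split `write_rows` into [start, end) batches that never exceed the
// key span; later batches rewrite the same keys with a higher version,
// mirroring writeSegment()'s wrap-around loop.
std::vector<std::pair<int64_t, int64_t>> planWrites(int64_t start_key, int64_t end_key, uint64_t write_rows)
{
    std::vector<std::pair<int64_t, int64_t>> batches;
    const uint64_t span = static_cast<uint64_t>(end_key - start_key); // assumed > 0
    while (write_rows > 0)
    {
        uint64_t n = std::min(write_rows, span);
        batches.emplace_back(start_key, start_key + static_cast<int64_t>(n));
        write_rows -= n;
    }
    return batches;
}
```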
+
+void SegmentTestBasic::deleteRangeSegment(PageId segment_id)
+{
+    auto segment = segments[segment_id];
+    segment->write(dmContext(), /*delete_range*/ segment->getRowKeyRange());
+    EXPECT_EQ(getSegmentRowNum(segment_id), 0);
+}
+
+void SegmentTestBasic::writeRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start write segment:{}", random_segment_id);
+    writeSegment(random_segment_id);
+}
+void SegmentTestBasic::writeRandomSegmentWithDeletedPack()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start write segment with deleted pack:{}", random_segment_id);
+    writeSegmentWithDeletedPack(random_segment_id);
+}
+
+void SegmentTestBasic::deleteRangeRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start delete range segment:{}", random_segment_id);
+    deleteRangeSegment(random_segment_id);
+}
+
+void SegmentTestBasic::splitRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start split segment:{}", random_segment_id);
+    splitSegment(random_segment_id);
+}
+
+void SegmentTestBasic::mergeRandomSegment()
+{
+    if (segments.empty() || segments.size() == 1)
+    {
+        return;
+    }
+    std::pair<PageId, PageId> segment_pair;
+    segment_pair = getRandomMergeablePair();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start merge segment:{} and {}", segment_pair.first, segment_pair.second);
+    mergeSegment(segment_pair.first, segment_pair.second);
+}
+
+void SegmentTestBasic::mergeDeltaRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start merge delta in segment:{}", random_segment_id);
+    mergeSegmentDelta(random_segment_id);
+}
+
+void SegmentTestBasic::flushCacheRandomSegment()
+{
+    if (segments.empty())
+    {
+        return;
+    }
+    PageId random_segment_id = getRandomSegmentId();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start flush cache in segment:{}", random_segment_id);
+    flushSegmentCache(random_segment_id);
+}
+
+void SegmentTestBasic::randomSegmentTest(size_t operator_count)
+{
+    for (size_t i = 0; i < operator_count; i++)
+    {
+        auto op = static_cast<SegmentOperaterType>(random() % SegmentOperaterMax);
+        segment_operator_entries[op]();
+    }
+}
+
+PageId SegmentTestBasic::getRandomSegmentId()
+{
+    auto max_segment_id = segments.rbegin()->first;
+    PageId random_segment_id = random() % (max_segment_id + 1);
+    auto it = segments.find(random_segment_id);
+    while (it == segments.end())
+    {
+        random_segment_id = random() % (max_segment_id + 1);
+        it = segments.find(random_segment_id);
+    }
+    return random_segment_id;
+}
+
+std::pair<PageId, PageId> SegmentTestBasic::getRandomMergeablePair()
+{
+    while (true)
+    {
+        PageId random_left_segment_id = getRandomSegmentId();
+        PageId random_right_segment_id = random_left_segment_id;
+        while (random_right_segment_id == random_left_segment_id)
+        {
+            random_right_segment_id = getRandomSegmentId();
+        }
+        auto left_segment = segments[random_left_segment_id];
+        auto right_segment = segments[random_right_segment_id];
+        if (compare(left_segment->getRowKeyRange().getEnd(), right_segment->getRowKeyRange().getStart()) != 0 || left_segment->nextSegmentId() != right_segment->segmentId())
+        {
+            continue;
+        }
+        return {random_left_segment_id, random_right_segment_id};
+    }
+}
+
+RowKeyRange SegmentTestBasic::commanHandleKeyRange()
+{
+    String start_key, end_key;
+    {
+        WriteBufferFromOwnString ss;
+        ::DB::EncodeUInt(static_cast<UInt8>(TiDB::CodecFlagInt), ss);
+        ::DB::EncodeInt64(std::numeric_limits<Int64>::min(), ss);
+        start_key = ss.releaseStr();
+    }
+    {
+        WriteBufferFromOwnString ss;
+        ::DB::EncodeUInt(static_cast<UInt8>(TiDB::CodecFlagInt), ss);
+        ::DB::EncodeInt64(std::numeric_limits<Int64>::max(), ss);
+        end_key = ss.releaseStr();
+    }
+    return RowKeyRange(RowKeyValue(true, std::make_shared<String>(start_key), 0), RowKeyValue(true, std::make_shared<String>(end_key), 0), true, 1);
+}
+
+SegmentPtr SegmentTestBasic::reload(bool is_common_handle, const ColumnDefinesPtr & pre_define_columns, DB::Settings && db_settings)
+{
+    TiFlashStorageTestBasic::reload(std::move(db_settings));
+    storage_path_pool = std::make_unique<StoragePathPool>(db_context->getPathPool().withTable("test", "t1", false));
+    storage_pool = std::make_unique<StoragePool>(*db_context, /*ns_id*/ 100, *storage_path_pool, "test.t1");
+    storage_pool->restore();
+    ColumnDefinesPtr cols = (!pre_define_columns) ? DMTestEnv::getDefaultColumns(is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID) : pre_define_columns;
+    setColumns(cols);
+
+    return Segment::newSegment(*dm_context, table_columns, is_common_handle ? commanHandleKeyRange() : RowKeyRange::newAll(is_common_handle, 1), storage_pool->newMetaPageId(), 0);
+}
+
+void SegmentTestBasic::setColumns(const ColumnDefinesPtr & columns)
+{
+    *table_columns = *columns;
+
+    dm_context = std::make_unique<DMContext>(*db_context,
+                                             *storage_path_pool,
+                                             *storage_pool,
+                                             0,
+                                             /*min_version_*/ 0,
+                                             settings.not_compress_columns,
+                                             options.is_common_handle,
+                                             1,
+                                             db_context->getSettingsRef());
+}
+} // namespace tests
+} // namespace DM
+} // namespace DB
diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h
new file mode 100644
index 00000000000..ab0c7d6d0be
--- /dev/null
+++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h
@@ -0,0 +1,123 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace DB
+{
+namespace DM
+{
+namespace tests
+{
+class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic
+{
+public:
+    struct SegmentTestOptions
+    {
+        bool is_common_handle = false;
+    };
+
+public:
+    void reloadWithOptions(SegmentTestOptions config);
+
+    std::optional<PageId> splitSegment(PageId segment_id);
+    void mergeSegment(PageId left_segment_id, PageId right_segment_id);
+    void mergeSegmentDelta(PageId segment_id);
+    void flushSegmentCache(PageId segment_id);
+    void writeSegment(PageId segment_id, UInt64 write_rows = 100);
+    void writeSegmentWithDeletedPack(PageId segment_id);
+    void deleteRangeSegment(PageId segment_id);
+
+
+    void writeRandomSegment();
+    void writeRandomSegmentWithDeletedPack();
+    void deleteRangeRandomSegment();
+    void splitRandomSegment();
+    void mergeRandomSegment();
+    void mergeDeltaRandomSegment();
+    void flushCacheRandomSegment();
+
+    void randomSegmentTest(size_t operator_count);
+
+    PageId createNewSegmentWithSomeData();
+    size_t getSegmentRowNumWithoutMVCC(PageId segment_id);
+    size_t getSegmentRowNum(PageId segment_id);
+    void checkSegmentRow(PageId segment_id, size_t expected_row_num);
+    std::pair<Int64, Int64> getSegmentKeyRange(SegmentPtr segment);
+
+protected:
+    //
+    std::map<PageId, SegmentPtr> segments;
+
+    enum SegmentOperaterType
+    {
+        Write = 0,
+        DeleteRange,
+        Split,
+        Merge,
+        MergeDelta,
+        FlushCache,
+        WriteDeletedPack,
+        SegmentOperaterMax
+    };
+
+    const std::vector<std::function<void()>> segment_operator_entries = {
+        [this] { writeRandomSegment(); },
+        [this] { deleteRangeRandomSegment(); },
+        [this] { splitRandomSegment(); },
+        [this] { mergeRandomSegment(); },
+        [this] { mergeDeltaRandomSegment(); },
+        [this] { flushCacheRandomSegment(); },
+        [this] {
+            writeRandomSegmentWithDeletedPack();
+        }};
+
+    PageId getRandomSegmentId();
+
+    std::pair<PageId, PageId> getRandomMergeablePair();
+
+    RowKeyRange commanHandleKeyRange();
+
+    SegmentPtr reload(bool is_common_handle, const ColumnDefinesPtr & pre_define_columns = {}, DB::Settings && db_settings = DB::Settings());
+
+    // setColumns should update dm_context at the same time
+    void setColumns(const ColumnDefinesPtr & columns);
+
+    const ColumnDefinesPtr & tableColumns() const { return table_columns; }
+
+    DMContext & dmContext() { return *dm_context; }
+
+protected:
+    /// all these vars live as refs in dm_context
+    std::unique_ptr<StoragePathPool> storage_path_pool;
+    std::unique_ptr<StoragePool> storage_pool;
+    /// dm_context
+    std::unique_ptr<DMContext> dm_context;
+    ColumnDefinesPtr table_columns;
+    DM::DeltaMergeStore::Settings settings;
+
+    SegmentPtr root_segment;
+    UInt64 version = 0;
+    SegmentTestOptions options;
+};
+} // namespace tests
+} // namespace DM
+} // namespace DB
\ No newline at end of file
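randomSegmentTest() (declared above) drives the harness by indexing `segment_operator_entries` with a random enum value. That dispatch pattern reduces to an enum-sized table of `std::function`s; a self-contained sketch with hypothetical names:

```cpp
#include <cstdlib>
#include <functional>
#include <iostream>
#include <vector>

// Sketch: an enum-indexed operation table, as used by randomSegmentTest().
// The enum's last member doubles as the table size and the modulus.
enum OpType { Write = 0, Split, Merge, OpMax };

int main()
{
    std::vector<std::function<void()>> ops = {
        [] { std::cout << "write\n"; },
        [] { std::cout << "split\n"; },
        [] { std::cout << "merge\n"; },
    };
    for (int i = 0; i < 10; ++i)
        ops[std::rand() % OpMax](); // pick a random operation, as the test does
}
```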
diff --git a/dbms/src/Storages/DeltaMerge/tests/stress/DMStressProxy.cpp b/dbms/src/Storages/DeltaMerge/tests/stress/DMStressProxy.cpp
index 98cb8ef34e7..390d4001432 100644
--- a/dbms/src/Storages/DeltaMerge/tests/stress/DMStressProxy.cpp
+++ b/dbms/src/Storages/DeltaMerge/tests/stress/DMStressProxy.cpp
@@ -26,7 +26,7 @@ namespace tests
 {
 IDGenerator pk{0};
-IDGenerator tso{StopWatchDetail::nanoseconds(CLOCK_MONOTONIC)};
+IDGenerator tso{clock_gettime_ns(CLOCK_MONOTONIC)};
 
 template <typename T>
 void insertColumn(Block & block, const DataTypePtr & type, const String & name, Int64 col_id, const std::vector<T> & values)
@@ -68,6 +68,7 @@ DMStressProxy::DMStressProxy(const StressOptions & opts_)
         /* num_streams= */ 1,
         /* max_version= */ tso.get(),
         EMPTY_FILTER,
+        /* tracing_id= */ "",
         /* expected_block_size= */ 1024)[0];
     while (Block block = in->read())
     {
@@ -199,6 +200,7 @@ UInt64 DMStressProxy::countRows(UInt32 rnd_break_prob)
         /* num_streams= */ 1,
         /* max_version= */ tso.get(),
         EMPTY_FILTER,
+        /* tracing_id= */ "",
         /* expected_block_size= */ 1024)[0];
 
     UInt64 total_count = 0;
@@ -428,6 +430,7 @@ void DMStressProxy::verify()
         /* num_streams= */ 1,
         /* max_version= */ tso.get(),
         EMPTY_FILTER,
+        /* tracing_id= */ "",
         /* expected_block_size= */ 1024)[0];
     UInt64 dm_total_count = 0;
     while (Block block = in->read())
diff --git a/dbms/src/Storages/DeltaMerge/tests/stress/DMStressProxy.h b/dbms/src/Storages/DeltaMerge/tests/stress/DMStressProxy.h
index 0571eafae83..6f004ed6959 100644
--- a/dbms/src/Storages/DeltaMerge/tests/stress/DMStressProxy.h
+++ b/dbms/src/Storages/DeltaMerge/tests/stress/DMStressProxy.h
@@ -54,7 +54,7 @@ template <typename T>
 class IDGenerator
 {
 public:
-    IDGenerator(T t_)
+    explicit IDGenerator(T t_)
         : t(t_)
     {}
     std::vector<T> get(Int32 count)
@@ -81,19 +81,21 @@ class KeyLock
 public:
     static constexpr UInt32 default_key_lock_slot_count = 4096;
-    KeyLock(UInt32 slot_count = default_key_lock_slot_count)
+    explicit KeyLock(UInt32 slot_count = default_key_lock_slot_count)
         : key_rmutexs(slot_count)
     {}
     std::vector<std::unique_lock<std::recursive_mutex>> getLocks(const std::vector<Int64> & keys)
     {
         std::vector<UInt32> idxs;
+        idxs.reserve(keys.size());
         for (Int64 key : keys)
         {
             idxs.push_back(getLockIdx(key));
         }
         sort(idxs.begin(), idxs.end()); // Sort mutex to avoid dead lock.
         std::vector<std::unique_lock<std::recursive_mutex>> locks;
+        locks.reserve(idxs.size());
         for (UInt32 i : idxs)
         {
             locks.push_back(getLockByIdx(i));
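KeyLock::getLocks sorts the slot indexes before acquiring them, so every thread takes mutexes in one global order and overlapping key sets cannot deadlock. A self-contained sketch of the idea (hypothetical types; the real KeyLock tolerates duplicate slots by using recursive mutexes, while this sketch deduplicates instead):

```cpp
#include <algorithm>
#include <mutex>
#include <vector>

// Sketch: acquire a set of mutexes in a globally consistent (sorted) order,
// so two threads locking overlapping sets can never deadlock on each other.
std::vector<std::unique_lock<std::mutex>> lockAll(std::vector<std::mutex> & slots, std::vector<size_t> idxs)
{
    std::sort(idxs.begin(), idxs.end());
    idxs.erase(std::unique(idxs.begin(), idxs.end()), idxs.end()); // avoid locking a slot twice
    std::vector<std::unique_lock<std::mutex>> locks;
    locks.reserve(idxs.size());
    for (size_t i : idxs)
        locks.emplace_back(slots[i]); // locks on construction
    return locks; // all released when the vector goes out of scope
}
```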
@@ -121,7 +123,7 @@ class KeySet
 {
 public:
     static constexpr UInt32 default_key_set_slot_count = 4096;
-    KeySet(UInt32 slot_count = default_key_set_slot_count)
+    explicit KeySet(UInt32 slot_count = default_key_set_slot_count)
         : key_set_mutexs(slot_count)
         , key_sets(slot_count)
     {}
@@ -181,7 +183,7 @@ class KeySet
 class DMStressProxy
 {
 public:
-    DMStressProxy(const StressOptions & opts_);
+    explicit DMStressProxy(const StressOptions & opts_);
 
     void run();
diff --git a/dbms/src/Storages/DeltaMerge/workload/TimestampGenerator.h b/dbms/src/Storages/DeltaMerge/workload/TimestampGenerator.h
index 816820230df..56eb47f30ee 100644
--- a/dbms/src/Storages/DeltaMerge/workload/TimestampGenerator.h
+++ b/dbms/src/Storages/DeltaMerge/workload/TimestampGenerator.h
@@ -24,7 +24,7 @@ class TimestampGenerator
 {
 public:
     TimestampGenerator()
-        : t(StopWatchDetail::nanoseconds(CLOCK_MONOTONIC))
+        : t(clock_gettime_ns(CLOCK_MONOTONIC))
     {}
 
     std::vector<UInt64> get(int count)
diff --git a/dbms/src/Storages/IManageableStorage.h b/dbms/src/Storages/IManageableStorage.h
index ebf84c592e4..2ff766a9c6d 100644
--- a/dbms/src/Storages/IManageableStorage.h
+++ b/dbms/src/Storages/IManageableStorage.h
@@ -68,7 +68,7 @@ class IManageableStorage : public IStorage
 
     virtual void flushCache(const Context & /*context*/) {}
 
-    virtual void flushCache(const Context & /*context*/, const DM::RowKeyRange & /*range_to_flush*/) {}
+    virtual bool flushCache(const Context & /*context*/, const DM::RowKeyRange & /*range_to_flush*/, [[maybe_unused]] bool try_until_succeed = true) { return true; }
 
     virtual BlockInputStreamPtr status() { return {}; }
diff --git a/dbms/src/Storages/Page/V3/PageDirectory.cpp b/dbms/src/Storages/Page/V3/PageDirectory.cpp
index 5eb275f5af5..951da42de1c 100644
--- a/dbms/src/Storages/Page/V3/PageDirectory.cpp
+++ b/dbms/src/Storages/Page/V3/PageDirectory.cpp
@@ -478,7 +478,7 @@ PageSize VersionedPageEntries::getEntriesByBlobIds(
 bool VersionedPageEntries::cleanOutdatedEntries(
     UInt64 lowest_seq,
     std::map<PageIdV3Internal, std::pair<PageVersion, Int64>> * normal_entries_to_deref,
-    PageEntriesV3 & entries_removed,
+    PageEntriesV3 * entries_removed,
     const PageLock & /*page_lock*/)
 {
     if (type == EditRecordType::VAR_EXTERNAL)
@@ -541,7 +541,10 @@ bool VersionedPageEntries::cleanOutdatedEntries(
         {
             if (iter->second.being_ref_count == 1)
             {
-                entries_removed.emplace_back(iter->second.entry);
+                if (entries_removed)
+                {
+                    entries_removed->emplace_back(iter->second.entry);
+                }
                 iter = entries.erase(iter);
             }
             // The `being_ref_count` for this version is valid. While for older versions,
@@ -551,7 +554,10 @@ bool VersionedPageEntries::cleanOutdatedEntries(
         else
         {
             // else there are newer "entry" in the version list, the outdated entries should be removed
-            entries_removed.emplace_back(iter->second.entry);
+            if (entries_removed)
+            {
+                entries_removed->emplace_back(iter->second.entry);
+            }
             iter = entries.erase(iter);
         }
     }
@@ -564,7 +570,7 @@ bool VersionedPageEntries::cleanOutdatedEntries(
     return entries.empty() || (entries.size() == 1 && entries.begin()->second.isDelete());
 }
 
-bool VersionedPageEntries::derefAndClean(UInt64 lowest_seq, PageIdV3Internal page_id, const PageVersion & deref_ver, const Int64 deref_count, PageEntriesV3 & entries_removed)
+bool VersionedPageEntries::derefAndClean(UInt64 lowest_seq, PageIdV3Internal page_id, const PageVersion & deref_ver, const Int64 deref_count, PageEntriesV3 * entries_removed)
 {
     auto page_lock = acquireLock();
     if (type == EditRecordType::VAR_EXTERNAL)
@@ -1239,7 +1245,7 @@ bool PageDirectory::tryDumpSnapshot(const ReadLimiterPtr & read_limiter, const W
     return done_any_io;
 }
 
-PageEntriesV3 PageDirectory::gcInMemEntries()
+PageEntriesV3 PageDirectory::gcInMemEntries(bool return_removed_entries)
 {
     UInt64 lowest_seq = sequence.load();
@@ -1303,7 +1309,7 @@ PageEntriesV3 PageDirectory::gcInMemEntries()
         const bool all_deleted = iter->second->cleanOutdatedEntries(
             lowest_seq,
             &normal_entries_to_deref,
-            all_del_entries,
+            return_removed_entries ? &all_del_entries : nullptr,
             iter->second->acquireLock());
 
         {
@@ -1342,7 +1348,7 @@ PageEntriesV3 PageDirectory::gcInMemEntries()
             page_id,
             /*deref_ver=*/deref_counter.first,
             /*deref_count=*/deref_counter.second,
-            all_del_entries);
+            return_removed_entries ? &all_del_entries : nullptr);
 
         if (all_deleted)
         {
diff --git a/dbms/src/Storages/Page/V3/PageDirectory.h b/dbms/src/Storages/Page/V3/PageDirectory.h
index bd7c433022f..2f0f09f4e42 100644
--- a/dbms/src/Storages/Page/V3/PageDirectory.h
+++ b/dbms/src/Storages/Page/V3/PageDirectory.h
@@ -223,14 +223,14 @@ class VersionedPageEntries
     bool cleanOutdatedEntries(
         UInt64 lowest_seq,
         std::map<PageIdV3Internal, std::pair<PageVersion, Int64>> * normal_entries_to_deref,
-        PageEntriesV3 & entries_removed,
+        PageEntriesV3 * entries_removed,
         const PageLock & page_lock);
     bool derefAndClean(
         UInt64 lowest_seq,
         PageIdV3Internal page_id,
         const PageVersion & deref_ver,
         Int64 deref_count,
-        PageEntriesV3 & entries_removed);
+        PageEntriesV3 * entries_removed);
 
     void collapseTo(UInt64 seq, PageIdV3Internal page_id, PageEntriesEdit & edit);
@@ -360,7 +360,9 @@ class PageDirectory
 
     bool tryDumpSnapshot(const ReadLimiterPtr & read_limiter = nullptr, const WriteLimiterPtr & write_limiter = nullptr);
 
-    PageEntriesV3 gcInMemEntries();
+    // Perform a GC for in-memory entries and return the removed entries.
+    // If `return_removed_entries` is false, then just return an empty set.
+    PageEntriesV3 gcInMemEntries(bool return_removed_entries = true);
 
     std::set<PageId> getAliveExternalIds(NamespaceId ns_id) const;
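The `&` to `*` change above makes the out-parameter optional: a caller that does not need the removed entries passes nullptr and the collection cost disappears. The shape of the pattern as a generic sketch (hypothetical types, not the patch's code):

```cpp
#include <vector>

struct Entry { int id = 0; };

// Sketch: an optional out-parameter via a nullable pointer. Passing nullptr
// means "don't collect", so the hot path skips building a vector nobody reads.
bool cleanEntries(std::vector<Entry> & live, std::vector<Entry> * removed /* nullable */)
{
    while (!live.empty())
    {
        if (removed)
            removed->push_back(live.back()); // only collected on request
        live.pop_back();
    }
    return live.empty();
}
```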
diff --git a/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp b/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp
index 483c5073ab5..968049a3273 100644
--- a/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp
+++ b/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp
@@ -44,7 +44,8 @@ PageDirectoryPtr PageDirectoryFactory::createFromReader(String storage_name, WAL
     // After restoring from the disk, we need cleanup all invalid entries in memory, or it will
     // try to run GC again on some entries that are already marked as invalid in BlobStore.
-    dir->gcInMemEntries();
+    // There is no need to remove the expired entries in BlobStore, so skip filling removed_entries to improve performance.
+    dir->gcInMemEntries(/*return_removed_entries=*/false);
     LOG_FMT_INFO(DB::Logger::get("PageDirectoryFactory", storage_name), "PageDirectory restored [max_page_id={}] [max_applied_ver={}]", dir->getMaxId(), dir->sequence);
 
     if (blob_stats)
@@ -84,7 +85,8 @@ PageDirectoryPtr PageDirectoryFactory::createFromEdit(String storage_name, FileP
     // After restoring from the disk, we need cleanup all invalid entries in memory, or it will
     // try to run GC again on some entries that are already marked as invalid in BlobStore.
-    dir->gcInMemEntries();
+    // There is no need to remove the expired entries in BlobStore during restore, so no need to fill removed_entries.
+    dir->gcInMemEntries(/*return_removed_entries=*/false);
 
     if (blob_stats)
     {
diff --git a/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp b/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp
index 83e07f75d37..6d6ef41630f 100644
--- a/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp
+++ b/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp
@@ -644,14 +644,14 @@ class VersionedEntriesTest : public ::testing::Test
     {
         DerefCounter deref_counter;
         PageEntriesV3 removed_entries;
-        bool all_removed = entries.cleanOutdatedEntries(seq, &deref_counter, removed_entries, entries.acquireLock());
+        bool all_removed = entries.cleanOutdatedEntries(seq, &deref_counter, &removed_entries, entries.acquireLock());
         return {all_removed, removed_entries, deref_counter};
     }
 
     std::tuple<bool, PageEntriesV3> runDeref(UInt64 seq, PageVersion ver, Int64 decrease_num)
     {
         PageEntriesV3 removed_entries;
-        bool all_removed = entries.derefAndClean(seq, buildV3Id(TEST_NAMESPACE_ID, page_id), ver, decrease_num, removed_entries);
+        bool all_removed = entries.derefAndClean(seq, buildV3Id(TEST_NAMESPACE_ID, page_id), ver, decrease_num, &removed_entries);
         return {all_removed, removed_entries};
     }
diff --git a/dbms/src/Storages/StorageDeltaMerge.cpp b/dbms/src/Storages/StorageDeltaMerge.cpp
index 67d32c73a05..a6de4efb3ac 100644
--- a/dbms/src/Storages/StorageDeltaMerge.cpp
+++ b/dbms/src/Storages/StorageDeltaMerge.cpp
@@ -775,12 +775,12 @@ void StorageDeltaMerge::checkStatus(const Context & context)
 
 void StorageDeltaMerge::flushCache(const Context & context)
 {
-    flushCache(context, DM::RowKeyRange::newAll(is_common_handle, rowkey_column_size));
+    flushCache(context, DM::RowKeyRange::newAll(is_common_handle, rowkey_column_size), /* try_until_succeed */ true);
 }
 
-void StorageDeltaMerge::flushCache(const Context & context, const DM::RowKeyRange & range_to_flush)
+bool StorageDeltaMerge::flushCache(const Context & context, const DM::RowKeyRange & range_to_flush, bool try_until_succeed)
 {
-    getAndMaybeInitStore()->flushCache(context, range_to_flush);
+    return getAndMaybeInitStore()->flushCache(context, range_to_flush, try_until_succeed);
 }
 
 void StorageDeltaMerge::mergeDelta(const Context & context)
diff --git a/dbms/src/Storages/StorageDeltaMerge.h b/dbms/src/Storages/StorageDeltaMerge.h
index 79ee225d237..9e4ab12ad4f 100644
--- a/dbms/src/Storages/StorageDeltaMerge.h
+++ b/dbms/src/Storages/StorageDeltaMerge.h
@@ -73,7 +73,7 @@ class StorageDeltaMerge
 
     void flushCache(const Context & context) override;
 
-    void flushCache(const Context & context, const DM::RowKeyRange & range_to_flush) override;
+    bool flushCache(const Context & context, const DM::RowKeyRange & range_to_flush, bool try_until_succeed) override;
 
     /// Merge delta into the stable layer for all segments.
     ///
diff --git a/dbms/src/Storages/Transaction/Collator.cpp b/dbms/src/Storages/Transaction/Collator.cpp
index a9b4d0784be..1b0221a6829 100644
--- a/dbms/src/Storages/Transaction/Collator.cpp
+++ b/dbms/src/Storages/Transaction/Collator.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include
+#include
 #include
 #include
 
@@ -29,17 +30,10 @@ TiDBCollators dummy_collators;
 std::vector<std::string> dummy_sort_key_contaners;
 std::string dummy_sort_key_contaner;
 
-std::string_view rtrim(const char * s, size_t length)
+ALWAYS_INLINE std::string_view rtrim(const char * s, size_t length)
 {
     auto v = std::string_view(s, length);
-    size_t end = v.find_last_not_of(' ');
-    return end == std::string_view::npos ? "" : v.substr(0, end + 1);
-}
-
-template <typename T>
-int signum(T val)
-{
-    return (0 < val) - (val < 0);
+    return DB::RightTrim(v);
 }
 
 using Rune = int32_t;
@@ -183,26 +177,26 @@ class Pattern : public ITiDBCollator::IPattern
 };
 
 template <bool padding>
-class BinCollator : public ITiDBCollator
+class BinCollator final : public ITiDBCollator
 {
 public:
     explicit BinCollator(int32_t id)
         : ITiDBCollator(id)
     {}
+
     int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override
     {
         if constexpr (padding)
-            return signum(rtrim(s1, length1).compare(rtrim(s2, length2)));
+            return DB::RtrimStrCompare({s1, length1}, {s2, length2});
         else
-            return signum(std::string_view(s1, length1).compare(std::string_view(s2, length2)));
+            return DB::RawStrCompare({s1, length1}, {s2, length2});
     }
 
     StringRef sortKey(const char * s, size_t length, std::string &) const override
     {
         if constexpr (padding)
         {
-            auto v = rtrim(s, length);
-            return StringRef(v.data(), v.length());
+            return StringRef(rtrim(s, length));
         }
         else
         {
@@ -249,7 +243,7 @@ using WeightType = uint16_t;
 extern const std::array weight_lut;
 } // namespace GeneralCI
 
-class GeneralCICollator : public ITiDBCollator
+class GeneralCICollator final : public ITiDBCollator
 {
 public:
     explicit GeneralCICollator(int32_t id)
@@ -270,7 +264,7 @@ class GeneralCICollator : public ITiDBCollator
             auto sk2 = weight(c2);
             auto cmp = sk1 - sk2;
             if (cmp != 0)
-                return signum(cmp);
+                return DB::signum(cmp);
         }
 
         return (offset1 < v1.length()) - (offset2 < v2.length());
@@ -365,7 +359,7 @@ const std::array weight_lut_long = {
 } // namespace UnicodeCI
 
-class UnicodeCICollator : public ITiDBCollator
+class UnicodeCICollator final : public ITiDBCollator
 {
 public:
     explicit UnicodeCICollator(int32_t id)
@@ -420,7 +414,7 @@ class UnicodeCICollator : public ITiDBCollator
             }
             else
             {
-                return signum(static_cast<Int64>(s1_first & 0xFFFF) - static_cast<Int64>(s2_first & 0xFFFF));
+                return DB::signum(static_cast<Int64>(s1_first & 0xFFFF) - static_cast<Int64>(s2_first & 0xFFFF));
             }
         }
     }
@@ -593,6 +587,8 @@ class UnicodeCICollator : public ITiDBCollator
     friend class Pattern;
 };
 
+using UTF8MB4_BIN_TYPE = BinCollator<true>;
+
 TiDBCollatorPtr ITiDBCollator::getCollator(int32_t id)
 {
     switch (id)
@@ -607,10 +603,10 @@ TiDBCollatorPtr ITiDBCollator::getCollator(int32_t id)
         static const auto latin1_collator = BinCollator<true>(LATIN1_BIN);
         return &latin1_collator;
     case ITiDBCollator::UTF8MB4_BIN:
-        static const auto utf8mb4_collator = BinCollator<true>(UTF8MB4_BIN);
+        static const auto utf8mb4_collator = UTF8MB4_BIN_TYPE(UTF8MB4_BIN);
         return &utf8mb4_collator;
     case ITiDBCollator::UTF8_BIN:
-        static const auto utf8_collator = BinCollator<true>(UTF8_BIN);
+        static const auto utf8_collator = UTF8MB4_BIN_TYPE(UTF8_BIN);
         return &utf8_collator;
    case ITiDBCollator::UTF8_GENERAL_CI:
         static const auto utf8_general_ci_collator = GeneralCICollator(UTF8_GENERAL_CI);
diff --git a/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h b/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h
index e8e0610326c..b0cacefe6f4 100644
--- a/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h
+++ b/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h
@@ -77,10 +77,12 @@ struct DecodingStorageSchemaSnapshot
         , decoding_schema_version{decoding_schema_version_}
     {
         std::unordered_map<ColumnID, size_t> column_lut;
+        std::unordered_map<String, ColumnID> column_name_id_map;
         for (size_t i = 0; i < table_info_.columns.size(); i++)
         {
             const auto & ci = table_info_.columns[i];
             column_lut.emplace(ci.id, i);
+            column_name_id_map.emplace(ci.name, ci.id);
         }
         for (size_t i = 0; i < column_defines->size(); i++)
         {
@@ -88,7 +90,7 @@ struct DecodingStorageSchemaSnapshot
             sorted_column_id_with_pos.insert({cd.id, i});
             if (cd.id != TiDBPkColumnID && cd.id != VersionColumnID && cd.id != DelMarkColumnID)
             {
-                auto & columns = table_info_.columns;
+                const auto & columns = table_info_.columns;
                 column_infos.push_back(columns[column_lut.at(cd.id)]);
             }
             else
@@ -100,10 +102,14 @@ struct DecodingStorageSchemaSnapshot
         // create pk related metadata if needed
         if (is_common_handle)
         {
-            const auto & primary_index_info = table_info_.getPrimaryIndexInfo();
-            for (size_t i = 0; i < primary_index_info.idx_cols.size(); i++)
+            /// We do not update the IndexInfo except for the Rename DDL.
+            /// When an add column / drop column action happens, the offset of each column may change.
+            /// Thus, we should not use the offset to locate the column we want,
+            /// but compare the column names to get the column id.
+            const auto & primary_index_cols = table_info_.getPrimaryIndexInfo().idx_cols;
+            for (const auto & col : primary_index_cols)
             {
-                auto pk_column_id = table_info_.columns[primary_index_info.idx_cols[i].offset].id;
+                auto pk_column_id = column_name_id_map[col.name];
                 pk_column_ids.emplace_back(pk_column_id);
                 pk_pos_map.emplace(pk_column_id, reinterpret_cast(std::numeric_limits::max()));
             }
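The schema-snapshot fix swaps offset-based lookup, which goes stale after ADD COLUMN / DROP COLUMN, for a name-based lookup built in one pass. The pattern in isolation (hypothetical structs, not the patch's code):

```cpp
#include <string>
#include <unordered_map>
#include <vector>

struct Col { std::string name; long id; };

// Sketch: resolve index columns by name instead of by stored offset, which
// can be stale after ADD COLUMN / DROP COLUMN changed the column order.
long pkColumnId(const std::vector<Col> & cols, const std::string & idx_col_name)
{
    std::unordered_map<std::string, long> name_to_id;
    for (const auto & c : cols)
        name_to_id.emplace(c.name, c.id);
    return name_to_id.at(idx_col_name); // throws if the index references a dropped column
}
```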
diff --git a/dbms/src/Storages/Transaction/KVStore.cpp b/dbms/src/Storages/Transaction/KVStore.cpp
index 318a04c6ed9..fb31e4476bb 100644
--- a/dbms/src/Storages/Transaction/KVStore.cpp
+++ b/dbms/src/Storages/Transaction/KVStore.cpp
@@ -129,7 +129,7 @@ void KVStore::traverseRegions(std::function & callback
         callback(region.first, region.second);
 }
 
-void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log)
+bool KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log, bool try_until_succeed)
 {
     auto table_id = region.getMappedTableID();
     auto storage = tmt.getStorages().get(table_id);
@@ -139,7 +139,7 @@ void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & regi
             "tryFlushRegionCacheInStorage can not get table for region {} with table id {}, ignored",
             region.toString(),
             table_id);
-        return;
+        return true;
     }
 
     try
@@ -151,7 +151,7 @@ void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & regi
             region.getRange()->getMappedTableID(),
             storage->isCommonHandle(),
             storage->getRowKeyColumnSize());
-        storage->flushCache(tmt.getContext(), rowkey_range);
+        return storage->flushCache(tmt.getContext(), rowkey_range, try_until_succeed);
     }
     catch (DB::Exception & e)
     {
@@ -159,6 +159,7 @@ void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & regi
         if (e.code() != ErrorCodes::TABLE_IS_DROPPED)
             throw;
     }
+    return true;
 }
 
 void KVStore::tryPersist(RegionID region_id)
@@ -326,6 +327,64 @@ void KVStore::persistRegion(const Region & region, const RegionTaskLock & region
     LOG_FMT_DEBUG(log, "Persist {} done", region.toString(false));
 }
 
+bool KVStore::needFlushRegionData(UInt64 region_id, TMTContext & tmt)
+{
+    auto region_task_lock = region_manager.genRegionTaskLock(region_id);
+    const RegionPtr curr_region_ptr = getRegion(region_id);
+    return canFlushRegionDataImpl(curr_region_ptr, false, false, tmt, region_task_lock);
+}
+
+bool KVStore::tryFlushRegionData(UInt64 region_id, bool try_until_succeed, TMTContext & tmt)
+{
+    auto region_task_lock = region_manager.genRegionTaskLock(region_id);
+    const RegionPtr curr_region_ptr = getRegion(region_id);
+    return canFlushRegionDataImpl(curr_region_ptr, true, try_until_succeed, tmt, region_task_lock);
+}
+
+bool KVStore::canFlushRegionDataImpl(const RegionPtr & curr_region_ptr, UInt8 flush_if_possible, bool try_until_succeed, TMTContext & tmt, const RegionTaskLock & region_task_lock)
+{
+    if (curr_region_ptr == nullptr)
+    {
+        throw Exception("region not found when trying to flush", ErrorCodes::LOGICAL_ERROR);
+    }
+    auto & curr_region = *curr_region_ptr;
+
+    auto [rows, size_bytes] = curr_region.getApproxMemCacheInfo();
+
+    LOG_FMT_DEBUG(log, "{} approx mem cache info: rows {}, bytes {}", curr_region.toString(false), rows, size_bytes);
+
+    bool can_flush = false;
+    if (rows >= region_compact_log_min_rows.load(std::memory_order_relaxed)
+        || size_bytes >= region_compact_log_min_bytes.load(std::memory_order_relaxed))
+    {
+        // if rows or bytes are more than the threshold, flush cache and persist mem data.
+        can_flush = true;
+    }
+    else
+    {
+        // if there is little data in mem, wait until the time interval reaches the threshold.
+        // use a random period so that lots of regions will not be persisted at the same time.
+        auto compact_log_period = std::rand() % region_compact_log_period.load(std::memory_order_relaxed); // NOLINT
+        can_flush = !(curr_region.lastCompactLogTime() + Seconds{compact_log_period} > Clock::now());
+    }
+    if (can_flush && flush_if_possible)
+    {
+        LOG_FMT_DEBUG(log, "{} flush region due to can_flush_data", curr_region.toString(false));
+        if (tryFlushRegionCacheInStorage(tmt, curr_region, log, try_until_succeed))
+        {
+            persistRegion(curr_region, region_task_lock, "compact raft log");
+            curr_region.markCompactLog();
+            curr_region.cleanApproxMemCacheInfo();
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+    }
+    return can_flush;
+}
+
 EngineStoreApplyRes KVStore::handleUselessAdminRaftCmd(
     raft_cmdpb::AdminCmdType cmd_type,
     UInt64 curr_region_id,
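For small regions, canFlushRegionDataImpl() above defers flushing until a randomized time interval has passed, so many idle regions do not all persist in lockstep. The jitter check on its own, as a sketch with hypothetical names (it assumes `max_period_secs > 0`):

```cpp
#include <chrono>
#include <cstdlib>

using Clock = std::chrono::steady_clock;

// Sketch: randomized flush period. Each check draws a fresh jittered deadline
// in [0, max_period_secs), spreading the flushes of idle regions over time.
bool shouldFlushBySchedule(Clock::time_point last_flush, int max_period_secs)
{
    int jittered = std::rand() % max_period_secs; // NOLINT: test-style randomness
    return last_flush + std::chrono::seconds(jittered) <= Clock::now();
}
```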
@@ -359,32 +418,12 @@ EngineStoreApplyRes KVStore::handleUselessAdminRaftCmd(
         }
         else
         {
-            auto [rows, size_bytes] = curr_region.getApproxMemCacheInfo();
-
-            LOG_FMT_DEBUG(log, "{} approx mem cache info: rows {}, bytes {}", curr_region.toString(false), rows, size_bytes);
-
-            if (rows >= region_compact_log_min_rows.load(std::memory_order_relaxed)
-                || size_bytes >= region_compact_log_min_bytes.load(std::memory_order_relaxed))
-            {
-                // if rows or bytes more than threshold, flush cache and perist mem data.
-                return true;
-            }
-            else
-            {
-                // if thhere is little data in mem, wait until time interval reached threshold.
-                // use random period so that lots of regions will not be persisted at same time.
-                auto compact_log_period = std::rand() % region_compact_log_period.load(std::memory_order_relaxed); // NOLINT
-                return !(curr_region.lastCompactLogTime() + Seconds{compact_log_period} > Clock::now());
-            }
+            return canFlushRegionDataImpl(curr_region_ptr, true, /* try_until_succeed */ false, tmt, region_task_lock);
         }
     };
 
     if (check_sync_log())
     {
-        tryFlushRegionCacheInStorage(tmt, curr_region, log);
-        persistRegion(curr_region, region_task_lock, "compact raft log");
-        curr_region.markCompactLog();
-        curr_region.cleanApproxMemCacheInfo();
         return EngineStoreApplyRes::Persist;
     }
     return EngineStoreApplyRes::None;
diff --git a/dbms/src/Storages/Transaction/KVStore.h b/dbms/src/Storages/Transaction/KVStore.h
index bb45e65d18b..b58083557a1 100644
--- a/dbms/src/Storages/Transaction/KVStore.h
+++ b/dbms/src/Storages/Transaction/KVStore.h
@@ -91,7 +91,7 @@ class KVStore final : private boost::noncopyable
 
     void tryPersist(RegionID region_id);
 
-    static void tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log);
+    static bool tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log, bool try_until_succeed = true);
 
     size_t regionSize() const;
     EngineStoreApplyRes handleAdminRaftCmd(raft_cmdpb::AdminRequest && request,
@@ -108,6 +108,9 @@ class KVStore final : private boost::noncopyable
                                            TMTContext & tmt);
     EngineStoreApplyRes handleWriteRaftCmd(const WriteCmdsView & cmds, UInt64 region_id, UInt64 index, UInt64 term, TMTContext & tmt);
 
+    bool needFlushRegionData(UInt64 region_id, TMTContext & tmt);
+    bool tryFlushRegionData(UInt64 region_id, bool try_until_succeed, TMTContext & tmt);
+
     void handleApplySnapshot(metapb::Region && region, uint64_t peer_id, const SSTViewVec, uint64_t index, uint64_t term, TMTContext & tmt);
 
     std::vector /* */ preHandleSnapshotToFiles(
@@ -219,6 +222,11 @@ class KVStore final : private boost::noncopyable
         UInt64 term,
         TMTContext & tmt);
 
+    /// Notice that if flush_if_possible is set to false, we only check whether a flush is allowed by the row/size/interval thresholds;
+    /// it will not check whether a flush would eventually succeed.
+    /// In other words, `canFlushRegionDataImpl(flush_if_possible=true)` can return false.
+    bool canFlushRegionDataImpl(const RegionPtr & curr_region_ptr, UInt8 flush_if_possible, bool try_until_succeed, TMTContext & tmt, const RegionTaskLock & region_task_lock);
+
     void persistRegion(const Region & region, const RegionTaskLock & region_task_lock, const char * caller);
     void releaseReadIndexWorkers();
     void handleDestroy(UInt64 region_id, TMTContext & tmt, const KVStoreTaskLock &);
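The two new public entry points split a read-only query (needFlushRegionData) from the side-effecting attempt (tryFlushRegionData). A hedged sketch of how a caller could drive them, using the signatures declared above (the helper name is hypothetical; the real callers are the proxy FFI functions further below):

```cpp
// Sketch: check-then-flush. tryFlushRegionData(try_until_succeed=false) may
// return false, meaning "a flush is due but could not complete yet; retry later".
bool maybeCompactRaftLog(KVStore & kvstore, UInt64 region_id, TMTContext & tmt)
{
    if (!kvstore.needFlushRegionData(region_id, tmt))
        return false; // nothing worth flushing yet
    return kvstore.tryFlushRegionData(region_id, /*try_until_succeed=*/false, tmt);
}
```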
diff --git a/dbms/src/Storages/Transaction/PartitionStreams.cpp b/dbms/src/Storages/Transaction/PartitionStreams.cpp
index 456f067fe5e..cf151c4270d 100644
--- a/dbms/src/Storages/Transaction/PartitionStreams.cpp
+++ b/dbms/src/Storages/Transaction/PartitionStreams.cpp
@@ -40,6 +40,8 @@ namespace FailPoints
 extern const char pause_before_apply_raft_cmd[];
 extern const char pause_before_apply_raft_snapshot[];
 extern const char force_set_safepoint_when_decode_block[];
+extern const char unblock_query_init_after_write[];
+extern const char pause_query_init[];
 } // namespace FailPoints
 
 namespace ErrorCodes
@@ -151,6 +153,7 @@ static void writeRegionDataToStorage(
     default:
         throw Exception("Unknown StorageEngine: " + toString(static_cast(storage->engineType())), ErrorCodes::LOGICAL_ERROR);
     }
+    write_part_cost = watch.elapsedMilliseconds();
     GET_METRIC(tiflash_raft_write_data_to_storage_duration_seconds, type_write).Observe(write_part_cost / 1000.0);
     if (need_decode)
@@ -165,10 +168,20 @@ static void writeRegionDataToStorage(
     /// decoding data. Check the test case for more details.
     FAIL_POINT_PAUSE(FailPoints::pause_before_apply_raft_cmd);
 
+    /// Disable pause_query_init when the write action finishes, so that the query action can continue.
+    /// For the usage of unblock_query_init_after_write and pause_query_init, refer to InterpreterSelectQuery::init.
+    SCOPE_EXIT({
+        fiu_do_on(FailPoints::unblock_query_init_after_write, {
+            FailPointHelper::disableFailPoint(FailPoints::pause_query_init);
+        });
+    });
+
     /// Try read then write once.
     {
         if (atomic_read_write(false))
+        {
             return;
+        }
     }
 
     /// If first try failed, sync schema and force read then write.
@@ -177,10 +190,12 @@ static void writeRegionDataToStorage(
         tmt.getSchemaSyncer()->syncSchemas(context);
 
         if (!atomic_read_write(true))
+        {
             // Failure won't be tolerated this time.
             // TODO: Enrich exception message.
             throw Exception("Write region " + std::to_string(region->id()) + " to table " + std::to_string(table_id) + " failed", ErrorCodes::LOGICAL_ERROR);
+        }
     }
 }
diff --git a/dbms/src/Storages/Transaction/ProxyFFI.cpp b/dbms/src/Storages/Transaction/ProxyFFI.cpp
index 8a40ca9b15e..d4ba50d5714 100644
--- a/dbms/src/Storages/Transaction/ProxyFFI.cpp
+++ b/dbms/src/Storages/Transaction/ProxyFFI.cpp
@@ -128,6 +128,34 @@ EngineStoreApplyRes HandleAdminRaftCmd(
     }
 }
 
+uint8_t NeedFlushData(EngineStoreServerWrap * server, uint64_t region_id)
+{
+    try
+    {
+        auto & kvstore = server->tmt->getKVStore();
+        return kvstore->needFlushRegionData(region_id, *server->tmt);
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__);
+        exit(-1);
+    }
+}
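Both FFI entry points wrap their body in a catch-all that logs and aborts: C++ exceptions must never unwind across the C boundary into the Rust proxy. The shape of that barrier as a standalone sketch (hypothetical function, not the patch's code):

```cpp
#include <cstdint>
#include <cstdlib>

// Sketch: an FFI-safe wrapper. Exceptions cannot cross an extern "C"
// boundary, so the only escape path is log-and-exit rather than unwinding.
extern "C" uint8_t ffi_do_work() noexcept
{
    try
    {
        return 1; // real work would go here
    }
    catch (...)
    {
        // in TiFlash this is tryLogCurrentException(__PRETTY_FUNCTION__);
        std::exit(-1);
    }
}
```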
+
+uint8_t TryFlushData(EngineStoreServerWrap * server, uint64_t region_id, uint8_t until_succeed)
+{
+    try
+    {
+        auto & kvstore = server->tmt->getKVStore();
+        return kvstore->tryFlushRegionData(region_id, until_succeed, *server->tmt);
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__);
+        exit(-1);
+    }
+}
+
 static_assert(sizeof(RaftStoreProxyFFIHelper) == sizeof(TiFlashRaftProxyHelper));
 static_assert(alignof(RaftStoreProxyFFIHelper) == alignof(TiFlashRaftProxyHelper));
diff --git a/dbms/src/Storages/Transaction/ProxyFFI.h b/dbms/src/Storages/Transaction/ProxyFFI.h
index e1c01599275..aafe4b375eb 100644
--- a/dbms/src/Storages/Transaction/ProxyFFI.h
+++ b/dbms/src/Storages/Transaction/ProxyFFI.h
@@ -125,6 +125,8 @@ EngineStoreApplyRes HandleAdminRaftCmd(
 EngineStoreApplyRes HandleWriteRaftCmd(const EngineStoreServerWrap * server, WriteCmdsView cmds, RaftCmdHeader header);
+uint8_t NeedFlushData(EngineStoreServerWrap * server, uint64_t region_id);
+uint8_t TryFlushData(EngineStoreServerWrap * server, uint64_t region_id, uint8_t until_succeed);
 void AtomicUpdateProxy(EngineStoreServerWrap * server, RaftStoreProxyFFIHelper * proxy);
 void HandleDestroy(EngineStoreServerWrap * server, uint64_t region_id);
 EngineStoreApplyRes HandleIngestSST(EngineStoreServerWrap * server, SSTViewVec snaps, RaftCmdHeader header);
@@ -158,6 +160,8 @@ inline EngineStoreServerHelper GetEngineStoreServerHelper(
         .fn_gen_cpp_string = GenCppRawString,
         .fn_handle_write_raft_cmd = HandleWriteRaftCmd,
         .fn_handle_admin_raft_cmd = HandleAdminRaftCmd,
+        .fn_need_flush_data = NeedFlushData,
+        .fn_try_flush_data = TryFlushData,
         .fn_atomic_update_proxy = AtomicUpdateProxy,
         .fn_handle_destroy = HandleDestroy,
         .fn_handle_ingest_sst = HandleIngestSST,
diff --git a/dbms/src/Storages/Transaction/ReadIndexWorker.cpp b/dbms/src/Storages/Transaction/ReadIndexWorker.cpp
index 3223c815989..7de79dd5c6d 100644
--- a/dbms/src/Storages/Transaction/ReadIndexWorker.cpp
+++ b/dbms/src/Storages/Transaction/ReadIndexWorker.cpp
@@ -880,7 +880,7 @@ BatchReadIndexRes ReadIndexWorkerManager::batchReadIndex(
         }
     }
     { // if meet timeout, which means part of regions can not get response from leader, try to poll rest tasks
-        TEST_LOG_FMT("rest {}, poll rest tasks onece", tasks.size());
+        TEST_LOG_FMT("rest {}, poll rest tasks once", tasks.size());
 
         while (!tasks.empty())
         {
diff --git a/dbms/src/Storages/Transaction/RegionBlockReader.cpp b/dbms/src/Storages/Transaction/RegionBlockReader.cpp
index a9384e4a14d..2ec690c467b 100644
--- a/dbms/src/Storages/Transaction/RegionBlockReader.cpp
+++ b/dbms/src/Storages/Transaction/RegionBlockReader.cpp
@@ -208,6 +208,8 @@ bool RegionBlockReader::readImpl(Block & block, const RegionDataReadInfoList & d
         }
         index++;
     }
+    block.checkNumberOfRows();
+
     return true;
 }
diff --git a/dbms/src/Storages/Transaction/RegionTable.cpp b/dbms/src/Storages/Transaction/RegionTable.cpp
index c855d5b3226..5ae36a4bd64 100644
--- a/dbms/src/Storages/Transaction/RegionTable.cpp
+++ b/dbms/src/Storages/Transaction/RegionTable.cpp
@@ -230,7 +230,7 @@ void removeObsoleteDataInStorage(
         auto rowkey_range = DM::RowKeyRange::fromRegionRange(handle_range, table_id, table_id, storage->isCommonHandle(), storage->getRowKeyColumnSize());
         dm_storage->deleteRange(rowkey_range, context->getSettingsRef());
-        dm_storage->flushCache(*context, rowkey_range); // flush to disk
+        dm_storage->flushCache(*context, rowkey_range, /*try_until_succeed*/ true); // flush to disk
     }
     catch (DB::Exception & e)
     {
diff --git a/dbms/src/Storages/Transaction/TiDB.cpp b/dbms/src/Storages/Transaction/TiDB.cpp
index 15bf2a3fb58..6d07c47f235 100644
--- a/dbms/src/Storages/Transaction/TiDB.cpp
+++ b/dbms/src/Storages/Transaction/TiDB.cpp
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 
 #include
@@ -631,8 +632,8 @@ catch (const Poco::Exception & e)
 ///////////////////////
 
 IndexColumnInfo::IndexColumnInfo(Poco::JSON::Object::Ptr json)
-    : offset(0)
-    , length(0)
+    : length(0)
+    , offset(0)
 {
     deserialize(json);
 }
@@ -772,6 +773,37 @@ catch (const Poco::Exception & e)
         DB::Exception(e));
 }
 
+String TiFlashModeToString(TiFlashMode tiflash_mode)
+{
+    switch (tiflash_mode)
+    {
+    case TiFlashMode::Normal:
+        return "";
+    case TiFlashMode::Fast:
+        return "fast";
+    default:
+        LOG_FMT_WARNING(&Poco::Logger::get("TiDB"), "TiFlashModeToString with invalid tiflash mode {}", tiflash_mode);
+        return "";
+    }
+}
+
+TiFlashMode parseTiFlashMode(String mode_str)
+{
+    if (mode_str.empty())
+    {
+        return TiFlashMode::Normal;
+    }
+    else if (mode_str == "fast")
+    {
+        return TiFlashMode::Fast;
+    }
+    else
+    {
+        throw DB::Exception(
+            std::string(__PRETTY_FUNCTION__)
+            + " ParseTiFlashMode failed. mode " + mode_str + " is invalid, please set the mode to fast/normal");
+    }
+}
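TiFlashModeToString and parseTiFlashMode are meant to be inverses, with the empty string reserved for Normal so that existing table metadata without the field keeps parsing. A quick hedged check of that round-trip (a hypothetical test helper built on the functions added above):

```cpp
#include <cassert>

// Sketch: serialization round-trip for TiFlashMode. "" <-> Normal keeps old
// metadata (no "tiflash_mode" key) working; "fast" <-> Fast.
void checkTiFlashModeRoundTrip()
{
    assert(TiDB::parseTiFlashMode(TiDB::TiFlashModeToString(TiDB::TiFlashMode::Normal)) == TiDB::TiFlashMode::Normal);
    assert(TiDB::parseTiFlashMode(TiDB::TiFlashModeToString(TiDB::TiFlashMode::Fast)) == TiDB::TiFlashMode::Fast);
    assert(TiDB::parseTiFlashMode("") == TiDB::TiFlashMode::Normal); // missing field defaults to Normal
}
```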
+
 ///////////////////////
 ////// TableInfo //////
 ///////////////////////
@@ -840,6 +872,8 @@ try
 
     json->set("tiflash_replica", replica_info.getJSONObject());
 
+    json->set("tiflash_mode", std::string(TiFlashModeToString(tiflash_mode)));
+
     json->stringify(buf);
 
     return buf.str();
@@ -926,6 +960,14 @@ try
             replica_info.deserialize(replica_obj);
         }
     }
+    if (obj->has("tiflash_mode"))
+    {
+        auto mode = obj->getValue<String>("tiflash_mode");
+        if (!mode.empty())
+        {
+            tiflash_mode = parseTiFlashMode(mode);
+        }
+    }
     if (is_common_handle && index_infos.size() != 1)
     {
         throw DB::Exception(
diff --git a/dbms/src/Storages/Transaction/TiDB.h b/dbms/src/Storages/Transaction/TiDB.h
index f67bfb332c7..a9d46b60c13 100644
--- a/dbms/src/Storages/Transaction/TiDB.h
+++ b/dbms/src/Storages/Transaction/TiDB.h
@@ -179,7 +179,6 @@ struct ColumnInfo
 
     ColumnID id = -1;
     String name;
-    Int32 offset = -1;
     Poco::Dynamic::Var origin_default_value;
     Poco::Dynamic::Var default_value;
     Poco::Dynamic::Var default_bit_value;
@@ -212,6 +211,12 @@ struct ColumnInfo
     static Int64 getTimeValue(const String &);
     static Int64 getYearValue(const String &);
     static UInt64 getBitValue(const String &);
+
+private:
+    /// Please be very careful when you have to use offset:
+    /// we never update the offset when a DDL action changes the columns,
+    /// so the offset may not correspond to the actual order of columns.
+    Int32 offset = -1;
 };
 
 enum PartitionType
@@ -298,8 +303,13 @@ struct IndexColumnInfo
     void deserialize(Poco::JSON::Object::Ptr json);
 
     String name;
-    Int32 offset;
     Int32 length;
+
+private:
+    /// Please be very careful when you have to use offset:
+    /// we never update the offset when a DDL action changes the columns,
+    /// so the offset may not correspond to the actual order of columns.
+    Int32 offset;
 };
 struct IndexInfo
 {
@@ -323,6 +333,12 @@ struct IndexInfo
     bool is_global;
 };
 
+enum class TiFlashMode
+{
+    Normal,
+    Fast,
+};
+
 struct TableInfo
 {
     TableInfo() = default;
@@ -372,6 +388,8 @@ struct TableInfo
     // The TiFlash replica info persisted by TiDB
     TiFlashReplicaInfo replica_info;
 
+    TiFlashMode tiflash_mode = TiFlashMode::Normal;
+
     ::TiDB::StorageEngine engine_type = ::TiDB::StorageEngine::UNSPECIFIED;
 
     ColumnID getColumnID(const String & name) const;
@@ -385,7 +403,12 @@ struct TableInfo
 
     bool isLogicalPartitionTable() const { return is_partition_table && belonging_table_id == DB::InvalidTableID && partition.enable; }
 
-    /// should not be called if is_common_handle = false
+    /// should not be called if is_common_handle = false.
+    /// When using IndexInfo, please avoid using the offset info:
+    /// the offset value may be wrong in some cases,
+    /// because we do not update IndexInfo except for the RENAME DDL action,
+    /// while DDLs like add column / drop column may change the offsets of columns.
+    /// Thus, please be very careful when you really have to use the offset information!
     const IndexInfo & getPrimaryIndexInfo() const { return index_infos[0]; }
 
     IndexInfo & getPrimaryIndexInfo() { return index_infos[0]; }
@@ -398,4 +421,7 @@ String genJsonNull();
 tipb::FieldType columnInfoToFieldType(const ColumnInfo & ci);
 ColumnInfo fieldTypeToColumnInfo(const tipb::FieldType & field_type);
 
+String TiFlashModeToString(TiFlashMode tiflash_mode);
+TiFlashMode parseTiFlashMode(String mode_str);
+
 } // namespace TiDB
diff --git a/dbms/src/Storages/Transaction/TiKVRecordFormat.h b/dbms/src/Storages/Transaction/TiKVRecordFormat.h
index c507616f6e9..10a7f7220e9 100644
--- a/dbms/src/Storages/Transaction/TiKVRecordFormat.h
+++ b/dbms/src/Storages/Transaction/TiKVRecordFormat.h
@@ -154,9 +154,16 @@ inline TiKVKey genKey(const TiDB::TableInfo & table_info, std::vector key
     memcpy(key.data() + 1, reinterpret_cast(&big_endian_table_id), 8);
     memcpy(key.data() + 1 + 8, RecordKVFormat::RECORD_PREFIX_SEP, 2);
     WriteBufferFromOwnString ss;
+
+    std::unordered_map<String, size_t> column_name_columns_index_map;
+    for (size_t i = 0; i < table_info.columns.size(); i++)
+    {
+        column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+    }
     for (size_t i = 0; i < keys.size(); i++)
     {
-        DB::EncodeDatum(keys[i], table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset].getCodecFlag(), ss);
+        auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name];
+        DB::EncodeDatum(keys[i], table_info.columns[idx].getCodecFlag(), ss);
     }
     return encodeAsTiKVKey(key + ss.releaseStr());
 }
diff --git a/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h b/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h
index 20b395a9952..34e0d3d4104 100644
--- a/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h
+++ b/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h
@@ -237,14 +237,14 @@ std::pair<TableInfo, std::vector<Field>> getTableInfoAndFields(ColumnIDs handle_
     {
         table_info.is_common_handle = true;
         TiDB::IndexInfo index_info;
-        for (size_t i = 0; i < handle_ids.size(); i++)
+        for (auto handle_id : handle_ids)
         {
             TiDB::IndexColumnInfo index_column_info;
-            for (size_t pos = 0; pos < table_info.columns.size(); pos++)
+            for (auto & column : table_info.columns)
             {
-                if (table_info.columns[pos].id == handle_ids[i])
+                if (column.id == handle_id)
                 {
-                    index_column_info.offset = pos;
+                    index_column_info.name = column.name;
                     break;
                 }
             }
diff --git a/dbms/src/Storages/Transaction/tests/bench_region_block_reader.cpp b/dbms/src/Storages/Transaction/tests/bench_region_block_reader.cpp
new file mode 100644
index 00000000000..05ab637de7f
--- /dev/null
+++ b/dbms/src/Storages/Transaction/tests/bench_region_block_reader.cpp
@@ -0,0 +1,171 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+#include
+
+#include "RowCodecTestUtils.h"
+
+using TableInfo = TiDB::TableInfo;
+namespace DB::tests
+{
+using ColumnIDs = std::vector<ColumnID>;
+class RegionBlockReaderBenchTest : public benchmark::Fixture
+{
+protected:
+    Int64 handle_value = 100;
+    UInt8 del_mark_value = 0;
+    UInt64 version_value = 100;
+
+    RegionDataReadInfoList data_list_read;
+    std::unordered_map<ColumnID, Field> fields_map;
+
+    enum RowEncodeVersion
+    {
+        RowV1,
+        RowV2
+    };
+
+protected:
+    void SetUp(const benchmark::State & /*state*/) override
+    {
+        data_list_read.clear();
+        fields_map.clear();
+    }
+
+    void encodeColumns(TableInfo & table_info, std::vector<Field> & fields, RowEncodeVersion row_version, size_t num_rows)
+    {
+        // for later check
+        std::unordered_map<String, size_t> column_name_columns_index_map;
+        for (size_t i = 0; i < table_info.columns.size(); i++)
+        {
+            fields_map.emplace(table_info.columns[i].id, fields[i]);
+            column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+        }
+
+        std::vector<Field> value_fields;
+        std::vector<Field> pk_fields;
+        for (size_t i = 0; i < table_info.columns.size(); i++)
+        {
+            if (!table_info.columns[i].hasPriKeyFlag())
+                value_fields.emplace_back(fields[i]);
+            else
+                pk_fields.emplace_back(fields[i]);
+        }
+
+        // create PK
+        WriteBufferFromOwnString pk_buf;
+        if (table_info.is_common_handle)
+        {
+            auto & primary_index_info = table_info.getPrimaryIndexInfo();
+            for (size_t i = 0; i < primary_index_info.idx_cols.size(); i++)
+            {
+                auto idx = column_name_columns_index_map[primary_index_info.idx_cols[i].name];
+                EncodeDatum(pk_fields[i], table_info.columns[idx].getCodecFlag(), pk_buf);
+            }
+        }
+        else
+        {
+            DB::EncodeInt64(handle_value, pk_buf);
+        }
+        RawTiDBPK pk{std::make_shared(pk_buf.releaseStr())};
+        // create value
+        WriteBufferFromOwnString value_buf;
+        if (row_version == RowEncodeVersion::RowV1)
+        {
+            encodeRowV1(table_info, value_fields, value_buf);
+        }
+        else if (row_version == RowEncodeVersion::RowV2)
+        {
+            encodeRowV2(table_info, value_fields, value_buf);
+        }
+        else
+        {
+            throw Exception("Unknown row format " + std::to_string(row_version), ErrorCodes::LOGICAL_ERROR);
+        }
+        auto row_value = std::make_shared(std::move(value_buf.str()));
+        for (size_t i = 0; i < num_rows; i++)
+            data_list_read.emplace_back(pk, del_mark_value, version_value, row_value);
+    }
+
+    bool decodeColumns(DecodingStorageSchemaSnapshotConstPtr decoding_schema, bool force_decode) const
+    {
+        RegionBlockReader reader{decoding_schema};
+        Block block = createBlockSortByColumnID(decoding_schema);
+        return reader.read(block, data_list_read, force_decode);
+    }
+
+    std::pair<TableInfo, std::vector<Field>> getNormalTableInfoFields(const ColumnIDs & handle_ids, bool is_common_handle) const
+    {
+        return getTableInfoAndFields(
+            handle_ids,
+            is_common_handle,
+            ColumnIDValue(2, handle_value),
+            ColumnIDValue(3, std::numeric_limits::max()),
+            ColumnIDValue(4, std::numeric_limits::min()),
+            ColumnIDValue(9, String("aaa")),
+            ColumnIDValue(10, DecimalField(ToDecimal(12345678910ULL, 4), 4)),
+            ColumnIDValueNull(11));
+    }
+};
+
+BENCHMARK_DEFINE_F(RegionBlockReaderBenchTest, CommonHandle)
+(benchmark::State & state)
+{
+    size_t num_rows = state.range(0);
+    auto [table_info, fields] = getNormalTableInfoFields({2, 3, 4}, true);
+    encodeColumns(table_info, fields, RowEncodeVersion::RowV2, num_rows);
+    auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info);
+    for (auto _ : state)
+    {
+        decodeColumns(decoding_schema, true);
+    }
+}
+
+
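Each BENCHMARK_REGISTER_F call below expands one fixture benchmark into several runs, one per Arg value, which arrives in the body as state.range(0). A minimal standalone example of the same google-benchmark pattern (a hypothetical benchmark, not part of this patch):

```cpp
#include <benchmark/benchmark.h>
#include <vector>

// Sketch: Arg(n) feeds state.range(0); Iterations(k) pins the iteration count
// instead of letting the framework auto-tune it.
static void BM_FillVector(benchmark::State & state)
{
    for (auto _ : state)
    {
        std::vector<int> v(state.range(0), 42);
        benchmark::DoNotOptimize(v.data());
    }
}
BENCHMARK(BM_FillVector)->Iterations(1000)->Arg(1)->Arg(10)->Arg(100);
BENCHMARK_MAIN();
```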
+BENCHMARK_DEFINE_F(RegionBlockReaderBenchTest, PKIsNotHandle) +(benchmark::State & state) +{ + size_t num_rows = state.range(0); + auto [table_info, fields] = getNormalTableInfoFields({EXTRA_HANDLE_COLUMN_ID}, false); + encodeColumns(table_info, fields, RowEncodeVersion::RowV2, num_rows); + auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info); + for (auto _ : state) + { + decodeColumns(decoding_schema, true); + } +} + +BENCHMARK_DEFINE_F(RegionBlockReaderBenchTest, PKIsHandle) +(benchmark::State & state) +{ + size_t num_rows = state.range(0); + auto [table_info, fields] = getNormalTableInfoFields({2}, false); + encodeColumns(table_info, fields, RowEncodeVersion::RowV2, num_rows); + auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info); + for (auto _ : state) + { + decodeColumns(decoding_schema, true); + } +} + +constexpr size_t num_iterations_test = 1000; + +BENCHMARK_REGISTER_F(RegionBlockReaderBenchTest, PKIsHandle)->Iterations(num_iterations_test)->Arg(1)->Arg(10)->Arg(100); +BENCHMARK_REGISTER_F(RegionBlockReaderBenchTest, CommonHandle)->Iterations(num_iterations_test)->Arg(1)->Arg(10)->Arg(100); +BENCHMARK_REGISTER_F(RegionBlockReaderBenchTest, PKIsNotHandle)->Iterations(num_iterations_test)->Arg(1)->Arg(10)->Arg(100); + +} // namespace DB::tests diff --git a/dbms/src/Storages/Transaction/tests/gtest_decoding_storage_schema_snapshot.cpp b/dbms/src/Storages/Transaction/tests/gtest_decoding_storage_schema_snapshot.cpp new file mode 100644 index 00000000000..1de9809ecad --- /dev/null +++ b/dbms/src/Storages/Transaction/tests/gtest_decoding_storage_schema_snapshot.cpp @@ -0,0 +1,65 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "RowCodecTestUtils.h" + +namespace DB::tests +{ +static TableInfo getTableInfoByJson(const String & json_table_info) +{ + return TableInfo(json_table_info); +} +TEST(DecodingStorageSchemaSnapshotTest, CheckPKInfosUnderClusteredIndex) +{ + // table with column [A,B,C,D], primary keys [A,C] + const String json_table_info = R"json({"id":75,"name":{"O":"test","L":"test"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"A","L":"a"},"offset":0,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":2,"name":{"O":"B","L":"b"},"offset":1,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":0,"Flen":20,"Decimal":0,"Charset":"utf8mb4","Collate":"utf8mb4_bin","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":3,"name":{"O":"C","L":"c"},"offset":2,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":4,"name":{"O":"D","L":"d"},"offset":3,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"A","L":"a"},"offset":0,"length":-1},{"name":{"O":"C","L":"c"},"offset":2,"length":-1}],"state":5,"comment":"","index_type":1,"is_unique":true,"is_primary":true,"is_invisible":false,"is_global":false}],"constraint_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"is_common_handle":true,"common_handle_version":1,"comment":"","auto_inc_id":0,"auto_id_cache":0,"auto_rand_id":0,"max_col_id":4,"max_idx_id":1,"max_cst_id":0,"update_timestamp":434039123413303302,"ShardRowIDBits":0,"max_shard_row_id_bits":0,"auto_random_bits":0,"pre_split_regions":0,"partition":null,"compression":"","view":null,"sequence":null,"Lock":null,"version":4,"tiflash_replica":{"Count":1,"LocationLabels":[],"Available":false,"AvailablePartitionIDs":null},"is_columnar":false,"temp_table_type":0,"cache_table_status":0,"policy_ref_info":null,"stats_options":null})json"; + auto table_info = getTableInfoByJson(json_table_info); + auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info); + + //check decoding_schema->pk_column_ids infos + ASSERT_EQ(decoding_schema->pk_column_ids.size(), 2); + ASSERT_EQ(decoding_schema->pk_column_ids[0], 1); + ASSERT_EQ(decoding_schema->pk_column_ids[1], 3); + + //check decoding_schema->pk_pos_map infos + ASSERT_EQ(decoding_schema->pk_column_ids.size(), decoding_schema->pk_pos_map.size()); + // there are three hidden column in the decoded block, 
+ +TEST(DecodingStorageSchemaSnapshotTest, CheckPKInfosUnderClusteredIndexAfterDropColumn) +{ + // drop column B from [A,B,C,D]; table with columns [A,C,D], primary keys [A,C] + const String json_table_info = R"json({"id":75,"name":{"O":"test","L":"test"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"A","L":"a"},"offset":0,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":3,"name":{"O":"C","L":"c"},"offset":2,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":4,"name":{"O":"D","L":"d"},"offset":3,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"A","L":"a"},"offset":0,"length":-1},{"name":{"O":"C","L":"c"},"offset":2,"length":-1}],"state":5,"comment":"","index_type":1,"is_unique":true,"is_primary":true,"is_invisible":false,"is_global":false}],"constraint_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"is_common_handle":true,"common_handle_version":1,"comment":"","auto_inc_id":0,"auto_id_cache":0,"auto_rand_id":0,"max_col_id":4,"max_idx_id":1,"max_cst_id":0,"update_timestamp":434039123413303302,"ShardRowIDBits":0,"max_shard_row_id_bits":0,"auto_random_bits":0,"pre_split_regions":0,"partition":null,"compression":"","view":null,"sequence":null,"Lock":null,"version":4,"tiflash_replica":{"Count":1,"LocationLabels":[],"Available":false,"AvailablePartitionIDs":null},"is_columnar":false,"temp_table_type":0,"cache_table_status":0,"policy_ref_info":null,"stats_options":null})json"; + auto table_info = getTableInfoByJson(json_table_info); + auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info); + + // check decoding_schema->pk_column_ids info + ASSERT_EQ(decoding_schema->pk_column_ids.size(), 2); + ASSERT_EQ(decoding_schema->pk_column_ids[0], 1); + ASSERT_EQ(decoding_schema->pk_column_ids[1], 3); + + // check decoding_schema->pk_pos_map info + ASSERT_EQ(decoding_schema->pk_column_ids.size(), decoding_schema->pk_pos_map.size()); + // there are three hidden columns in the decoded block, so the positions of A and C are 3 and 4 + ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[0]), 3); + ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[1]), 4); +} + +} // namespace DB::tests diff --git a/dbms/src/Storages/Transaction/tests/gtest_kvstore.cpp b/dbms/src/Storages/Transaction/tests/gtest_kvstore.cpp index 36a91522bb6..77aab06f6cf 100644 --- a/dbms/src/Storages/Transaction/tests/gtest_kvstore.cpp +++ b/dbms/src/Storages/Transaction/tests/gtest_kvstore.cpp @@ -1179,6 +1179,12 @@ void RegionKVStoreTest::testKVStore() ASSERT_EQ(e.message(), "unsupported admin command type InvalidAdmin"); } } + {
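+ // Sketch of intent (assuming region 19 was created earlier in testKVStore): the region should report pending data, and a forced flush should then succeed.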
+ // There shall be data to flush. + ASSERT_EQ(kvs.needFlushRegionData(19, ctx.getTMTContext()), true); + // Force flush until it succeeds, only for testing. + ASSERT_EQ(kvs.tryFlushRegionData(19, true, ctx.getTMTContext()), true); + } } void test_mergeresult() diff --git a/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp b/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp index 6a883230854..d08b4dd3738 100644 --- a/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp +++ b/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp @@ -26,13 +26,13 @@ using ColumnIDs = std::vector; class RegionBlockReaderTestFixture : public ::testing::Test { protected: - Int64 handle_value_ = 100; - UInt8 del_mark_value_ = 0; - UInt64 version_value_ = 100; - size_t rows_ = 3; + Int64 handle_value = 100; + UInt8 del_mark_value = 0; + UInt64 version_value = 100; + size_t rows = 3; - RegionDataReadInfoList data_list_read_; - std::unordered_map fields_map_; + RegionDataReadInfoList data_list_read; + std::unordered_map fields_map; enum RowEncodeVersion { @@ -43,8 +43,8 @@ class RegionBlockReaderTestFixture : public ::testing::Test protected: void SetUp() override { - data_list_read_.clear(); - fields_map_.clear(); + data_list_read.clear(); + fields_map.clear(); } void TearDown() override {} @@ -52,8 +52,12 @@ class RegionBlockReaderTestFixture : public ::testing::Test void encodeColumns(TableInfo & table_info, std::vector & fields, RowEncodeVersion row_version) { // for later check + std::unordered_map column_name_columns_index_map; for (size_t i = 0; i < table_info.columns.size(); i++) - fields_map_.emplace(table_info.columns[i].id, fields[i]); + { + fields_map.emplace(table_info.columns[i].id, fields[i]); + column_name_columns_index_map.emplace(table_info.columns[i].name, i); + } std::vector value_fields; std::vector pk_fields; @@ -72,13 +76,13 @@ class RegionBlockReaderTestFixture : public ::testing::Test auto & primary_index_info = table_info.getPrimaryIndexInfo(); for (size_t i = 0; i < primary_index_info.idx_cols.size(); i++) { - size_t pk_offset = primary_index_info.idx_cols[i].offset; - EncodeDatum(pk_fields[i], table_info.columns[pk_offset].getCodecFlag(), pk_buf); + auto idx = column_name_columns_index_map[primary_index_info.idx_cols[i].name]; + EncodeDatum(pk_fields[i], table_info.columns[idx].getCodecFlag(), pk_buf); } } else { - DB::EncodeInt64(handle_value_, pk_buf); + DB::EncodeInt64(handle_value, pk_buf); } RawTiDBPK pk{std::make_shared(pk_buf.releaseStr())}; // create value @@ -96,44 +100,44 @@ class RegionBlockReaderTestFixture : public ::testing::Test throw Exception("Unknown row format " + std::to_string(row_version), ErrorCodes::LOGICAL_ERROR); } auto row_value = std::make_shared(std::move(value_buf.str())); - for (size_t i = 0; i < rows_; i++) - data_list_read_.emplace_back(pk, del_mark_value_, version_value_, row_value); + for (size_t i = 0; i < rows; i++) + data_list_read.emplace_back(pk, del_mark_value, version_value, row_value); } void checkBlock(DecodingStorageSchemaSnapshotConstPtr decoding_schema, const Block & block) const { ASSERT_EQ(block.columns(), decoding_schema->column_defines->size()); - for (size_t row = 0; row < rows_; row++) + for
(size_t row = 0; row < rows; row++) { for (size_t pos = 0; pos < block.columns(); pos++) { - auto & column_element = block.getByPosition(pos); + const auto & column_element = block.getByPosition(pos); if (row == 0) { - ASSERT_EQ(column_element.column->size(), rows_); + ASSERT_EQ(column_element.column->size(), rows); } if (column_element.name == EXTRA_HANDLE_COLUMN_NAME) { if (decoding_schema->is_common_handle) { - ASSERT_EQ((*column_element.column)[row], Field(*std::get<0>(data_list_read_[row]))); + ASSERT_EQ((*column_element.column)[row], Field(*std::get<0>(data_list_read[row]))); } else { - ASSERT_EQ((*column_element.column)[row], Field(handle_value_)); + ASSERT_EQ((*column_element.column)[row], Field(handle_value)); } } else if (column_element.name == VERSION_COLUMN_NAME) { - ASSERT_EQ((*column_element.column)[row], Field(version_value_)); + ASSERT_EQ((*column_element.column)[row], Field(version_value)); } else if (column_element.name == TAG_COLUMN_NAME) { - ASSERT_EQ((*column_element.column)[row], Field(NearestFieldType::Type(del_mark_value_))); + ASSERT_EQ((*column_element.column)[row], Field(NearestFieldType::Type(del_mark_value))); } else { - ASSERT_EQ((*column_element.column)[row], fields_map_.at(column_element.column_id)); + ASSERT_EQ((*column_element.column)[row], fields_map.at(column_element.column_id)); } } } @@ -143,7 +147,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test { RegionBlockReader reader{decoding_schema}; Block block = createBlockSortByColumnID(decoding_schema); - if (!reader.read(block, data_list_read_, force_decode)) + if (!reader.read(block, data_list_read, force_decode)) return false; checkBlock(decoding_schema, block); @@ -155,7 +159,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test return getTableInfoAndFields( handle_ids, is_common_handle, - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(3, std::numeric_limits::max()), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(9, String("aaa")), @@ -170,7 +174,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test handle_ids, is_common_handle, ColumnIDValue(1, String("")), - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(3, std::numeric_limits::max()), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(8, String("")), @@ -182,12 +186,12 @@ class RegionBlockReaderTestFixture : public ::testing::Test // add default value for missing column std::vector missing_column_ids{1, 8, 13}; String missing_column_default_value = String("default"); - for (size_t i = 0; i < table_info.columns.size(); i++) + for (auto & column : table_info.columns) { - if (std::find(missing_column_ids.begin(), missing_column_ids.end(), table_info.columns[i].id) != missing_column_ids.end()) + if (std::find(missing_column_ids.begin(), missing_column_ids.end(), column.id) != missing_column_ids.end()) { - table_info.columns[i].origin_default_value = missing_column_default_value; - fields_map_.emplace(table_info.columns[i].id, Field(missing_column_default_value)); + column.origin_default_value = missing_column_default_value; + fields_map.emplace(column.id, Field(missing_column_default_value)); } } return table_info; @@ -199,7 +203,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test std::tie(table_info, std::ignore) = getTableInfoAndFields( handle_ids, is_common_handle, - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(9, String("aaa")), 
ColumnIDValue(10, DecimalField(ToDecimal(12345678910ULL, 4), 4))); @@ -212,7 +216,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test std::tie(table_info, std::ignore) = getTableInfoAndFields( handle_ids, is_common_handle, - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(3, std::numeric_limits::max()), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(9, String("aaa")), @@ -227,7 +231,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test std::tie(table_info, std::ignore) = getTableInfoAndFields( handle_ids, is_common_handle, - ColumnIDValue(2, handle_value_), + ColumnIDValue(2, handle_value), ColumnIDValue(3, std::numeric_limits::max()), ColumnIDValue(4, std::numeric_limits::min()), ColumnIDValue(9, String("aaa")), diff --git a/dbms/src/Storages/Transaction/tests/gtest_table_info.cpp b/dbms/src/Storages/Transaction/tests/gtest_table_info.cpp index 516a173b151..871153cb0e9 100644 --- a/dbms/src/Storages/Transaction/tests/gtest_table_info.cpp +++ b/dbms/src/Storages/Transaction/tests/gtest_table_info.cpp @@ -42,7 +42,7 @@ struct ParseCase std::function check; }; -TEST(TiDBTableInfo_test, ParseFromJSON) +TEST(TiDBTableInfoTest, ParseFromJSON) try { auto cases = { @@ -136,54 +136,54 @@ struct StmtCase } }; -TEST(TiDBTableInfo_test, GenCreateTableStatement) +TEST(TiDBTableInfoTest, GenCreateTableStatement) try { auto cases = // {StmtCase{ 1145, // R"json({"id":1939,"db_name":{"O":"customer","L":"customer"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":1145,"name":{"O":"customerdebt","L":"customerdebt"},"cols":[{"id":1,"name":{"O":"id","L":"id"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"type":{"Tp":8,"Flag":515,"Flen":20,"Decimal":0},"state":5,"comment":"i\"d"}],"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"负债信息","partition":null})json", // - R"stmt(CREATE TABLE `customer`.`customerdebt`(`id` Int64) Engine = DeltaMerge((`id`), '{"cols":[{"comment":"i\\"d","default":null,"default_bit":null,"id":1,"name":{"L":"id","O":"id"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":null,"Collate":null,"Decimal":0,"Elems":null,"Flag":515,"Flen":20,"Tp":8}}],"comment":"\\u8D1F\\u503A\\u4FE1\\u606F","id":1145,"index_info":[],"is_common_handle":false,"name":{"L":"customerdebt","O":"customerdebt"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":0}'))stmt", // + R"json({"id":1145,"name":{"O":"customerdebt","L":"customerdebt"},"cols":[{"id":1,"name":{"O":"id","L":"id"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"type":{"Tp":8,"Flag":515,"Flen":20,"Decimal":0},"state":5,"comment":"i\"d"}],"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"负债信息","partition":null,"tiflash_mode":"fast"})json", // + R"stmt(CREATE TABLE `customer`.`customerdebt`(`id` Int64) Engine = DeltaMerge((`id`), '{"cols":[{"comment":"i\\"d","default":null,"default_bit":null,"id":1,"name":{"L":"id","O":"id"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":null,"Collate":null,"Decimal":0,"Elems":null,"Flag":515,"Flen":20,"Tp":8}}],"comment":"\\u8D1F\\u503A\\u4FE1\\u606F","id":1145,"index_info":[],"is_common_handle":false,"name":{"L":"customerdebt","O":"customerdebt"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_mode":"fast","tiflash_replica":{"Count":0},"update_timestamp":0}'))stmt", // }, StmtCase{ 2049, // 
R"json({"id":1939,"db_name":{"O":"customer","L":"customer"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":2049,"name":{"O":"customerdebt","L":"customerdebt"},"cols":[{"id":1,"name":{"O":"id","L":"id"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"type":{"Tp":8,"Flag":515,"Flen":20,"Decimal":0},"state":5,"comment":"i\"d"}],"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"负债信息","update_timestamp":404545295996944390,"partition":null})json", // - R"stmt(CREATE TABLE `customer`.`customerdebt`(`id` Int64) Engine = DeltaMerge((`id`), '{"cols":[{"comment":"i\\"d","default":null,"default_bit":null,"id":1,"name":{"L":"id","O":"id"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":null,"Collate":null,"Decimal":0,"Elems":null,"Flag":515,"Flen":20,"Tp":8}}],"comment":"\\u8D1F\\u503A\\u4FE1\\u606F","id":2049,"index_info":[],"is_common_handle":false,"name":{"L":"customerdebt","O":"customerdebt"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404545295996944390}'))stmt", // + R"json({"id":2049,"name":{"O":"customerdebt","L":"customerdebt"},"cols":[{"id":1,"name":{"O":"id","L":"id"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"type":{"Tp":8,"Flag":515,"Flen":20,"Decimal":0},"state":5,"comment":"i\"d"}],"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"负债信息","update_timestamp":404545295996944390,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `customer`.`customerdebt`(`id` Int64) Engine = DeltaMerge((`id`), '{"cols":[{"comment":"i\\"d","default":null,"default_bit":null,"id":1,"name":{"L":"id","O":"id"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":null,"Collate":null,"Decimal":0,"Elems":null,"Flag":515,"Flen":20,"Tp":8}}],"comment":"\\u8D1F\\u503A\\u4FE1\\u606F","id":2049,"index_info":[],"is_common_handle":false,"name":{"L":"customerdebt","O":"customerdebt"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404545295996944390}'))stmt", // }, StmtCase{ 31, // R"json({"id":1,"db_name":{"O":"db1","L":"db1"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":31,"name":{"O":"simple_t","L":"simple_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545295996944390,"ShardRowIDBits":0,"partition":null})json", // - R"stmt(CREATE TABLE `db1`.`simple_t`(`i` Nullable(Int32), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":0,"Flen":11,"Tp":3}}],"comment":"","id":31,"index_info":[],"is_common_handle":false,"name":{"L":"simple_t","O":"simple_t"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404545295996944390}'))stmt", // + 
R"json({"id":31,"name":{"O":"simple_t","L":"simple_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545295996944390,"ShardRowIDBits":0,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `db1`.`simple_t`(`i` Nullable(Int32), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":0,"Flen":11,"Tp":3}}],"comment":"","id":31,"index_info":[],"is_common_handle":false,"name":{"L":"simple_t","O":"simple_t"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404545295996944390}'))stmt", // }, StmtCase{ 33, // R"json({"id":2,"db_name":{"O":"db2","L":"db2"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":33,"name":{"O":"pk_t","L":"pk_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":3,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545312978108418,"ShardRowIDBits":0,"partition":null})json", // - R"stmt(CREATE TABLE `db2`.`pk_t`(`i` Int32) Engine = DeltaMerge((`i`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":3,"Flen":11,"Tp":3}}],"comment":"","id":33,"index_info":[],"is_common_handle":false,"name":{"L":"pk_t","O":"pk_t"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404545312978108418}'))stmt", // + R"json({"id":33,"name":{"O":"pk_t","L":"pk_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":3,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545312978108418,"ShardRowIDBits":0,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `db2`.`pk_t`(`i` Int32) Engine = DeltaMerge((`i`), 
'{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":3,"Flen":11,"Tp":3}}],"comment":"","id":33,"index_info":[],"is_common_handle":false,"name":{"L":"pk_t","O":"pk_t"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404545312978108418}'))stmt", // }, StmtCase{ 35, // R"json({"id":1,"db_name":{"O":"db1","L":"db1"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - R"json({"id":35,"name":{"O":"not_null_t","L":"not_null_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4097,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545324922961926,"ShardRowIDBits":0,"partition":null})json", // - R"stmt(CREATE TABLE `db1`.`not_null_t`(`i` Int32, `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":4097,"Flen":11,"Tp":3}}],"comment":"","id":35,"index_info":[],"is_common_handle":false,"name":{"L":"not_null_t","O":"not_null_t"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404545324922961926}'))stmt", // + R"json({"id":35,"name":{"O":"not_null_t","L":"not_null_t"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4097,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":""}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":404545324922961926,"ShardRowIDBits":0,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `db1`.`not_null_t`(`i` Int32, `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":4097,"Flen":11,"Tp":3}}],"comment":"","id":35,"index_info":[],"is_common_handle":false,"name":{"L":"not_null_t","O":"not_null_t"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404545324922961926}'))stmt", // }, StmtCase{ 37, // R"json({"id":2,"db_name":{"O":"db2","L":"db2"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", - 
R"json({"id":37,"name":{"O":"mytable","L":"mytable"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"mycol","L":"mycol"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":4099,"Flen":256,"Decimal":0,"Charset":"utf8","Collate":"utf8_bin","Elems":null},"state":5,"comment":""}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"mycol","L":"mycol"},"offset":0,"length":-1}],"is_unique":true,"is_primary":true,"state":5,"comment":"","index_type":1}],"fk_info":null,"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":1,"update_timestamp":404566455285710853,"ShardRowIDBits":0,"partition":null})json", // - R"stmt(CREATE TABLE `db2`.`mytable`(`mycol` String) Engine = DeltaMerge((`mycol`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"mycol","O":"mycol"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"utf8","Collate":"utf8_bin","Decimal":0,"Elems":null,"Flag":4099,"Flen":256,"Tp":15}}],"comment":"","id":37,"index_info":[],"is_common_handle":false,"name":{"L":"mytable","O":"mytable"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":404566455285710853}'))stmt", // + R"json({"id":37,"name":{"O":"mytable","L":"mytable"},"charset":"","collate":"","cols":[{"id":1,"name":{"O":"mycol","L":"mycol"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":4099,"Flen":256,"Decimal":0,"Charset":"utf8","Collate":"utf8_bin","Elems":null},"state":5,"comment":""}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"mycol","L":"mycol"},"offset":0,"length":-1}],"is_unique":true,"is_primary":true,"state":5,"comment":"","index_type":1}],"fk_info":null,"state":5,"pk_is_handle":true,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":1,"update_timestamp":404566455285710853,"ShardRowIDBits":0,"partition":null,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `db2`.`mytable`(`mycol` String) Engine = DeltaMerge((`mycol`), '{"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"mycol","O":"mycol"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"utf8","Collate":"utf8_bin","Decimal":0,"Elems":null,"Flag":4099,"Flen":256,"Tp":15}}],"comment":"","id":37,"index_info":[],"is_common_handle":false,"name":{"L":"mytable","O":"mytable"},"partition":null,"pk_is_handle":true,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":404566455285710853}'))stmt", // }, StmtCase{ 32, // R"json({"id":1,"db_name":{"O":"test","L":"test"},"charset":"utf8mb4","collate":"utf8mb4_bin","state":5})json", // - 
R"json({"id":31,"name":{"O":"range_part_t","L":"range_part_t"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","version":0}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":407445773801488390,"ShardRowIDBits":0,"partition":{"type":1,"expr":"`i`","columns":null,"enable":true,"definitions":[{"id":32,"name":{"O":"p0","L":"p0"},"less_than":["0"]},{"id":33,"name":{"O":"p1","L":"p1"},"less_than":["100"]}],"num":0},"compression":"","version":1})json", // - R"stmt(CREATE TABLE `test`.`range_part_t_32`(`i` Nullable(Int32), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"belonging_table_id":31,"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":0,"Flen":11,"Tp":3}}],"comment":"","id":32,"index_info":[],"is_common_handle":false,"is_partition_sub_table":true,"name":{"L":"range_part_t_32","O":"range_part_t_32"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_replica":{"Count":0},"update_timestamp":407445773801488390}'))stmt", // + R"json({"id":31,"name":{"O":"range_part_t","L":"range_part_t"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"i","L":"i"},"offset":0,"origin_default":null,"default":null,"default_bit":null,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","version":0}],"index_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"schema_version":-1,"comment":"","auto_inc_id":0,"max_col_id":1,"max_idx_id":0,"update_timestamp":407445773801488390,"ShardRowIDBits":0,"partition":{"type":1,"expr":"`i`","columns":null,"enable":true,"definitions":[{"id":32,"name":{"O":"p0","L":"p0"},"less_than":["0"]},{"id":33,"name":{"O":"p1","L":"p1"},"less_than":["100"]}],"num":0},"compression":"","version":1,"tiflash_mode":""})json", // + R"stmt(CREATE TABLE `test`.`range_part_t_32`(`i` Nullable(Int32), `_tidb_rowid` Int64) Engine = DeltaMerge((`_tidb_rowid`), '{"belonging_table_id":31,"cols":[{"comment":"","default":null,"default_bit":null,"id":1,"name":{"L":"i","O":"i"},"offset":0,"origin_default":null,"state":5,"type":{"Charset":"binary","Collate":"binary","Decimal":0,"Elems":null,"Flag":0,"Flen":11,"Tp":3}}],"comment":"","id":32,"index_info":[],"is_common_handle":false,"is_partition_sub_table":true,"name":{"L":"range_part_t_32","O":"range_part_t_32"},"partition":null,"pk_is_handle":false,"schema_version":-1,"state":5,"tiflash_mode":"","tiflash_replica":{"Count":0},"update_timestamp":407445773801488390}'))stmt", // }}; - for (auto & c : cases) + for (const auto & c : cases) { c.verifyTableInfo(); } diff --git a/dbms/src/TestUtils/ExecutorTestUtils.cpp b/dbms/src/TestUtils/ExecutorTestUtils.cpp index 881ebaf88db..634e483abd2 100644 --- a/dbms/src/TestUtils/ExecutorTestUtils.cpp +++ b/dbms/src/TestUtils/ExecutorTestUtils.cpp @@ -104,41 +104,39 @@ Block mergeBlocks(Blocks blocks) return Block(actual_columns); } -void readBlock(BlockInputStreamPtr 
stream, const ColumnsWithTypeAndName & expect_columns) +DB::ColumnsWithTypeAndName readBlock(BlockInputStreamPtr stream) { Blocks actual_blocks; - Block except_block(expect_columns); stream->readPrefix(); while (auto block = stream->read()) { actual_blocks.push_back(block); } stream->readSuffix(); - Block actual_block = mergeBlocks(actual_blocks); - ASSERT_BLOCK_EQ(except_block, actual_block); + return mergeBlocks(actual_blocks).getColumnsWithTypeAndName(); } } // namespace -void ExecutorTest::executeStreams(const std::shared_ptr & request, std::unordered_map & source_columns_map, const ColumnsWithTypeAndName & expect_columns, size_t concurrency) +DB::ColumnsWithTypeAndName ExecutorTest::executeStreams(const std::shared_ptr & request, std::unordered_map & source_columns_map, size_t concurrency) { DAGContext dag_context(*request, "executor_test", concurrency); dag_context.setColumnsForTest(source_columns_map); context.context.setDAGContext(&dag_context); // Currently, don't care about regions information in tests. DAGQuerySource dag(context.context); - readBlock(executeQuery(dag, context.context, false, QueryProcessingStage::Complete).in, expect_columns); + return readBlock(executeQuery(dag, context.context, false, QueryProcessingStage::Complete).in); } -void ExecutorTest::executeStreams(const std::shared_ptr & request, const ColumnsWithTypeAndName & expect_columns, size_t concurrency) +DB::ColumnsWithTypeAndName ExecutorTest::executeStreams(const std::shared_ptr & request, size_t concurrency) { - executeStreams(request, context.executorIdColumnsMap(), expect_columns, concurrency); + return executeStreams(request, context.executorIdColumnsMap(), concurrency); } -void ExecutorTest::executeStreamsWithSingleSource(const std::shared_ptr & request, const ColumnsWithTypeAndName & source_columns, const ColumnsWithTypeAndName & expect_columns, SourceType type, size_t concurrency) +DB::ColumnsWithTypeAndName ExecutorTest::executeStreamsWithSingleSource(const std::shared_ptr & request, const ColumnsWithTypeAndName & source_columns, SourceType type, size_t concurrency) { std::unordered_map source_columns_map; source_columns_map[getSourceName(type)] = source_columns; - executeStreams(request, source_columns_map, expect_columns, concurrency); + return executeStreams(request, source_columns_map, concurrency); } void ExecutorTest::dagRequestEqual(const String & expected_string, const std::shared_ptr & actual) diff --git a/dbms/src/TestUtils/ExecutorTestUtils.h b/dbms/src/TestUtils/ExecutorTestUtils.h index 87bb7115bed..59b829e04b5 100644 --- a/dbms/src/TestUtils/ExecutorTestUtils.h +++ b/dbms/src/TestUtils/ExecutorTestUtils.h @@ -25,6 +25,9 @@ namespace DB::tests { void executeInterpreter(const std::shared_ptr & request, Context & context); + +::testing::AssertionResult check_columns_equality(const ColumnsWithTypeAndName & expected, const ColumnsWithTypeAndName & actual, bool _restrict); + class ExecutorTest : public ::testing::Test { protected: @@ -72,20 +75,17 @@ class ExecutorTest : public ::testing::Test } } - void executeStreams( + ColumnsWithTypeAndName executeStreams( const std::shared_ptr & request, std::unordered_map & source_columns_map, - const ColumnsWithTypeAndName & expect_columns, size_t concurrency = 1); - void executeStreams( + ColumnsWithTypeAndName executeStreams( const std::shared_ptr & request, - const ColumnsWithTypeAndName & expect_columns, size_t concurrency = 1); - void executeStreamsWithSingleSource( + ColumnsWithTypeAndName executeStreamsWithSingleSource( const std::shared_ptr & 
request, const ColumnsWithTypeAndName & source_columns, - const ColumnsWithTypeAndName & expect_columns, SourceType type = TableScan, size_t concurrency = 1); @@ -96,4 +96,4 @@ class ExecutorTest : public ::testing::Test #define ASSERT_DAGREQUEST_EQAUL(str, request) dagRequestEqual((str), (request)); #define ASSERT_BLOCKINPUTSTREAM_EQAUL(str, request, concurrency) executeInterpreter((str), (request), (concurrency)) -} // namespace DB::tests \ No newline at end of file +} // namespace DB::tests diff --git a/dbms/src/TestUtils/FunctionTestUtils.cpp b/dbms/src/TestUtils/FunctionTestUtils.cpp index 637fbf51c00..1c8b0242bfa 100644 --- a/dbms/src/TestUtils/FunctionTestUtils.cpp +++ b/dbms/src/TestUtils/FunctionTestUtils.cpp @@ -13,7 +13,9 @@ // limitations under the License. #include +#include #include +#include #include #include #include @@ -23,7 +25,10 @@ #include #include #include -#include + +#include +#include + namespace DB { @@ -103,21 +108,118 @@ ::testing::AssertionResult columnEqual( return columnEqual(expected.column, actual.column); } -void blockEqual( +::testing::AssertionResult blockEqual( const Block & expected, const Block & actual) { size_t columns = actual.columns(); + size_t expected_columns = expected.columns(); - ASSERT_TRUE(expected.columns() == columns); + ASSERT_EQUAL(expected_columns, columns, "Block size mismatch"); for (size_t i = 0; i < columns; ++i) { const auto & expected_col = expected.getByPosition(i); const auto & actual_col = actual.getByPosition(i); - ASSERT_TRUE(actual_col.type->getName() == expected_col.type->getName()); - ASSERT_COLUMN_EQ(expected_col.column, actual_col.column); + + auto cmp_res = columnEqual(expected_col, actual_col); + if (!cmp_res) + return cmp_res; + } + return ::testing::AssertionSuccess(); +} + +/// size of each column should be the same +std::multiset columnsToRowSet(const ColumnsWithTypeAndName & cols) +{ + if (cols.empty()) + return {}; + if (cols[0].column->empty()) + return {}; + + size_t cols_size = cols.size(); + std::vector rows{cols[0].column->size()}; + + for (auto & r : rows) + { + r.resize(cols_size, true); + } + + for (auto const & [col_id, col] : ext::enumerate(cols)) + { + for (size_t i = 0, size = col.column->size(); i < size; ++i) + { + new (rows[i].place(col_id)) Field((*col.column)[i]); + } + } + return {std::make_move_iterator(rows.begin()), std::make_move_iterator(rows.end())}; +} + +::testing::AssertionResult columnsEqual( + const ColumnsWithTypeAndName & expected, + const ColumnsWithTypeAndName & actual, + bool _restrict) +{ + if (_restrict) + return blockEqual(Block(expected), Block(actual)); + + auto expect_cols_size = expected.size(); + auto actual_cols_size = actual.size(); + + ASSERT_EQUAL(expect_cols_size, actual_cols_size, "Columns size mismatch"); + + for (size_t i = 0; i < expect_cols_size; ++i) + { + auto const & expect_col = expected[i]; + auto const & actual_col = actual[i]; + ASSERT_EQUAL(expect_col.column->getName(), actual_col.column->getName(), fmt::format("Column {} name mismatch", i)); + ASSERT_EQUAL(expect_col.column->size(), actual_col.column->size(), fmt::format("Column {} size mismatch", i)); + auto type_eq = dataTypeEqual(expected[i].type, actual[i].type); + if (!type_eq) + return type_eq; + } + + auto const expected_row_set = columnsToRowSet(expected); + auto const actual_row_set = columnsToRowSet(actual); + + if (expected_row_set != actual_row_set) + { + FmtBuffer buf; + + auto expect_it = expected_row_set.begin(); + auto actual_it = actual_row_set.begin(); + + buf.append("Columns row set 
mismatch\n").append("expected_row_set:\n"); + for (; expect_it != expected_row_set.end(); ++expect_it, ++actual_it) + { + buf.joinStr( + expect_it->begin(), + expect_it->end(), + [](const auto & v, FmtBuffer & fb) { fb.append(v.toString()); }, + " ") + .append("\n"); + if (*expect_it != *actual_it) + break; + } + + ++actual_it; + + buf.append("...\nactual_row_set:\n"); + for (auto it = actual_row_set.begin(); it != actual_it; ++it) + { + buf.joinStr( + it->begin(), + it->end(), + [](const auto & v, FmtBuffer & fb) { fb.append(v.toString()); }, + " ") + .append("\n"); + } + buf.append("...\n"); + + return testing::AssertionFailure() << buf.toString(); } + + return testing::AssertionSuccess(); } std::pair buildFunction( @@ -274,5 +376,64 @@ ColumnWithTypeAndName toNullableDatetimeVec(String name, const std::vector(fsp)); return {makeColumn>(data_type, vec), data_type, name, 0}; } + +String getColumnsContent(const ColumnsWithTypeAndName & cols) +{ + if (cols.size() <= 0) + return ""; + return getColumnsContent(cols, 0, cols[0].column->size() - 1); +} + +String getColumnsContent(const ColumnsWithTypeAndName & cols, size_t begin, size_t end) +{ + const size_t col_num = cols.size(); + if (col_num <= 0) + return ""; + + const size_t col_size = cols[0].column->size(); + assert(begin <= end); + assert(col_size > end); + assert(col_size > begin); + + bool is_same = true; + + for (size_t i = 1; i < col_num; ++i) + { + if (cols[i].column->size() != col_size) + is_same = false; + } + + assert(is_same); /// Ensure the sizes of columns in cols are the same + + std::vector> col_content; + FmtBuffer fmt_buf; + for (size_t i = 0; i < col_num; ++i) + { + /// Push the column name + fmt_buf.append(fmt::format("{}: (", cols[i].name)); + for (size_t j = begin; j <= end; ++j) + col_content.push_back(std::make_pair(j, (*cols[i].column)[j].toString())); + + /// Add content + fmt_buf.joinStr( + col_content.begin(), + col_content.end(), + [](const auto & content, FmtBuffer & fmt_buf) { + fmt_buf.append(fmt::format("{}: {}", content.first, content.second)); + }, + ", "); + + fmt_buf.append(")\n"); + col_content.clear(); + } + + return fmt_buf.toString(); +} + +ColumnsWithTypeAndName createColumns(const ColumnsWithTypeAndName & cols) +{ + return cols; +} + } // namespace tests } // namespace DB diff --git a/dbms/src/TestUtils/FunctionTestUtils.h b/dbms/src/TestUtils/FunctionTestUtils.h index d6b7351df05..8680d1886b1 100644 --- a/dbms/src/TestUtils/FunctionTestUtils.h +++ b/dbms/src/TestUtils/FunctionTestUtils.h @@ -514,6 +514,17 @@ ColumnWithTypeAndName createConstColumn( return createConstColumn(data_type_args, size, InferredFieldType(std::nullopt), name); } +String getColumnsContent(const ColumnsWithTypeAndName & cols); + +/// We can designate the range of columns printed with begin and end. range: [begin, end] +String getColumnsContent(const ColumnsWithTypeAndName & cols, size_t begin, size_t end); + +// This wrapper function only serves to construct columns input for function-like macros, +// since preprocessor recognizes `{col1, col2, col3}` as three arguments instead of one. +// E.g. preprocessor does not allow us to write `ASSERT_COLUMNS_EQ_R({col1, col2, col3}, actual_cols)`, +// but with this func we can write `ASSERT_COLUMNS_EQ_R(createColumns{col1, col2, col3}, actual_cols)` instead. 
+ColumnsWithTypeAndName createColumns(const ColumnsWithTypeAndName & cols); + ::testing::AssertionResult dataTypeEqual( const DataTypePtr & expected, const DataTypePtr & actual); @@ -527,10 +538,15 @@ ::testing::AssertionResult columnEqual( const ColumnWithTypeAndName & expected, const ColumnWithTypeAndName & actual); -void blockEqual( +::testing::AssertionResult blockEqual( const Block & expected, const Block & actual); +::testing::AssertionResult columnsEqual( + const ColumnsWithTypeAndName & expected, + const ColumnsWithTypeAndName & actual, + bool _restrict); + ColumnWithTypeAndName executeFunction( Context & context, const String & func_name, @@ -756,5 +772,10 @@ class FunctionTest : public ::testing::Test #define ASSERT_COLUMN_EQ(expected, actual) ASSERT_TRUE(DB::tests::columnEqual((expected), (actual))) #define ASSERT_BLOCK_EQ(expected, actual) DB::tests::blockEqual((expected), (actual)) + +/// Strictly check columns equality: both the data set and each row's offset should be the same +#define ASSERT_COLUMNS_EQ_R(expected, actual) ASSERT_TRUE(DB::tests::columnsEqual((expected), (actual), true)) +/// Loosely check columns equality: only check that the data sets are equal +#define ASSERT_COLUMNS_EQ_UR(expected, actual) ASSERT_TRUE(DB::tests::columnsEqual((expected), (actual), false)) } // namespace tests } // namespace DB diff --git a/dbms/src/TestUtils/TiFlashTestEnv.cpp b/dbms/src/TestUtils/TiFlashTestEnv.cpp index cbd42b57550..f44298cbafd 100644 --- a/dbms/src/TestUtils/TiFlashTestEnv.cpp +++ b/dbms/src/TestUtils/TiFlashTestEnv.cpp @@ -39,6 +39,11 @@ void TiFlashTestEnv::initializeGlobalContext(Strings testdata_path, PageStorageR KeyManagerPtr key_manager = std::make_shared(false); global_context->initializeFileProvider(key_manager, false); + // initialize background & blockable background thread pool + Settings & settings = global_context->getSettingsRef(); + global_context->initializeBackgroundPool(settings.background_pool_size); + global_context->initializeBlockableBackgroundPool(settings.background_pool_size); + // These global variables should be initialized in the following order // 1. capacity // 2. path pool diff --git a/dbms/src/TestUtils/bench_dbms_main.cpp b/dbms/src/TestUtils/bench_dbms_main.cpp index 48bd02a71f7..092c45c35e2 100644 --- a/dbms/src/TestUtils/bench_dbms_main.cpp +++ b/dbms/src/TestUtils/bench_dbms_main.cpp @@ -20,6 +20,8 @@ int main(int argc, char * argv[]) { benchmark::Initialize(&argc, argv); DB::tests::TiFlashTestEnv::setupLogger(); + // Each time TiFlashTestEnv::getContext() is called, some logs are printed, which is annoying.
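+ // Raising the root logger to "error" suppresses those lower-severity messages for the whole benchmark binary; a benchmark that needs the output can lower the level again.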
+ Poco::Logger::root().setLevel("error"); DB::tests::TiFlashTestEnv::initializeGlobalContext(); if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; diff --git a/dbms/src/TestUtils/mockExecutor.cpp b/dbms/src/TestUtils/mockExecutor.cpp index 2cf8a939b58..30d05786c9a 100644 --- a/dbms/src/TestUtils/mockExecutor.cpp +++ b/dbms/src/TestUtils/mockExecutor.cpp @@ -35,7 +35,7 @@ ASTPtr buildLiteral(const Field & field) return std::make_shared(field); } -ASTPtr buildOrderByItemList(MockOrderByItems order_by_items) +ASTPtr buildOrderByItemVec(MockOrderByItemVec order_by_items) { std::vector vec(order_by_items.size()); size_t i = 0; @@ -92,7 +92,7 @@ std::shared_ptr DAGRequestBuilder::build(MockDAGRequestContext return dag_request_ptr; } -DAGRequestBuilder & DAGRequestBuilder::mockTable(const String & db, const String & table, const MockColumnInfos & columns) +DAGRequestBuilder & DAGRequestBuilder::mockTable(const String & db, const String & table, const MockColumnInfoVec & columns) { assert(!columns.empty()); TableInfo table_info; @@ -114,27 +114,17 @@ DAGRequestBuilder & DAGRequestBuilder::mockTable(const String & db, const String return *this; } -DAGRequestBuilder & DAGRequestBuilder::mockTable(const MockTableName & name, const MockColumnInfos & columns) +DAGRequestBuilder & DAGRequestBuilder::mockTable(const MockTableName & name, const MockColumnInfoVec & columns) { return mockTable(name.first, name.second, columns); } -DAGRequestBuilder & DAGRequestBuilder::mockTable(const MockTableName & name, const MockColumnInfoList & columns) +DAGRequestBuilder & DAGRequestBuilder::exchangeReceiver(const MockColumnInfoVec & columns, uint64_t fine_grained_shuffle_stream_count) { - return mockTable(name.first, name.second, columns); -} - -DAGRequestBuilder & DAGRequestBuilder::exchangeReceiver(const MockColumnInfos & columns) -{ - return buildExchangeReceiver(columns); + return buildExchangeReceiver(columns, fine_grained_shuffle_stream_count); } -DAGRequestBuilder & DAGRequestBuilder::exchangeReceiver(const MockColumnInfoList & columns) -{ - return buildExchangeReceiver(columns); -} - -DAGRequestBuilder & DAGRequestBuilder::buildExchangeReceiver(const MockColumnInfos & columns) +DAGRequestBuilder & DAGRequestBuilder::buildExchangeReceiver(const MockColumnInfoVec & columns, uint64_t fine_grained_shuffle_stream_count) { DAGSchema schema; for (const auto & column : columns) @@ -145,7 +135,7 @@ DAGRequestBuilder & DAGRequestBuilder::buildExchangeReceiver(const MockColumnInf schema.push_back({column.first, info}); } - root = compileExchangeReceiver(getExecutorIndex(), schema); + root = compileExchangeReceiver(getExecutorIndex(), schema, fine_grained_shuffle_stream_count); return *this; } @@ -180,33 +170,23 @@ DAGRequestBuilder & DAGRequestBuilder::topN(ASTPtr order_exprs, ASTPtr limit_exp DAGRequestBuilder & DAGRequestBuilder::topN(const String & col_name, bool desc, int limit) { assert(root); - root = compileTopN(root, getExecutorIndex(), buildOrderByItemList({{col_name, desc}}), buildLiteral(Field(static_cast(limit)))); + root = compileTopN(root, getExecutorIndex(), buildOrderByItemVec({{col_name, desc}}), buildLiteral(Field(static_cast(limit)))); return *this; } -DAGRequestBuilder & DAGRequestBuilder::topN(MockOrderByItems order_by_items, int limit) +DAGRequestBuilder & DAGRequestBuilder::topN(MockOrderByItemVec order_by_items, int limit) { return topN(order_by_items, buildLiteral(Field(static_cast(limit)))); } -DAGRequestBuilder & DAGRequestBuilder::topN(MockOrderByItems order_by_items, ASTPtr 
limit_expr) -{ - assert(root); - root = compileTopN(root, getExecutorIndex(), buildOrderByItemList(order_by_items), limit_expr); - return *this; -} - -DAGRequestBuilder & DAGRequestBuilder::project(const String & col_name) +DAGRequestBuilder & DAGRequestBuilder::topN(MockOrderByItemVec order_by_items, ASTPtr limit_expr) { assert(root); - auto exp_list = std::make_shared(); - exp_list->children.push_back(buildColumn(col_name)); - - root = compileProject(root, getExecutorIndex(), exp_list); + root = compileTopN(root, getExecutorIndex(), buildOrderByItemVec(order_by_items), limit_expr); return *this; } -DAGRequestBuilder & DAGRequestBuilder::project(MockAsts exprs) +DAGRequestBuilder & DAGRequestBuilder::project(MockAstVec exprs) { assert(root); auto exp_list = std::make_shared(); @@ -218,7 +198,7 @@ DAGRequestBuilder & DAGRequestBuilder::project(MockAsts exprs) return *this; } -DAGRequestBuilder & DAGRequestBuilder::project(MockColumnNames col_names) +DAGRequestBuilder & DAGRequestBuilder::project(MockColumnNameVec col_names) { assert(root); auto exp_list = std::make_shared(); @@ -237,12 +217,12 @@ DAGRequestBuilder & DAGRequestBuilder::exchangeSender(tipb::ExchangeType exchang return *this; } -DAGRequestBuilder & DAGRequestBuilder::join(const DAGRequestBuilder & right, MockAsts exprs) +DAGRequestBuilder & DAGRequestBuilder::join(const DAGRequestBuilder & right, MockAstVec exprs) { return join(right, exprs, ASTTableJoin::Kind::Inner); } -DAGRequestBuilder & DAGRequestBuilder::join(const DAGRequestBuilder & right, MockAsts exprs, ASTTableJoin::Kind kind) +DAGRequestBuilder & DAGRequestBuilder::join(const DAGRequestBuilder & right, MockAstVec exprs, ASTTableJoin::Kind kind) { assert(root); assert(right.root); @@ -268,7 +248,7 @@ DAGRequestBuilder & DAGRequestBuilder::aggregation(ASTPtr agg_func, ASTPtr group return buildAggregation(agg_funcs, group_by_exprs); } -DAGRequestBuilder & DAGRequestBuilder::aggregation(MockAsts agg_funcs, MockAsts group_by_exprs) +DAGRequestBuilder & DAGRequestBuilder::aggregation(MockAstVec agg_funcs, MockAstVec group_by_exprs) { auto agg_func_list = std::make_shared(); auto group_by_expr_list = std::make_shared(); @@ -286,85 +266,63 @@ DAGRequestBuilder & DAGRequestBuilder::buildAggregation(ASTPtr agg_funcs, ASTPtr return *this; } -DAGRequestBuilder & DAGRequestBuilder::window(ASTPtr window_func, MockOrderByItem order_by, MockPartitionByItem partition_by, MockWindowFrame frame) +DAGRequestBuilder & DAGRequestBuilder::window(ASTPtr window_func, MockOrderByItem order_by, MockPartitionByItem partition_by, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count) { assert(root); auto window_func_list = std::make_shared(); window_func_list->children.push_back(window_func); - root = compileWindow(root, getExecutorIndex(), window_func_list, buildOrderByItemList({partition_by}), buildOrderByItemList({order_by}), frame); + root = compileWindow(root, getExecutorIndex(), window_func_list, buildOrderByItemVec({partition_by}), buildOrderByItemVec({order_by}), frame, fine_grained_shuffle_stream_count); return *this; } -DAGRequestBuilder & DAGRequestBuilder::window(ASTPtr window_func, MockOrderByItems order_by_list, MockPartitionByItems partition_by_list, MockWindowFrame frame) +DAGRequestBuilder & DAGRequestBuilder::window(ASTPtr window_func, MockOrderByItemVec order_by_vec, MockPartitionByItemVec partition_by_vec, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count) { assert(root); auto window_func_list = std::make_shared(); 
window_func_list->children.push_back(window_func); - root = compileWindow(root, getExecutorIndex(), window_func_list, buildOrderByItemList(partition_by_list), buildOrderByItemList(order_by_list), frame); + root = compileWindow(root, getExecutorIndex(), window_func_list, buildOrderByItemVec(partition_by_vec), buildOrderByItemVec(order_by_vec), frame, fine_grained_shuffle_stream_count); return *this; } -DAGRequestBuilder & DAGRequestBuilder::window(MockAsts window_funcs, MockOrderByItems order_by_list, MockPartitionByItems partition_by_list, MockWindowFrame frame) +DAGRequestBuilder & DAGRequestBuilder::window(MockAstVec window_funcs, MockOrderByItemVec order_by_vec, MockPartitionByItemVec partition_by_vec, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count) { assert(root); auto window_func_list = std::make_shared(); for (const auto & func : window_funcs) window_func_list->children.push_back(func); - root = compileWindow(root, getExecutorIndex(), window_func_list, buildOrderByItemList(partition_by_list), buildOrderByItemList(order_by_list), frame); + root = compileWindow(root, getExecutorIndex(), window_func_list, buildOrderByItemVec(partition_by_vec), buildOrderByItemVec(order_by_vec), frame, fine_grained_shuffle_stream_count); return *this; } -DAGRequestBuilder & DAGRequestBuilder::sort(MockOrderByItem order_by, bool is_partial_sort) +DAGRequestBuilder & DAGRequestBuilder::sort(MockOrderByItem order_by, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count) { assert(root); - root = compileSort(root, getExecutorIndex(), buildOrderByItemList({order_by}), is_partial_sort); + root = compileSort(root, getExecutorIndex(), buildOrderByItemVec({order_by}), is_partial_sort, fine_grained_shuffle_stream_count); return *this; } -DAGRequestBuilder & DAGRequestBuilder::sort(MockOrderByItems order_by_list, bool is_partial_sort) +DAGRequestBuilder & DAGRequestBuilder::sort(MockOrderByItemVec order_by_vec, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count) { assert(root); - root = compileSort(root, getExecutorIndex(), buildOrderByItemList(order_by_list), is_partial_sort); + root = compileSort(root, getExecutorIndex(), buildOrderByItemVec(order_by_vec), is_partial_sort, fine_grained_shuffle_stream_count); return *this; } -void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfoList & columnInfos) -{ - std::vector v_column_info(columnInfos.size()); - size_t i = 0; - for (const auto & info : columnInfos) - { - v_column_info[i++] = std::move(info); - } - mock_tables[name.first + "." + name.second] = v_column_info; -} - -void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfos & columnInfos) +void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfoVec & columnInfos) { mock_tables[db + "." + table] = columnInfos; } -void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfos & columnInfos) +void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfoVec & columnInfos) { mock_tables[name.first + "." 
+ name.second] = columnInfos; } -void MockDAGRequestContext::addExchangeRelationSchema(String name, const MockColumnInfos & columnInfos) +void MockDAGRequestContext::addExchangeRelationSchema(String name, const MockColumnInfoVec & columnInfos) { exchange_schemas[name] = columnInfos; } -void MockDAGRequestContext::addExchangeRelationSchema(String name, const MockColumnInfoList & columnInfos) -{ - std::vector v_column_info(columnInfos.size()); - size_t i = 0; - for (const auto & info : columnInfos) - { - v_column_info[i++] = std::move(info); - } - exchange_schemas[name] = v_column_info; -} - void MockDAGRequestContext::addMockTableColumnData(const String & db, const String & table, ColumnsWithTypeAndName columns) { mock_table_columns[db + "." + table] = columns; @@ -380,37 +338,19 @@ void MockDAGRequestContext::addExchangeReceiverColumnData(const String & name, C mock_exchange_columns[name] = columns; } -void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfoList & columnInfos, ColumnsWithTypeAndName columns) +void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfoVec & columnInfos, ColumnsWithTypeAndName columns) { addMockTable(db, table, columnInfos); addMockTableColumnData(db, table, columns); } -void MockDAGRequestContext::addMockTable(const String & db, const String & table, const MockColumnInfos & columnInfos, ColumnsWithTypeAndName columns) -{ - addMockTable(db, table, columnInfos); - addMockTableColumnData(db, table, columns); -} - -void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfoList & columnInfos, ColumnsWithTypeAndName columns) +void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfoVec & columnInfos, ColumnsWithTypeAndName columns) { addMockTable(name, columnInfos); addMockTableColumnData(name, columns); } -void MockDAGRequestContext::addMockTable(const MockTableName & name, const MockColumnInfos & columnInfos, ColumnsWithTypeAndName columns) -{ - addMockTable(name, columnInfos); - addMockTableColumnData(name, columns); -} - -void MockDAGRequestContext::addExchangeReceiver(const String & name, MockColumnInfos columnInfos, ColumnsWithTypeAndName columns) -{ - addExchangeRelationSchema(name, columnInfos); - addExchangeReceiverColumnData(name, columns); -} - -void MockDAGRequestContext::addExchangeReceiver(const String & name, MockColumnInfoList columnInfos, ColumnsWithTypeAndName columns) +void MockDAGRequestContext::addExchangeReceiver(const String & name, MockColumnInfoVec columnInfos, ColumnsWithTypeAndName columns) { addExchangeRelationSchema(name, columnInfos); addExchangeReceiverColumnData(name, columns); @@ -428,9 +368,9 @@ DAGRequestBuilder MockDAGRequestContext::scan(String db_name, String table_name) return builder; } -DAGRequestBuilder MockDAGRequestContext::receive(String exchange_name) +DAGRequestBuilder MockDAGRequestContext::receive(String exchange_name, uint64_t fine_grained_shuffle_stream_count) { - auto builder = DAGRequestBuilder(index).exchangeReceiver(exchange_schemas[exchange_name]); + auto builder = DAGRequestBuilder(index).exchangeReceiver(exchange_schemas[exchange_name], fine_grained_shuffle_stream_count); receiver_source_task_ids_map[builder.getRoot()->name] = {}; // If there are no related columns, the user must pass input columns as an argument of executeStreams in order to run Executor tests. // If the user doesn't want to test executors, it is safe to run Interpreter tests.
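// Typical flow (hypothetical name): call addExchangeReceiver("exchange1", column_infos, columns) first, then receive("exchange1") here to build a receiver executor with that schema.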
@@ -440,5 +380,4 @@ DAGRequestBuilder MockDAGRequestContext::receive(String exchange_name) } return builder; } - } // namespace DB::tests diff --git a/dbms/src/TestUtils/mockExecutor.h b/dbms/src/TestUtils/mockExecutor.h index c11635ac93e..8b5a6d300ff 100644 --- a/dbms/src/TestUtils/mockExecutor.h +++ b/dbms/src/TestUtils/mockExecutor.h @@ -23,15 +23,14 @@ namespace DB::tests { using MockColumnInfo = std::pair<String, TiDB::TP>; -using MockColumnInfos = std::vector<MockColumnInfo>; -using MockColumnInfoList = std::initializer_list<MockColumnInfo>; +using MockColumnInfoVec = std::vector<MockColumnInfo>; using MockTableName = std::pair<String, String>; using MockOrderByItem = std::pair<String, bool>; -using MockOrderByItems = std::initializer_list<MockOrderByItem>; +using MockOrderByItemVec = std::vector<MockOrderByItem>; using MockPartitionByItem = std::pair<String, bool>; -using MockPartitionByItems = std::initializer_list<MockPartitionByItem>; -using MockColumnNames = std::initializer_list<String>; -using MockAsts = std::initializer_list<ASTPtr>; +using MockPartitionByItemVec = std::vector<MockPartitionByItem>; +using MockColumnNameVec = std::vector<String>; +using MockAstVec = std::vector<ASTPtr>; using MockWindowFrame = mock::MockWindowFrame; class MockDAGRequestContext; @@ -64,12 +63,10 @@ class DAGRequestBuilder std::shared_ptr<tipb::DAGRequest> build(MockDAGRequestContext & mock_context); - DAGRequestBuilder & mockTable(const String & db, const String & table, const MockColumnInfos & columns); - DAGRequestBuilder & mockTable(const MockTableName & name, const MockColumnInfos & columns); - DAGRequestBuilder & mockTable(const MockTableName & name, const MockColumnInfoList & columns); + DAGRequestBuilder & mockTable(const String & db, const String & table, const MockColumnInfoVec & columns); + DAGRequestBuilder & mockTable(const MockTableName & name, const MockColumnInfoVec & columns); - DAGRequestBuilder & exchangeReceiver(const MockColumnInfos & columns); - DAGRequestBuilder & exchangeReceiver(const MockColumnInfoList & columns); + DAGRequestBuilder & exchangeReceiver(const MockColumnInfoVec & columns, uint64_t fine_grained_shuffle_stream_count = 0); DAGRequestBuilder & filter(ASTPtr filter_expr); @@ -78,35 +75,34 @@ class DAGRequestBuilder DAGRequestBuilder & topN(ASTPtr order_exprs, ASTPtr limit_expr); DAGRequestBuilder & topN(const String & col_name, bool desc, int limit); - DAGRequestBuilder & topN(MockOrderByItems order_by_items, int limit); - DAGRequestBuilder & topN(MockOrderByItems order_by_items, ASTPtr limit_expr); + DAGRequestBuilder & topN(MockOrderByItemVec order_by_items, int limit); + DAGRequestBuilder & topN(MockOrderByItemVec order_by_items, ASTPtr limit_expr); - DAGRequestBuilder & project(const String & col_name); - DAGRequestBuilder & project(MockAsts expr); - DAGRequestBuilder & project(MockColumnNames col_names); + DAGRequestBuilder & project(MockAstVec exprs); + DAGRequestBuilder & project(MockColumnNameVec col_names); DAGRequestBuilder & exchangeSender(tipb::ExchangeType exchange_type); - // Currentlt only support inner join, left join and right join. + // Currently only support inner join, left join and right join. // TODO support more types of join.
- DAGRequestBuilder & join(const DAGRequestBuilder & right, MockAsts exprs); - DAGRequestBuilder & join(const DAGRequestBuilder & right, MockAsts exprs, ASTTableJoin::Kind kind); + DAGRequestBuilder & join(const DAGRequestBuilder & right, MockAstVec exprs); + DAGRequestBuilder & join(const DAGRequestBuilder & right, MockAstVec exprs, ASTTableJoin::Kind kind); // aggregation DAGRequestBuilder & aggregation(ASTPtr agg_func, ASTPtr group_by_expr); - DAGRequestBuilder & aggregation(MockAsts agg_funcs, MockAsts group_by_exprs); + DAGRequestBuilder & aggregation(MockAstVec agg_funcs, MockAstVec group_by_exprs); // window - DAGRequestBuilder & window(ASTPtr window_func, MockOrderByItem order_by, MockPartitionByItem partition_by, MockWindowFrame frame); - DAGRequestBuilder & window(MockAsts window_funcs, MockOrderByItems order_by_list, MockPartitionByItems partition_by_list, MockWindowFrame frame); - DAGRequestBuilder & window(ASTPtr window_func, MockOrderByItems order_by_list, MockPartitionByItems partition_by_list, MockWindowFrame frame); - DAGRequestBuilder & sort(MockOrderByItem order_by, bool is_partial_sort); - DAGRequestBuilder & sort(MockOrderByItems order_by_list, bool is_partial_sort); + DAGRequestBuilder & window(ASTPtr window_func, MockOrderByItem order_by, MockPartitionByItem partition_by, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count = 0); + DAGRequestBuilder & window(MockAstVec window_funcs, MockOrderByItemVec order_by_vec, MockPartitionByItemVec partition_by_vec, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count = 0); + DAGRequestBuilder & window(ASTPtr window_func, MockOrderByItemVec order_by_vec, MockPartitionByItemVec partition_by_vec, MockWindowFrame frame, uint64_t fine_grained_shuffle_stream_count = 0); + DAGRequestBuilder & sort(MockOrderByItem order_by, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count = 0); + DAGRequestBuilder & sort(MockOrderByItemVec order_by_vec, bool is_partial_sort, uint64_t fine_grained_shuffle_stream_count = 0); private: void initDAGRequest(tipb::DAGRequest & dag_request); DAGRequestBuilder & buildAggregation(ASTPtr agg_funcs, ASTPtr group_by_exprs); - DAGRequestBuilder & buildExchangeReceiver(const MockColumnInfos & columns); + DAGRequestBuilder & buildExchangeReceiver(const MockColumnInfoVec & columns, uint64_t fine_grained_shuffle_stream_count = 0); ExecutorPtr root; DAGProperties properties; @@ -130,30 +126,25 @@ class MockDAGRequestContext return DAGRequestBuilder(index); } - void addMockTable(const MockTableName & name, const MockColumnInfoList & columnInfos); - void addMockTable(const String & db, const String & table, const MockColumnInfos & columnInfos); - void addMockTable(const MockTableName & name, const MockColumnInfos & columnInfos); - void addExchangeRelationSchema(String name, const MockColumnInfos & columnInfos); - void addExchangeRelationSchema(String name, const MockColumnInfoList & columnInfos); + void addMockTable(const String & db, const String & table, const MockColumnInfoVec & columnInfos); + void addMockTable(const MockTableName & name, const MockColumnInfoVec & columnInfos); + void addExchangeRelationSchema(String name, const MockColumnInfoVec & columnInfos); void addMockTableColumnData(const String & db, const String & table, ColumnsWithTypeAndName columns); - void addMockTable(const String & db, const String & table, const MockColumnInfoList & columnInfos, ColumnsWithTypeAndName columns); - void addMockTable(const String & db, const String & table, const MockColumnInfos & 
columnInfos, ColumnsWithTypeAndName columns); - void addMockTable(const MockTableName & name, const MockColumnInfoList & columnInfos, ColumnsWithTypeAndName columns); - void addMockTable(const MockTableName & name, const MockColumnInfos & columnInfos, ColumnsWithTypeAndName columns); + void addMockTable(const String & db, const String & table, const MockColumnInfoVec & columnInfos, ColumnsWithTypeAndName columns); + void addMockTable(const MockTableName & name, const MockColumnInfoVec & columnInfos, ColumnsWithTypeAndName columns); void addMockTableColumnData(const MockTableName & name, ColumnsWithTypeAndName columns); void addExchangeReceiverColumnData(const String & name, ColumnsWithTypeAndName columns); - void addExchangeReceiver(const String & name, MockColumnInfos columnInfos, ColumnsWithTypeAndName columns); - void addExchangeReceiver(const String & name, MockColumnInfoList columnInfos, ColumnsWithTypeAndName columns); + void addExchangeReceiver(const String & name, MockColumnInfoVec columnInfos, ColumnsWithTypeAndName columns); std::unordered_map<String, ColumnsWithTypeAndName> & executorIdColumnsMap() { return executor_id_columns_map; } DAGRequestBuilder scan(String db_name, String table_name); - DAGRequestBuilder receive(String exchange_name); + DAGRequestBuilder receive(String exchange_name, uint64_t fine_grained_shuffle_stream_count = 0); private: size_t index; - std::unordered_map<String, MockColumnInfos> mock_tables; - std::unordered_map<String, MockColumnInfos> exchange_schemas; + std::unordered_map<String, MockColumnInfoVec> mock_tables; + std::unordered_map<String, MockColumnInfoVec> exchange_schemas; std::unordered_map<String, ColumnsWithTypeAndName> mock_table_columns; std::unordered_map<String, ColumnsWithTypeAndName> mock_exchange_columns; std::unordered_map<String, ColumnsWithTypeAndName> executor_id_columns_map; /// @@ -168,21 +159,23 @@ class MockDAGRequestContext ASTPtr buildColumn(const String & column_name); ASTPtr buildLiteral(const Field & field); -ASTPtr buildFunction(MockAsts exprs, const String & name); -ASTPtr buildOrderByItemList(MockOrderByItems order_by_items); +ASTPtr buildFunction(MockAstVec exprs, const String & name); +ASTPtr buildOrderByItemVec(MockOrderByItemVec order_by_items); MockWindowFrame buildDefaultRowsFrame(); #define col(name) buildColumn((name)) #define lit(field) buildLiteral((field)) +#define concat(expr1, expr2) makeASTFunction("concat", (expr1), (expr2)) #define eq(expr1, expr2) makeASTFunction("equals", (expr1), (expr2)) #define Not_eq(expr1, expr2) makeASTFunction("notEquals", (expr1), (expr2)) #define lt(expr1, expr2) makeASTFunction("less", (expr1), (expr2)) #define gt(expr1, expr2) makeASTFunction("greater", (expr1), (expr2)) #define And(expr1, expr2) makeASTFunction("and", (expr1), (expr2)) #define Or(expr1, expr2) makeASTFunction("or", (expr1), (expr2)) -#define NOT(expr) makeASTFunction("not", (expr1), (expr2)) -#define Max(expr) makeASTFunction("max", expr) +#define NOT(expr) makeASTFunction("not", (expr)) +#define Max(expr) makeASTFunction("max", (expr)) +#define Sum(expr) makeASTFunction("sum", (expr)) /// Window functions #define RowNumber() makeASTFunction("RowNumber") #define Rank() makeASTFunction("Rank") diff --git a/dbms/src/TestUtils/tests/gtest_mock_executors.cpp b/dbms/src/TestUtils/tests/gtest_mock_executors.cpp index 8bed0f2fc6c..72f0bb505d1 100644 --- a/dbms/src/TestUtils/tests/gtest_mock_executors.cpp +++ b/dbms/src/TestUtils/tests/gtest_mock_executors.cpp @@ -76,7 +76,7 @@ TEST_F(MockDAGRequestTest, Projection) try { auto request = context.scan("test_db", "test_table") - .project("s1") + .project({"s1"}) .build(context); { String expected = "project_1 | {<0, String>}\n" diff --git a/dbms/src/TestUtils/tests/gtest_print_columns.cpp
b/dbms/src/TestUtils/tests/gtest_print_columns.cpp new file mode 100644 index 00000000000..50631fc4f4a --- /dev/null +++ b/dbms/src/TestUtils/tests/gtest_print_columns.cpp @@ -0,0 +1,57 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB +{ +namespace tests +{ + +class PrintColumnsTest : public DB::tests::ExecutorTest +{ +public: + using ColStringType = std::optional<typename TypeTraits<String>::FieldType>; + using ColInt32Type = std::optional<typename TypeTraits<Int32>::FieldType>; + using ColumnWithString = std::vector<ColStringType>; + using ColumnWithInt32 = std::vector<ColInt32Type>; + + void initializeContext() override + { + test_cols.push_back(toNullableVec<Int32>("col1", ColumnWithInt32{36, 34, 32, 27, {}, {}})); + test_cols.push_back(toNullableVec<String>("col2", ColumnWithString{"female", "male", "male", "female", "male", "female"})); + col_len = test_cols[0].column->size(); + } + + ColumnsWithTypeAndName test_cols; + size_t col_len; + const String result1{"col1: (0: Int64_36, 1: Int64_34, 2: Int64_32, 3: Int64_27, 4: NULL, 5: NULL)\ncol2: (0: 'female', 1: 'male', 2: 'male', 3: 'female', 4: 'male', 5: 'female')\n"}; + const String result2{"col1: (0: Int64_36, 1: Int64_34, 2: Int64_32, 3: Int64_27, 4: NULL, 5: NULL)\ncol2: (0: 'female', 1: 'male', 2: 'male', 3: 'female', 4: 'male', 5: 'female')\n"}; + const String result3{"col1: (0: Int64_36)\ncol2: (0: 'female')\n"}; + const String result4{"col1: (1: Int64_34, 2: Int64_32, 3: Int64_27, 4: NULL)\ncol2: (1: 'male', 2: 'male', 3: 'female', 4: 'male')\n"}; +}; + +TEST_F(PrintColumnsTest, SimpleTest) +try +{ + EXPECT_EQ(getColumnsContent(test_cols), result1); + EXPECT_EQ(getColumnsContent(test_cols, 0, col_len - 1), result2); + EXPECT_EQ(getColumnsContent(test_cols, 0, 0), result3); + EXPECT_EQ(getColumnsContent(test_cols, 1, col_len - 2), result4); +} +CATCH + +} // namespace tests +} // namespace DB diff --git a/dbms/src/TiDB/Schema/SchemaBuilder.cpp b/dbms/src/TiDB/Schema/SchemaBuilder.cpp index ae78923fc61..6e4ad10e344 100644 --- a/dbms/src/TiDB/Schema/SchemaBuilder.cpp +++ b/dbms/src/TiDB/Schema/SchemaBuilder.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -320,7 +321,7 @@ inline SchemaChanges detectSchemaChanges( } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyAlterPhysicalTable(DBInfoPtr db_info, TableInfoPtr table_info, ManageableStoragePtr storage) +void SchemaBuilder<Getter, NameMapper>::applyAlterPhysicalTable(const DBInfoPtr & db_info, const TableInfoPtr & table_info, const ManageableStoragePtr & storage) { LOG_FMT_INFO(log, "Altering table {}", name_mapper.debugCanonicalName(*db_info, *table_info)); @@ -394,7 +395,7 @@ void SchemaBuilder<Getter, NameMapper>::applyAlterPhysicalTable(DBInfoPtr db_inf } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyAlterTable(DBInfoPtr db_info, TableID table_id) +void SchemaBuilder<Getter, NameMapper>::applyAlterTable(const DBInfoPtr & db_info, TableID table_id) { auto table_info = getter.getTableInfo(db_info->id, table_id); if (table_info == nullptr) @@ -413,7 +414,7 @@ void SchemaBuilder<Getter, NameMapper>::applyAlterTable(DBInfoPtr db_info, Table } template <typename Getter, typename NameMapper> -void
SchemaBuilder<Getter, NameMapper>::applyAlterLogicalTable(DBInfoPtr db_info, TableInfoPtr table_info, ManageableStoragePtr storage) +void SchemaBuilder<Getter, NameMapper>::applyAlterLogicalTable(const DBInfoPtr & db_info, const TableInfoPtr & table_info, const ManageableStoragePtr & storage) { // Alter logical table first. applyAlterPhysicalTable(db_info, table_info, storage); @@ -542,6 +543,11 @@ void SchemaBuilder<Getter, NameMapper>::applyDiff(const SchemaDiff & diff) applySetTiFlashReplica(db_info, diff.table_id); break; } + case SchemaActionType::SetTiFlashMode: + { + applySetTiFlashMode(db_info, diff.table_id); + break; + } default: { if (diff.type < SchemaActionType::MaxRecognizedType) @@ -569,7 +575,7 @@ void SchemaBuilder<Getter, NameMapper>::applyDiff(const SchemaDiff & diff) } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyPartitionDiff(TiDB::DBInfoPtr db_info, TableID table_id) +void SchemaBuilder<Getter, NameMapper>::applyPartitionDiff(const TiDB::DBInfoPtr & db_info, TableID table_id) { auto table_info = getter.getTableInfo(db_info->id, table_id); if (table_info == nullptr) @@ -593,7 +599,7 @@ void SchemaBuilder<Getter, NameMapper>::applyPartitionDiff(TiDB::DBInfoPtr db_in } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyPartitionDiff(TiDB::DBInfoPtr db_info, TableInfoPtr table_info, ManageableStoragePtr storage) +void SchemaBuilder<Getter, NameMapper>::applyPartitionDiff(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage) { const auto & orig_table_info = storage->getTableInfo(); if (!orig_table_info.isLogicalPartitionTable()) @@ -659,7 +665,7 @@ void SchemaBuilder<Getter, NameMapper>::applyPartitionDiff(TiDB::DBInfoPtr db_in } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyRenameTable(DBInfoPtr new_db_info, TableID table_id) +void SchemaBuilder<Getter, NameMapper>::applyRenameTable(const DBInfoPtr & new_db_info, TableID table_id) { auto new_table_info = getter.getTableInfo(new_db_info->id, table_id); if (new_table_info == nullptr) @@ -679,9 +685,9 @@ void SchemaBuilder<Getter, NameMapper>::applyRenameTable(DBInfoPtr new_db_info, template <typename Getter, typename NameMapper> void SchemaBuilder<Getter, NameMapper>::applyRenameLogicalTable( - DBInfoPtr new_db_info, - TableInfoPtr new_table_info, - ManageableStoragePtr storage) + const DBInfoPtr & new_db_info, + const TableInfoPtr & new_table_info, + const ManageableStoragePtr & storage) { applyRenamePhysicalTable(new_db_info, *new_table_info, storage); @@ -703,9 +709,9 @@ void SchemaBuilder<Getter, NameMapper>::applyRenameLogicalTable( template <typename Getter, typename NameMapper> void SchemaBuilder<Getter, NameMapper>::applyRenamePhysicalTable( - DBInfoPtr new_db_info, - TableInfo & new_table_info, - ManageableStoragePtr storage) + const DBInfoPtr & new_db_info, + const TableInfo & new_table_info, + const ManageableStoragePtr & storage) { const auto old_mapped_db_name = storage->getDatabaseName(); const auto new_mapped_db_name = name_mapper.mapDatabaseName(*new_db_info); @@ -908,7 +914,7 @@ String createDatabaseStmt(Context & context, const DBInfo & db_info, const Schem } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyCreateSchema(TiDB::DBInfoPtr db_info) +void SchemaBuilder<Getter, NameMapper>::applyCreateSchema(const TiDB::DBInfoPtr & db_info) { GET_METRIC(tiflash_schema_internal_ddl_count, type_create_db).Increment(); LOG_FMT_INFO(log, "Creating database {}", name_mapper.debugDatabaseName(*db_info)); @@ -1047,7 +1053,7 @@ String createTableStmt( } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyCreatePhysicalTable(DBInfoPtr db_info, TableInfoPtr table_info) +void SchemaBuilder<Getter, NameMapper>::applyCreatePhysicalTable(const DBInfoPtr & db_info, const TableInfoPtr & table_info) { GET_METRIC(tiflash_schema_internal_ddl_count, type_create_table).Increment(); LOG_FMT_INFO(log, "Creating table {}", name_mapper.debugCanonicalName(*db_info, *table_info)); @@ -1109,7 +1115,7 @@ void
SchemaBuilder<Getter, NameMapper>::applyCreatePhysicalTable(DBInfoPtr db_in } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyCreateTable(TiDB::DBInfoPtr db_info, TableID table_id) +void SchemaBuilder<Getter, NameMapper>::applyCreateTable(const TiDB::DBInfoPtr & db_info, TableID table_id) { auto table_info = getter.getTableInfo(db_info->id, table_id); if (table_info == nullptr) @@ -1123,7 +1129,7 @@ void SchemaBuilder<Getter, NameMapper>::applyCreateTable(TiDB::DBInfoPtr db_info } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyCreateLogicalTable(TiDB::DBInfoPtr db_info, TableInfoPtr table_info) +void SchemaBuilder<Getter, NameMapper>::applyCreateLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info) { if (table_info->isLogicalPartitionTable()) { @@ -1169,7 +1175,7 @@ void SchemaBuilder<Getter, NameMapper>::applyDropPhysicalTable(const String & db } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applyDropTable(DBInfoPtr db_info, TableID table_id) +void SchemaBuilder<Getter, NameMapper>::applyDropTable(const DBInfoPtr & db_info, TableID table_id) { auto & tmt_context = context.getTMTContext(); auto * storage = tmt_context.getStorages().get(table_id).get(); @@ -1193,13 +1199,14 @@ void SchemaBuilder<Getter, NameMapper>::applyDropTable(DBInfoPtr db_info, TableI } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplica(TiDB::DBInfoPtr db_info, TableID table_id) +void SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplica(const TiDB::DBInfoPtr & db_info, TableID table_id) { auto latest_table_info = getter.getTableInfo(db_info->id, table_id); if (unlikely(latest_table_info == nullptr)) { throw TiFlashException(fmt::format("miss table in TiKV : {}", table_id), Errors::DDL::StaleSchema); } + auto & tmt_context = context.getTMTContext(); auto storage = tmt_context.getStorages().get(latest_table_info->id); if (unlikely(storage == nullptr)) @@ -1208,18 +1215,37 @@ void SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplica(TiDB::DBInfoPtr d Errors::DDL::MissingTable); } - auto managed_storage = std::dynamic_pointer_cast<IManageableStorage>(storage); - if (unlikely(!managed_storage)) - throw Exception(fmt::format("{} is not a ManageableStorage", name_mapper.debugCanonicalName(*db_info, *latest_table_info))); + applySetTiFlashReplicaOnLogicalTable(db_info, latest_table_info, storage); +} - applySetTiFlashReplica(db_info, latest_table_info, managed_storage); +template <typename Getter, typename NameMapper> +void SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplicaOnLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage) +{ + applySetTiFlashReplicaOnPhysicalTable(db_info, table_info, storage); + + if (table_info->isLogicalPartitionTable()) + { + auto & tmt_context = context.getTMTContext(); + + for (const auto & part_def : table_info->partition.definitions) + { + auto new_part_table_info = table_info->producePartitionTableInfo(part_def.id, name_mapper); + auto part_storage = tmt_context.getStorages().get(new_part_table_info->id); + if (unlikely(part_storage == nullptr)) + { + throw TiFlashException(fmt::format("miss table in TiFlash : {}", name_mapper.debugCanonicalName(*db_info, *new_part_table_info)), + Errors::DDL::MissingTable); + } + applySetTiFlashReplicaOnPhysicalTable(db_info, new_part_table_info, part_storage); + } + } } template <typename Getter, typename NameMapper> -void SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplica( - TiDB::DBInfoPtr db_info, - TiDB::TableInfoPtr latest_table_info, - ManageableStoragePtr storage) +void SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplicaOnPhysicalTable( + const TiDB::DBInfoPtr & db_info, + const TiDB::TableInfoPtr & latest_table_info, + const ManageableStoragePtr & storage) { if (storage->getTableInfo().replica_info.count == latest_table_info->replica_info.count) return; @@ -1238,6 +1264,75 @@ void
SchemaBuilder<Getter, NameMapper>::applySetTiFlashReplica( LOG_FMT_INFO(log, "Updated replica info for {}", name_mapper.debugCanonicalName(*db_info, table_info)); } + +template <typename Getter, typename NameMapper> +void SchemaBuilder<Getter, NameMapper>::applySetTiFlashMode(const TiDB::DBInfoPtr & db_info, TableID table_id) +{ + auto latest_table_info = getter.getTableInfo(db_info->id, table_id); + + if (unlikely(latest_table_info == nullptr)) + { + throw TiFlashException(fmt::format("miss table in TiKV : {}", table_id), Errors::DDL::StaleSchema); + } + + auto & tmt_context = context.getTMTContext(); + auto storage = tmt_context.getStorages().get(latest_table_info->id); + if (unlikely(storage == nullptr)) + { + throw TiFlashException(fmt::format("miss table in TiFlash : {}", name_mapper.debugCanonicalName(*db_info, *latest_table_info)), + Errors::DDL::MissingTable); + } + + applySetTiFlashModeOnLogicalTable(db_info, latest_table_info, storage); +} + +template <typename Getter, typename NameMapper> +void SchemaBuilder<Getter, NameMapper>::applySetTiFlashModeOnLogicalTable( + const TiDB::DBInfoPtr & db_info, + const TiDB::TableInfoPtr & table_info, + const ManageableStoragePtr & storage) +{ + applySetTiFlashModeOnPhysicalTable(db_info, table_info, storage); + + if (table_info->isLogicalPartitionTable()) + { + auto & tmt_context = context.getTMTContext(); + for (const auto & part_def : table_info->partition.definitions) + { + auto new_part_table_info = table_info->producePartitionTableInfo(part_def.id, name_mapper); + auto part_storage = tmt_context.getStorages().get(new_part_table_info->id); + if (unlikely(part_storage == nullptr)) + { + throw TiFlashException(fmt::format("miss table in TiFlash : {}", name_mapper.debugCanonicalName(*db_info, *new_part_table_info)), + Errors::DDL::MissingTable); + } + applySetTiFlashModeOnPhysicalTable(db_info, new_part_table_info, part_storage); + } + } +} + + +template <typename Getter, typename NameMapper> +void SchemaBuilder<Getter, NameMapper>::applySetTiFlashModeOnPhysicalTable( + const TiDB::DBInfoPtr & db_info, + const TiDB::TableInfoPtr & latest_table_info, + const ManageableStoragePtr & storage) +{ + if (storage->getTableInfo().tiflash_mode == latest_table_info->tiflash_mode) + return; + + TiDB::TableInfo table_info = storage->getTableInfo(); + table_info.tiflash_mode = latest_table_info->tiflash_mode; + AlterCommands commands; + + LOG_FMT_INFO(log, "Updating tiflash mode for {} to {}", name_mapper.debugCanonicalName(*db_info, table_info), TiFlashModeToString(table_info.tiflash_mode)); + + auto alter_lock = storage->lockForAlter(getThreadName()); + storage->alterFromTiDB(alter_lock, commands, name_mapper.mapDatabaseName(*db_info), table_info, name_mapper, context); + LOG_FMT_INFO(log, "Updated tiflash mode for {} to {}", name_mapper.debugCanonicalName(*db_info, table_info), TiFlashModeToString(table_info.tiflash_mode)); +} + + template <typename Getter, typename NameMapper> void SchemaBuilder<Getter, NameMapper>::syncAllSchema() { @@ -1306,7 +1401,9 @@ void SchemaBuilder<Getter, NameMapper>::syncAllSchema() /// Rename if needed. applyRenameLogicalTable(db, table, storage); /// Update replica info if needed. - applySetTiFlashReplica(db, table, storage); + applySetTiFlashReplicaOnLogicalTable(db, table, storage); + /// Update tiflash mode if needed. + applySetTiFlashModeOnLogicalTable(db, table, storage); /// Alter if needed.
applyAlterLogicalTable(db, table, storage); LOG_FMT_DEBUG(log, "Table {} synced during sync all schemas", name_mapper.debugCanonicalName(*db, *table)); diff --git a/dbms/src/TiDB/Schema/SchemaBuilder.h b/dbms/src/TiDB/Schema/SchemaBuilder.h index 8446765f74a..827203a682f 100644 --- a/dbms/src/TiDB/Schema/SchemaBuilder.h +++ b/dbms/src/TiDB/Schema/SchemaBuilder.h @@ -55,39 +55,44 @@ struct SchemaBuilder bool applyCreateSchema(DatabaseID schema_id); - void applyCreateSchema(TiDB::DBInfoPtr db_info); + void applyCreateSchema(const TiDB::DBInfoPtr & db_info); - void applyCreateTable(TiDB::DBInfoPtr db_info, TableID table_id); + void applyCreateTable(const TiDB::DBInfoPtr & db_info, TableID table_id); - void applyCreateLogicalTable(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info); + void applyCreateLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info); - void applyCreatePhysicalTable(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info); + void applyCreatePhysicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info); - void applyDropTable(TiDB::DBInfoPtr db_info, TableID table_id); + void applyDropTable(const TiDB::DBInfoPtr & db_info, TableID table_id); /// Parameter schema_name should be mapped. void applyDropPhysicalTable(const String & db_name, TableID table_id); - void applyPartitionDiff(TiDB::DBInfoPtr db_info, TableID table_id); + void applyPartitionDiff(const TiDB::DBInfoPtr & db_info, TableID table_id); - void applyPartitionDiff(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info, ManageableStoragePtr storage); + void applyPartitionDiff(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); - void applyAlterTable(TiDB::DBInfoPtr db_info, TableID table_id); + void applyAlterTable(const TiDB::DBInfoPtr & db_info, TableID table_id); - void applyAlterLogicalTable(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info, ManageableStoragePtr storage); + void applyAlterLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); - void applyAlterPhysicalTable(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info, ManageableStoragePtr storage); + void applyAlterPhysicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); - void applyRenameTable(TiDB::DBInfoPtr new_db_info, TiDB::TableID table_id); + void applyRenameTable(const TiDB::DBInfoPtr & new_db_info, TiDB::TableID table_id); - void applyRenameLogicalTable(TiDB::DBInfoPtr new_db_info, TiDB::TableInfoPtr new_table_info, ManageableStoragePtr storage); + void applyRenameLogicalTable(const TiDB::DBInfoPtr & new_db_info, const TiDB::TableInfoPtr & new_table_info, const ManageableStoragePtr & storage); - void applyRenamePhysicalTable(TiDB::DBInfoPtr new_db_info, TiDB::TableInfo & new_table_info, ManageableStoragePtr storage); + void applyRenamePhysicalTable(const TiDB::DBInfoPtr & new_db_info, const TiDB::TableInfo & new_table_info, const ManageableStoragePtr & storage); void applyExchangeTablePartition(const SchemaDiff & diff); - void applySetTiFlashReplica(TiDB::DBInfoPtr db_info, TableID table_id); - void applySetTiFlashReplica(TiDB::DBInfoPtr db_info, TiDB::TableInfoPtr table_info, ManageableStoragePtr storage); + void applySetTiFlashReplica(const TiDB::DBInfoPtr & db_info, TableID table_id); + void applySetTiFlashReplicaOnLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & 
table_info, const ManageableStoragePtr & storage); + void applySetTiFlashReplicaOnPhysicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); + + void applySetTiFlashMode(const TiDB::DBInfoPtr & db_info, TableID table_id); + void applySetTiFlashModeOnLogicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); + void applySetTiFlashModeOnPhysicalTable(const TiDB::DBInfoPtr & db_info, const TiDB::TableInfoPtr & table_info, const ManageableStoragePtr & storage); }; } // namespace DB diff --git a/dbms/src/TiDB/Schema/SchemaGetter.cpp b/dbms/src/TiDB/Schema/SchemaGetter.cpp index 7f52f9301b1..6e333d6ba87 100644 --- a/dbms/src/TiDB/Schema/SchemaGetter.cpp +++ b/dbms/src/TiDB/Schema/SchemaGetter.cpp @@ -19,7 +19,6 @@ namespace DB { - namespace ErrorCodes { extern const int SCHEMA_SYNC_ERROR; @@ -188,18 +187,26 @@ Int64 SchemaGetter::getVersion() return std::stoll(ver); } +bool SchemaGetter::checkSchemaDiffExists(Int64 ver) +{ + String key = getSchemaDiffKey(ver); + String data = TxnStructure::get(snap, key); + return !data.empty(); +} + String SchemaGetter::getSchemaDiffKey(Int64 ver) { return std::string(schemaDiffPrefix) + ":" + std::to_string(ver); } -SchemaDiff SchemaGetter::getSchemaDiff(Int64 ver) +std::optional<SchemaDiff> SchemaGetter::getSchemaDiff(Int64 ver) { String key = getSchemaDiffKey(ver); String data = TxnStructure::get(snap, key); if (data.empty()) { - throw TiFlashException("cannot find schema diff for version: " + std::to_string(ver), Errors::Table::SyncError); + LOG_FMT_WARNING(log, "The schema diff for version {}, key {} is empty.", ver, key); + return std::nullopt; } SchemaDiff diff; diff.deserialize(data); diff --git a/dbms/src/TiDB/Schema/SchemaGetter.h b/dbms/src/TiDB/Schema/SchemaGetter.h index 02d2f7a7c88..72fd00678f7 100644 --- a/dbms/src/TiDB/Schema/SchemaGetter.h +++ b/dbms/src/TiDB/Schema/SchemaGetter.h @@ -26,6 +26,8 @@ #include +#include <optional> + namespace DB { // The enum results are completely the same as the DDL Action listed in the "parser/model/ddl.go" of TiDB codebase, which must be keeping in sync. @@ -92,11 +94,14 @@ enum class SchemaActionType : Int8 AlterTableStatsOptions = 58, AlterNoCacheTable = 59, CreateTables = 60, + ActionMultiSchemaChange = 61, + SetTiFlashMode = 62, + // If we support a new type from TiDB, // MaxRecognizedType also needs to be changed. // It should always be equal to the maximum supported type + 1 - MaxRecognizedType = 61, + MaxRecognizedType = 63, }; struct AffectedOption @@ -138,7 +143,9 @@ struct SchemaGetter Int64 getVersion(); - SchemaDiff getSchemaDiff(Int64 ver); + bool checkSchemaDiffExists(Int64 ver); + + std::optional<SchemaDiff> getSchemaDiff(Int64 ver); static String getSchemaDiffKey(Int64 ver); diff --git a/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h b/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h index 4fdba195acb..a23aeab139f 100644 --- a/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h +++ b/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h @@ -106,21 +106,31 @@ struct TiDBSchemaSyncer : public SchemaSyncer Stopwatch watch; SCOPE_EXIT({ GET_METRIC(tiflash_schema_apply_duration_seconds).Observe(watch.elapsedSeconds()); }); - LOG_FMT_INFO(log, "start to sync schemas. current version is: {} and try to sync schema version to: {}", cur_version, version); + LOG_FMT_INFO(log, "Start to sync schemas. Current version is: {} and try to sync schema version to: {}", cur_version, version); // Show whether the schema mutex is held for a long time or not.
GET_METRIC(tiflash_schema_applying).Set(1.0); SCOPE_EXIT({ GET_METRIC(tiflash_schema_applying).Set(0.0); }); GET_METRIC(tiflash_schema_apply_count, type_diff).Increment(); - if (!tryLoadSchemaDiffs(getter, version, context)) + // After the concurrent-DDL feature, TiDB does `update schema version` before `set schema diff`, and they are done in separate transactions. + // So TiFlash may see a schema version X but no schema diff X, meaning that the transaction of schema diff X has not been committed or has + // been aborted. + // However, TiDB makes sure that if we get a schema version X, then the schema diff X-1 must exist; otherwise, the transaction of schema diff + // X-1 was aborted and we can safely ignore it. + // Since TiDB cannot make sure the schema diff of the latest schema version X is not empty, in this situation we should set `cur_version` + // to X-1 and try to fetch the schema diff X next time. + Int64 version_after_load_diff = 0; + if (version_after_load_diff = tryLoadSchemaDiffs(getter, version, context); version_after_load_diff == -1) { GET_METRIC(tiflash_schema_apply_count, type_full).Increment(); loadAllSchema(getter, version, context); + // After loadAllSchema, we need to update `version_after_load_diff` depending on whether the last schema diff exists. + version_after_load_diff = getter.checkSchemaDiffExists(version) ? version : version - 1; } - cur_version = version; + cur_version = version_after_load_diff; GET_METRIC(tiflash_schema_version).Set(cur_version); - LOG_FMT_INFO(log, "end sync schema, version has been updated to {}", cur_version); + LOG_FMT_INFO(log, "End sync schema, version has been updated to {}{}", cur_version, cur_version == version ? "" : " (latest diff is empty)"); return true; } @@ -144,30 +154,60 @@ struct TiDBSchemaSyncer : public SchemaSyncer return it->second; } - bool tryLoadSchemaDiffs(Getter & getter, Int64 version, Context & context) + // Return values: + // - if the latest schema diff is not empty, return latest_version + // - if the latest schema diff is empty, return latest_version - 1 + // - if an error happened, return -1 + Int64 tryLoadSchemaDiffs(Getter & getter, Int64 latest_version, Context & context) { - if (isTooOldSchema(cur_version, version)) + if (isTooOldSchema(cur_version, latest_version)) { - return false; + return -1; } - LOG_FMT_DEBUG(log, "try load schema diffs."); + LOG_FMT_DEBUG(log, "Try load schema diffs."); - SchemaBuilder<Getter, NameMapper> builder(getter, context, databases, version); + SchemaBuilder<Getter, NameMapper> builder(getter, context, databases, latest_version); Int64 used_version = cur_version; - std::vector<SchemaDiff> diffs; - while (used_version < version) + // First fetch all schema diffs from `cur_version` to `latest_version`. Only apply the schema diffs if all of them + // were fetched without any exception.
+ std::vector<std::optional<SchemaDiff>> diffs; + while (used_version < latest_version) { used_version++; diffs.push_back(getter.getSchemaDiff(used_version)); } - LOG_FMT_DEBUG(log, "end load schema diffs with total {} entries.", diffs.size()); + LOG_FMT_DEBUG(log, "End load schema diffs with total {} entries.", diffs.size()); + try { - for (const auto & diff : diffs) + for (size_t diff_index = 0; diff_index < diffs.size(); ++diff_index) { - builder.applyDiff(diff); + const auto & schema_diff = diffs[diff_index]; + + if (!schema_diff) + { + // If the schema diff of `latest_version` is empty, we do not apply up to `latest_version`, + // but only up to `latest_version - 1`. + // If a schema diff in [`cur_version`, `latest_version - 1`] is empty, we simply skip it. + // + // example: + // - `cur_version` is 1, `latest_version` is 10 + // - The schema diffs of schema versions [2,4,6] are empty, then we just skip them. + // - The schema diff of schema version 10 is empty, then we only apply up to version 9. + if (diff_index != diffs.size() - 1) + { + LOG_FMT_WARNING(log, "Skip the schema diff from version {}. ", cur_version + diff_index + 1); + continue; + } + + // if diff_index == diffs.size() - 1, return used_version - 1; + return used_version - 1; + } + + builder.applyDiff(*schema_diff); } } catch (TiFlashException & e) @@ -177,7 +217,7 @@ struct TiDBSchemaSyncer : public SchemaSyncer GET_METRIC(tiflash_schema_apply_count, type_failed).Increment(); } LOG_FMT_WARNING(log, "apply diff meets exception : {} \n stack is {}", e.displayText(), e.getStackTrace().toString()); - return false; + return -1; } catch (Exception & e) { @@ -187,21 +227,22 @@ struct TiDBSchemaSyncer : public SchemaSyncer } GET_METRIC(tiflash_schema_apply_count, type_failed).Increment(); LOG_FMT_WARNING(log, "apply diff meets exception : {} \n stack is {}", e.displayText(), e.getStackTrace().toString()); - return false; + return -1; } catch (Poco::Exception & e) { GET_METRIC(tiflash_schema_apply_count, type_failed).Increment(); LOG_FMT_WARNING(log, "apply diff meets exception : {}", e.displayText()); - return false; + return -1; } catch (std::exception & e) { GET_METRIC(tiflash_schema_apply_count, type_failed).Increment(); LOG_FMT_WARNING(log, "apply diff meets exception : {}", e.what()); - return false; + return -1; } - return true; + + return used_version; } void loadAllSchema(Getter & getter, Int64 version, Context & context) diff --git a/dbms/src/WindowFunctions/tests/gtest_window_functions.cpp b/dbms/src/WindowFunctions/tests/gtest_window_functions.cpp index 3addf73a642..06253cac66e 100644 --- a/dbms/src/WindowFunctions/tests/gtest_window_functions.cpp +++ b/dbms/src/WindowFunctions/tests/gtest_window_functions.cpp @@ -69,11 +69,10 @@ try .sort({{"partition", false}, {"order", false}, {"partition", false}, {"order", false}}, true) .window(RowNumber(), {"order", false}, {"partition", false}, buildDefaultRowsFrame()) .build(context); - executeStreams( - request, - {toNullableVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), - toNullableVec("order", {1, 1, 2, 2, 1, 1, 2, 2}), - toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})}); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {1, 1, 2, 2, 1, 1, 2, 2}), + toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})})); // null input executeStreamsWithSingleSource( request, {}); // nullable - executeStreamsWithSingleSource( - request, -
{toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), {toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2})}}, - {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2}), toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})}); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), {toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2})}}), + createColumns({toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2}), toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})})); // string - sql : select *, row_number() over w1 from test2 window w1 as (partition by partition_string order by order_string) request = context @@ -94,20 +91,18 @@ try .window(RowNumber(), {"order", false}, {"partition", false}, buildDefaultRowsFrame()) .build(context); - executeStreams( - request, - {toNullableVec("partition", {"apple", "apple", "apple", "apple", "banana", "banana", "banana", "banana"}), - toNullableVec("order", {"apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"}), - toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})}); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition", {"apple", "apple", "apple", "apple", "banana", "banana", "banana", "banana"}), + toNullableVec("order", {"apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"}), + toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})})); // nullable - executeStreamsWithSingleSource( - request, - {toNullableVec("partition", {"banana", "banana", "banana", "banana", {}, "apple", "apple", "apple", "apple"}), - toNullableVec("order", {"apple", "apple", "banana", "banana", {}, "apple", "apple", "banana", "banana"})}, - {toNullableVec("partition", {{}, "apple", "apple", "apple", "apple", "banana", "banana", "banana", "banana"}), - toNullableVec("order", {{}, "apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"}), - toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})}); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableVec("partition", {"banana", "banana", "banana", "banana", {}, "apple", "apple", "apple", "apple"}), + toNullableVec("order", {"apple", "apple", "banana", "banana", {}, "apple", "apple", "banana", "banana"})}), + createColumns({toNullableVec("partition", {{}, "apple", "apple", "apple", "apple", "banana", "banana", "banana", "banana"}), + toNullableVec("order", {{}, "apple", "apple", "banana", "banana", "apple", "apple", "banana", "banana"}), + toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})})); // float64 - sql : select *, row_number() over w1 from test3 window w1 as (partition by partition_float order by order_float64) request = context @@ -116,20 +111,18 @@ try .window(RowNumber(), {"order", false}, {"partition", false}, buildDefaultRowsFrame()) .build(context); - executeStreams( - request, - {toNullableVec("partition", {1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), - toNullableVec("order", {1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00}), - toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})}); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition", {1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), + toNullableVec("order", {1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00}), + toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})})); // nullable - 
executeStreamsWithSingleSource( - request, - {toNullableVec("partition", {{}, 1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), - toNullableVec("order", {{}, 1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00})}, - {toNullableVec("partition", {{}, 1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), - toNullableVec("order", {{}, 1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00}), - toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})}); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableVec("partition", {{}, 1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), + toNullableVec("order", {{}, 1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00})}), + createColumns({toNullableVec("partition", {{}, 1.00, 1.00, 1.00, 1.00, 2.00, 2.00, 2.00, 2.00}), + toNullableVec("order", {{}, 1.00, 1.00, 2.00, 2.00, 1.00, 1.00, 2.00, 2.00}), + toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})})); // datetime - select *, row_number() over w1 from test4 window w1 as (partition by partition_datetime order by order_datetime); request = context @@ -137,22 +130,20 @@ try .sort({{"partition", false}, {"order", false}, {"partition", false}, {"order", false}}, true) .window(RowNumber(), {"order", false}, {"partition", false}, buildDefaultRowsFrame()) .build(context); - executeStreamsWithSingleSource( - request, - {toNullableDatetimeVec("partition", {"20220101010102", "20220101010102", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010101", "20220101010101"}, 0), - toDatetimeVec("order", {"20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0)}, - {toNullableDatetimeVec("partition", {"20220101010101", "20220101010101", "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010102", "20220101010102"}, 0), - toNullableDatetimeVec("order", {"20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0), - toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})}); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableDatetimeVec("partition", {"20220101010102", "20220101010102", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010101", "20220101010101"}, 0), + toDatetimeVec("order", {"20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0)}), + createColumns({toNullableDatetimeVec("partition", {"20220101010101", "20220101010101", "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010102", "20220101010102"}, 0), + toNullableDatetimeVec("order", {"20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0), + toNullableVec("row_number", {1, 2, 3, 4, 1, 2, 3, 4})})); // nullable - executeStreamsWithSingleSource( - request, - {toNullableDatetimeVec("partition", {"20220101010102", {}, "20220101010102", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010101", "20220101010101"}, 0), - toNullableDatetimeVec("order", {"20220101010101", {}, "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0)}, - {toNullableDatetimeVec("partition", {{}, "20220101010101", "20220101010101", "20220101010101", "20220101010101", 
"20220101010102", "20220101010102", "20220101010102", "20220101010102"}, 0), - toNullableDatetimeVec("order", {{}, "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0), - toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})}); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableDatetimeVec("partition", {"20220101010102", {}, "20220101010102", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010101", "20220101010101"}, 0), + toNullableDatetimeVec("order", {"20220101010101", {}, "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0)}), + createColumns({toNullableDatetimeVec("partition", {{}, "20220101010101", "20220101010101", "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010102", "20220101010102"}, 0), + toNullableDatetimeVec("order", {{}, "20220101010101", "20220101010101", "20220101010102", "20220101010102", "20220101010101", "20220101010101", "20220101010102", "20220101010102"}, 0), + toNullableVec("row_number", {1, 1, 2, 3, 4, 1, 2, 3, 4})})); // 2 partiton key and 2 order key // sql : select *, row_number() over w1 from test6 window w1 as (partition by partition_int1, partition_int2 order by order_int1,order_int2) @@ -162,41 +153,38 @@ try .window(RowNumber(), {{"order1", false}, {"order2", false}}, {{"partition1", false}, {"partition2", false}}, buildDefaultRowsFrame()) .build(context); - executeStreams( - request, - {toNullableVec("partition1", {1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2}), - toNullableVec("partition2", {1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2}), - toNullableVec("order1", {1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2}), - toNullableVec("order2", {1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2}), - toNullableVec("row_number", {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3})}); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition1", {1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2}), + toNullableVec("partition2", {1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2}), + toNullableVec("order1", {1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2}), + toNullableVec("order2", {1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2}), + toNullableVec("row_number", {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3})})); /***** rank, dense_rank *****/ request = context.scan("test_db", "test_table_for_rank").sort({{"partition", false}, {"order", false}}, true).window({Rank(), DenseRank()}, {{"order", false}}, {{"partition", false}}, MockWindowFrame{}).build(context); - executeStreams( - request, - {toNullableVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), - toNullableVec("order", {1, 1, 2, 2, 1, 1, 2, 2}), - toNullableVec("rank", {1, 1, 3, 3, 1, 1, 3, 3}), - toNullableVec("dense_rank", {1, 1, 2, 2, 1, 1, 2, 2})}); + ASSERT_COLUMNS_EQ_R(executeStreams(request), + createColumns({toNullableVec("partition", {1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {1, 1, 2, 2, 1, 1, 2, 2}), + toNullableVec("rank", {1, 1, 3, 3, 1, 1, 3, 3}), + toNullableVec("dense_rank", {1, 1, 2, 2, 1, 1, 2, 2})})); // nullable - executeStreamsWithSingleSource( - request, - {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), - toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2})}, - {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), - toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2}), - toNullableVec("rank", {1, 1, 1, 3, 3, 1, 1, 3, 3}), - toNullableVec("dense_rank", {1, 1, 1, 2, 2, 1, 1, 2, 2})}); - - 
executeStreamsWithSingleSource( - request, - {toNullableVec("partition", {{}, {}, 1, 1, 1, 1, 2, 2, 2, 2}), - toNullableVec("order", {{}, 1, 1, 1, 2, 2, 1, 1, 2, 2})}, - {toNullableVec("partition", {{}, {}, 1, 1, 1, 1, 2, 2, 2, 2}), - toNullableVec("order", {{}, 1, 1, 1, 2, 2, 1, 1, 2, 2}), - toNullableVec("rank", {1, 2, 1, 1, 3, 3, 1, 1, 3, 3}), - toNullableVec("dense_rank", {1, 2, 1, 1, 2, 2, 1, 1, 2, 2})}); + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource(request, + {toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2})}), + createColumns({toNullableVec("partition", {{}, 1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {{}, 1, 1, 2, 2, 1, 1, 2, 2}), + toNullableVec("rank", {1, 1, 1, 3, 3, 1, 1, 3, 3}), + toNullableVec("dense_rank", {1, 1, 1, 2, 2, 1, 1, 2, 2})})); + + ASSERT_COLUMNS_EQ_R(executeStreamsWithSingleSource( + request, + {toNullableVec("partition", {{}, {}, 1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {{}, 1, 1, 1, 2, 2, 1, 1, 2, 2})}), + createColumns({toNullableVec("partition", {{}, {}, 1, 1, 1, 1, 2, 2, 2, 2}), + toNullableVec("order", {{}, 1, 1, 1, 2, 2, 1, 1, 2, 2}), + toNullableVec("rank", {1, 2, 1, 1, 3, 3, 1, 1, 3, 3}), + toNullableVec("dense_rank", {1, 2, 1, 1, 2, 2, 1, 1, 2, 2})})); } CATCH diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 5fd25c5d238..2bedb312d07 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -198,3 +198,7 @@ if (ARCH_AMD64) src/crc64_sse2_asimd.cpp APPEND COMPILE_FLAGS "-mpclmul") endif() + +if (ARCH_AARCH64 AND ARCH_LINUX) + target_link_libraries (common PUBLIC tiflash-aarch64-string tiflash-aarch64-math) +endif() diff --git a/metrics/grafana/tiflash_summary.json b/metrics/grafana/tiflash_summary.json index f899a47ed10..0d72f950add 100644 --- a/metrics/grafana/tiflash_summary.json +++ b/metrics/grafana/tiflash_summary.json @@ -52,7 +52,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1653635389238, + "iteration": 1654217728945, "links": [], "panels": [ { @@ -542,7 +542,14 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/limit/", + "fill": 0, + "nullPointMode": "null", + "color": "#C4162A" + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, @@ -633,6 +640,13 @@ "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "K" + }, + { + "expr": "sum(tiflash_system_current_metric_MemoryCapacity{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "legendFormat": "limit-{{instance}}", + "exemplar": true, + "refId": "L", + "hide": false } ], "thresholds": [], @@ -701,15 +715,15 @@ "hiddenSeries": false, "id": 51, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, + "current": true, "max": false, "min": false, - "rightSide": false, + "rightSide": true, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -728,6 +742,12 @@ "alias": "total", "fill": 0, "lines": false + }, + { + "alias": "/limit/", + "fill": 0, + "nullPointMode": "null", + "color": "#C4162A" } ], "spaceLength": 10, @@ -742,6 +762,13 @@ "legendFormat": "{{instance}}", "refId": "A", "step": 40 + }, + { + "expr": "sum(tiflash_system_current_metric_LogicalCPUCores{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "legendFormat": "limit-{{instance}}", + 
"exemplar": true, + "refId": "B", + "intervalFactor": 1 } ], "thresholds": [], @@ -3878,7 +3905,7 @@ "fill": 0, "fillGradient": 0, "gridPos": { - "h": 8, + "h": 5, "w": 12, "x": 0, "y": 21 @@ -3893,6 +3920,7 @@ "min": false, "rightSide": true, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -3908,38 +3936,27 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/(delta_merge)|(seg_)/", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, - "steppedLine": false, + "steppedLine": true, "targets": [ { - "expr": "sum(rate(tiflash_storage_subtask_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!~\"delta_merge|delta_merge_fg|delta_merge_bg_gc|seg_merge|seg_split|seg_split_fg\"}[1m])) by (type)", + "exemplar": true, + "expr": "sum(rate(tiflash_storage_subtask_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (type)", "format": "time_series", "hide": false, - "intervalFactor": 1, + "interval": "", + "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" - }, - { - "expr": "sum(increase(tiflash_storage_subtask_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"delta_merge|delta_merge_fg|delta_merge_bg_gc|seg_merge|seg_split|seg_split_fg\"}[1m])) by (type)", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Internal Tasks OPS", + "title": "Small Internal Tasks OPS", "tooltip": { "shared": true, "sort": 0, @@ -3955,7 +3972,7 @@ }, "yaxes": [ { - "decimals": null, + "decimals": 1, "format": "ops", "label": null, "logBase": 1, @@ -3969,7 +3986,7 @@ "logBase": 1, "max": null, "min": "0", - "show": true + "show": false } ], "yaxis": { @@ -3988,10 +4005,10 @@ "defaults": {}, "overrides": [] }, - "fill": 1, + "fill": 0, "fillGradient": 0, "gridPos": { - "h": 8, + "h": 5, "w": 12, "x": 12, "y": 21 @@ -4023,58 +4040,233 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "alias": "/^.*-delta_merge/", - "yaxis": 2 + "exemplar": false, + "expr": "histogram_quantile(1, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (le,type))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "max-{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Small Internal Tasks Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "alias": "/^.*-seg_split/", - "yaxis": 2 + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": false } ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Total number of storage's internal sub tasks", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 26 + }, + "hiddenSeries": false, + "id": 130, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], "spaceLength": 10, "stack": false, - "steppedLine": false, + "steppedLine": true, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,type))", + "exemplar": true, + "expr": "sum(rate(tiflash_storage_subtask_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (type)", "format": "time_series", "hide": false, - "intervalFactor": 1, - "legendFormat": "max-{{type}}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", "refId": "A" - }, + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Large Internal Tasks OPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ { - "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,type))", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "99-{{type}}", - "refId": "B" + "decimals": 1, + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "expr": "histogram_quantile(0.95, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,type))", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "95-{{type}}", - "refId": "C" - }, + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Duration of storage's internal sub tasks", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 26 + }, + "hiddenSeries": false, + "id": 131, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "expr": "histogram_quantile(0.80, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,type))", + "exemplar": true, + "expr": "histogram_quantile(1, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"(delta_merge|seg_merge|seg_split).*\"}[$__rate_interval])) by (le,type))", "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "80-{{type}}", - "refId": "D" + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "max-{{type}}", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Internal Tasks Duration", + "title": "Large Internal Tasks Duration", "tooltip": { "shared": true, "sort": 0, @@ -4090,6 +4282,7 @@ }, "yaxes": [ { + "decimals": 1, "format": "s", "label": null, "logBase": 1, @@ -4103,7 +4296,7 @@ "logBase": 1, "max": null, "min": "0", - "show": true + "show": false } ], "yaxis": { @@ -4128,7 +4321,7 @@ "h": 8, "w": 12, "x": 0, - "y": 29 + "y": 31 }, "hiddenSeries": false, "id": 43, @@ -4234,7 +4427,7 @@ "h": 8, "w": 12, "x": 12, - "y": 29 + "y": 31 }, "heatmap": {}, "hideZeroBuckets": true, @@ -4297,7 +4490,7 @@ "h": 8, "w": 12, "x": 0, - "y": 37 + "y": 39 }, "hiddenSeries": false, "id": 46, @@ -4420,7 +4613,7 @@ "h": 8, "w": 12, "x": 12, - "y": 37 + "y": 39 }, "hiddenSeries": false, "id": 47, @@ -4544,7 +4737,7 @@ "h": 8, "w": 12, "x": 0, - "y": 45 + "y": 47 }, "height": "", "hiddenSeries": false, @@ -4674,7 +4867,7 @@ "h": 8, "w": 12, "x": 12, - "y": 45 + "y": 47 }, "height": "", "hiddenSeries": false, @@ -4802,7 +4995,7 @@ "h": 8, "w": 12, "x": 0, - "y": 53 + "y": 55 }, "hiddenSeries": false, "id": 88, @@ -5002,7 +5195,7 @@ "h": 8, "w": 12, "x": 12, - "y": 53 + "y": 55 }, "hiddenSeries": false, "id": 67, @@ -5116,7 +5309,7 @@ "h": 8, "w": 12, "x": 0, - "y": 61 + "y": 63 }, "hiddenSeries": false, "id": 84, @@ -5216,7 +5409,7 @@ "h": 8, "w": 12, "x": 12, - "y": 61 + "y": 63 }, "hiddenSeries": false, "id": 86, @@ -8183,5 +8376,5 @@ "timezone": "", "title": "Test-Cluster-TiFlash-Summary", "uid": "SVbh2xUWk", - "version": 2 -} + "version": 1 +} \ No newline at end of file diff --git a/release-centos7-llvm/Makefile b/release-centos7-llvm/Makefile index 1b15df7ddc3..9c1bba42a53 100644 --- a/release-centos7-llvm/Makefile +++ b/release-centos7-llvm/Makefile @@ -23,6 +23,10 @@ image_tiflash_llvm_base_aarch64: build_tiflash_release_amd64: docker run --rm -v $(realpath ..):/build/tics hub.pingcap.net/tiflash/tiflash-llvm-base:amd64 /build/tics/release-centos7-llvm/scripts/build-release.sh +# Add build_tiflash_debug_amd64 target to enable FailPoints on x86. Since outputs are the same as release version, no new package targets added. 
+build_tiflash_debug_amd64:
+	docker run --rm -v $(realpath ..):/build/tics hub.pingcap.net/tiflash/tiflash-llvm-base:amd64 /build/tics/release-centos7-llvm/scripts/build-debug.sh
+
 build_tiflash_ci_amd64:
 	docker run --rm -v $(realpath ..):/build/tics hub.pingcap.net/tiflash/tiflash-llvm-base:amd64 /build/tics/release-centos7-llvm/scripts/build-tiflash-ci.sh
diff --git a/release-centos7-llvm/scripts/build-debug.sh b/release-centos7-llvm/scripts/build-debug.sh
new file mode 100755
index 00000000000..59dc9b86a54
--- /dev/null
+++ b/release-centos7-llvm/scripts/build-debug.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright 2022 PingCAP, Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+CMAKE_PREFIX_PATH=$1
+
+set -ueox pipefail
+
+SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
+
+${SCRIPTPATH}/build-tiflash-release.sh "DEBUG" "${CMAKE_PREFIX_PATH}"
diff --git a/release-centos7-llvm/scripts/build-tiflash-release.sh b/release-centos7-llvm/scripts/build-tiflash-release.sh
index 42993b51afe..01ca00e8706 100755
--- a/release-centos7-llvm/scripts/build-tiflash-release.sh
+++ b/release-centos7-llvm/scripts/build-tiflash-release.sh
@@ -47,7 +47,13 @@ ENABLE_PCH=${ENABLE_PCH:-ON}
 
 INSTALL_DIR="${SRCPATH}/release-centos7-llvm/tiflash"
 rm -rf ${INSTALL_DIR} && mkdir -p ${INSTALL_DIR}
-BUILD_DIR="${SRCPATH}/release-centos7-llvm/build-release"
+if [ "$CMAKE_BUILD_TYPE" == "RELWITHDEBINFO" ]; then
+  BUILD_DIR="$SRCPATH/release-centos7-llvm/build-release"
+  ENABLE_FAILPOINTS="OFF"
+else
+  BUILD_DIR="$SRCPATH/release-centos7-llvm/build-debug"
+  ENABLE_FAILPOINTS="ON"
+fi
 rm -rf ${BUILD_DIR} && mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
 
 cmake -S "${SRCPATH}" \
@@ -55,6 +61,7 @@ cmake -S "${SRCPATH}" \
     -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
     -DENABLE_TESTING=OFF \
     -DENABLE_TESTS=OFF \
+    -DENABLE_FAILPOINTS=${ENABLE_FAILPOINTS} \
     -Wno-dev \
     -DUSE_CCACHE=OFF \
     -DRUN_HAVE_STD_REGEX=0 \
diff --git a/tests/fullstack-test-dt/clustered_index/ddl.test b/tests/fullstack-test-dt/clustered_index/ddl.test
index 8abe450c11a..6c4925c9619 100644
--- a/tests/fullstack-test-dt/clustered_index/ddl.test
+++ b/tests/fullstack-test-dt/clustered_index/ddl.test
@@ -66,3 +66,89 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select * from test.t_2
 
 mysql> drop table test.t_1;
 mysql> drop table test.t_2;
+
+### about issue 5154: check whether add column/drop column affects clustered index decoding
+### drop a column between two columns that are clustered index columns
+
+mysql> drop table if exists test.t_3;
+mysql> create table test.t_3 (A int, B varchar(20), C int, D int, PRIMARY KEY(A,C) CLUSTERED);
+mysql> insert into test.t_3 values (1,'1',1,1),(2,'2',2,2);
+
+mysql> alter table test.t_3 set tiflash replica 1;
+
+func> wait_table test t_3
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_3;
++---+---+---+---+
+| A | B | C | D |
++---+---+---+---+
+| 1 | 1 | 1 | 1 |
+| 2 | 2 | 2 | 2 |
++---+---+---+---+
+
+mysql> alter table test.t_3 drop column B;
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_3;
++---+---+---+
+| A | C | D |
++---+---+---+
+| 1 | 1 | 1 |
+| 2 | 2 | 2 |
++---+---+---+
+
+# insert some rows
+mysql> insert into test.t_3 values (3,3,3),(4,4,4);
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_3;
++---+---+---+
+| A | C | D |
++---+---+---+
+| 1 | 1 | 1 |
+| 2 | 2 | 2 |
+| 3 | 3 | 3 |
+| 4 | 4 | 4 |
++---+---+---+
+
+mysql> drop table test.t_3;
+
+### add a column between two columns that are clustered index columns
+mysql> drop table if exists test.t_4;
+mysql> create table test.t_4 (A int, B varchar(20), C int, D int, PRIMARY KEY(A,C) CLUSTERED);
+
+mysql> insert into test.t_4 values (1,'1',1,1),(2,'2',2,2);
+
+mysql> alter table test.t_4 set tiflash replica 1;
+
+func> wait_table test t_4
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_4;
++---+---+---+---+
+| A | B | C | D |
++---+---+---+---+
+| 1 | 1 | 1 | 1 |
+| 2 | 2 | 2 | 2 |
++---+---+---+---+
+
+mysql> alter table test.t_4 add column E int after B;
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_4;
++---+---+------+---+---+
+| A | B | E    | C | D |
++---+---+------+---+---+
+| 1 | 1 | NULL | 1 | 1 |
+| 2 | 2 | NULL | 2 | 2 |
++---+---+------+---+---+
+
+mysql> insert into test.t_4 values (3,'3',3,3,3),(4,'4',4,4,4);
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_4;
++---+---+------+------+------+
+| A | B | E    | C    | D    |
++---+---+------+------+------+
+| 1 | 1 | NULL | 1    | 1    |
+| 2 | 2 | NULL | 2    | 2    |
+| 3 | 3 | 3    | 3    | 3    |
+| 4 | 4 | 4    | 4    | 4    |
++---+---+------+------+------+
+
+mysql> drop table test.t_4;
\ No newline at end of file
diff --git a/tests/fullstack-test/expr/bitshift_operator.test b/tests/fullstack-test/expr/bitshift_operator.test
new file mode 100644
index 00000000000..0d55a1b56a9
--- /dev/null
+++ b/tests/fullstack-test/expr/bitshift_operator.test
@@ -0,0 +1,43 @@
+# Copyright 2022 PingCAP, Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
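+
+# Exercise the right-shift (>>) operator pushed down to TiFlash. MySQL evaluates bit
+# shifts on unsigned 64-bit integers, so -1 >> 0 yields 18446744073709551615 and any
+# shift count >= 64 yields 0; the expectations below assume TiFlash matches that behavior.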
+ +mysql> drop table if exists test.t; +mysql> create table test.t (a int); +mysql> alter table test.t set tiflash replica 1; +mysql> insert into test.t values(-1); + +func> wait_table test t + +mysql> set tidb_enforce_mpp=1; set @@session.tidb_isolation_read_engines = "tiflash"; select a>>0 as v1, a>>64 as v2, a>>10 as v3 from test.t; ++----------------------+------+-------------------+ +| v1 | v2 | v3 | ++----------------------+------+-------------------+ +| 18446744073709551615 | 0 | 18014398509481983 | ++----------------------+------+-------------------+ + +mysql> set tidb_enforce_mpp=1; set @@session.tidb_isolation_read_engines = "tiflash"; select a from test.t where a>>100000=0; ++------+ +| a | ++------+ +| -1 | ++------+ + +mysql> set tidb_enforce_mpp=1; set @@session.tidb_isolation_read_engines = "tiflash"; select a from test.t where a>>63=1; ++------+ +| a | ++------+ +| -1 | ++------+ + +mysql> drop table if exists test.t diff --git a/tests/fullstack-test/expr/duration_pushdown.test b/tests/fullstack-test/expr/duration_pushdown.test index 63106fa1788..442a708a802 100644 --- a/tests/fullstack-test/expr/duration_pushdown.test +++ b/tests/fullstack-test/expr/duration_pushdown.test @@ -106,6 +106,14 @@ mysql> use test; set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflas # | 123500 | # +----------------+ +mysql> use test; set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select time_to_sec(a) from t; ++----------------+ +| time_to_sec(a) | ++----------------+ +| 2520610 | +| -2520610 | ++----------------+ + mysql> drop table if exists test.time_test; mysql> create table test.time_test(id int(11),v1 time(3) not null, v2 time(3)); diff --git a/tests/fullstack-test/expr/format.test b/tests/fullstack-test/expr/format.test index 8cea75d6914..719e30c974d 100644 --- a/tests/fullstack-test/expr/format.test +++ b/tests/fullstack-test/expr/format.test @@ -44,3 +44,52 @@ int_val 1,234.000 mysql> drop table if exists test.t + +mysql> create table test.t(id int, value decimal(65,4)) +mysql> alter table test.t set tiflash replica 1 +mysql> insert into test.t values(1,9999999999999999999999999999999999999999999999999999999999999.9999) + +func> wait_table test t + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,-3) as result from test.t +result +10,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,0) as result from test.t +result +10,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,3) as result from test.t +result +10,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000.000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,10) as result from test.t +result +9,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999.9999000000 + + +mysql> drop table if exists test.t + +mysql> create table test.t(id int, value decimal(7,4)) +mysql> alter table test.t set tiflash replica 1 +mysql> insert into test.t values(1,999.9999) + +func> wait_table test t + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,-2) as result from test.t +result +1,000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,0) as 
result from test.t +result +1,000 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,2) as result from test.t +result +1,000.00 + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select format(value,10) as result from test.t +result +999.9999000000 + +mysql> drop table if exists test.t diff --git a/tests/fullstack-test/expr/get_format.test b/tests/fullstack-test/expr/get_format.test new file mode 100644 index 00000000000..5409302c10a --- /dev/null +++ b/tests/fullstack-test/expr/get_format.test @@ -0,0 +1,60 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +mysql> drop table if exists test.t; +mysql> create table test.t(location varchar(10)); +mysql> insert into test.t values('USA'), ('JIS'), ('ISO'), ('EUR'), ('INTERNAL'); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t +mysql> set @@tidb_enforce_mpp=1; set @@tidb_isolation_read_engines='tiflash'; select GET_FORMAT(DATE, location) from test.t; ++----------------------------+ +| GET_FORMAT(DATE, location) | ++----------------------------+ +| %m.%d.%Y | +| %Y-%m-%d | +| %Y-%m-%d | +| %d.%m.%Y | +| %Y%m%d | ++----------------------------+ +mysql> set @@tidb_enforce_mpp=1; set @@tidb_isolation_read_engines='tiflash'; select GET_FORMAT(DATETIME, location) from test.t; ++--------------------------------+ +| GET_FORMAT(DATETIME, location) | ++--------------------------------+ +| %Y-%m-%d %H.%i.%s | +| %Y-%m-%d %H:%i:%s | +| %Y-%m-%d %H:%i:%s | +| %Y-%m-%d %H.%i.%s | +| %Y%m%d%H%i%s | ++--------------------------------+ +mysql> set @@tidb_enforce_mpp=1; set @@tidb_isolation_read_engines='tiflash'; select GET_FORMAT(TIMESTAMP, location) from test.t; ++---------------------------------+ +| GET_FORMAT(TIMESTAMP, location) | ++---------------------------------+ +| %Y-%m-%d %H.%i.%s | +| %Y-%m-%d %H:%i:%s | +| %Y-%m-%d %H:%i:%s | +| %Y-%m-%d %H.%i.%s | +| %Y%m%d%H%i%s | ++---------------------------------+ +mysql> set @@tidb_enforce_mpp=1; set @@tidb_isolation_read_engines='tiflash'; select GET_FORMAT(TIME, location) from test.t; ++----------------------------+ +| GET_FORMAT(TIME, location) | ++----------------------------+ +| %h:%i:%s %p | +| %H:%i:%s | +| %H:%i:%s | +| %H.%i.%s | +| %H%i%s | ++----------------------------+ +mysql> drop table if exists test.t; diff --git a/tests/fullstack-test/expr/reverse.test b/tests/fullstack-test/expr/reverse.test new file mode 100644 index 00000000000..9195adf2b7d --- /dev/null +++ b/tests/fullstack-test/expr/reverse.test @@ -0,0 +1,44 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +mysql> drop table if exists test.t; +mysql> create table if not exists test.t(a varchar(256)); + + +mysql> insert into test.t values('one week’s time test'); +mysql> insert into test.t values('abc测试def'); +mysql> insert into test.t values('abcテストabc'); +mysql> insert into test.t values('ѐёђѓєѕіїјљњћќѝўџ'); +mysql> insert into test.t values('+ѐ-ё*ђ/ѓ!є@ѕ#і@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^'); +mysql> insert into test.t values('αβγδεζηθικλμνξοπρστυφχψωσ'); +mysql> insert into test.t values('▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕'); +mysql> insert into test.t values('թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ'); +mysql> insert into test.t values(NULL); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select reverse(a) from test.t; ++-------------------------------------------------------------------------------------------------+ +| reverse(a) | ++-------------------------------------------------------------------------------------------------+ +| tset emit s’keew eno | +| fed试测cba | +| cbaトステcba | +| џўѝќћњљјїіѕєѓђёѐ | +| ^!џ@ў#ѝ¥ќ)ћ(њ&љ……ј%ї@і#ѕ@є!ѓ/ђ*ё-ѐ+ | +| σωψχφυτσρποξνμλκιθηζεδγβα | +| ✕σ★ω♘ψχ♖φυ♥τσ℉ρπ✚οξ✓νμ♫λκ€ιθ✂ηζ☎εδ➨γβ▼α▲ | +| շմնբվցղզխլկյհգֆդսապօիւըտռեոքծժճչրջձփթ | +| NULL | ++-------------------------------------------------------------------------------------------------+ diff --git a/tests/fullstack-test/mpp/issue_2471.test b/tests/fullstack-test/mpp/issue_2471.test index 4a1528595e8..9966eaadec3 100644 --- a/tests/fullstack-test/mpp/issue_2471.test +++ b/tests/fullstack-test/mpp/issue_2471.test @@ -35,7 +35,7 @@ mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_opt_bro => DBGInvoke __enable_fail_point(exception_in_creating_set_input_stream) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_opt_broadcast_cartesian_join=2; select * from a as t1 left join a as t2 on t1.id = t2.id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_in_creating_set_input_stream is triggered. 
+ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_in_creating_set_input_stream is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_in_creating_set_input_stream) diff --git a/tests/fullstack-test/mpp/mpp_fail.test b/tests/fullstack-test/mpp/mpp_fail.test index 7af5fef3f89..0e272c0b621 100644 --- a/tests/fullstack-test/mpp/mpp_fail.test +++ b/tests/fullstack-test/mpp/mpp_fail.test @@ -71,20 +71,20 @@ ERROR 1105 (HY000) at line 1: DB::Exception: Fail point FailPoints::exception_be ## exception during mpp run non root task => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchange receiver meet error : DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered. +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception,, e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) ## exception during mpp run root task => DBGInvoke __enable_fail_point(exception_during_mpp_root_task_run) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_during_mpp_root_task_run is triggered. +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_root_task_run is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_root_task_run) ## exception during mpp write err to tunnel => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __enable_fail_point(exception_during_mpp_write_err_to_tunnel) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchange receiver meet error : Failed to write error msg to tunnel +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Failed to write error msg to tunnel, e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __disable_fail_point(exception_during_mpp_write_err_to_tunnel) @@ -92,7 +92,7 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchang => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __enable_fail_point(exception_during_mpp_close_tunnel) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchange receiver meet error : DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered. 
+ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception,, e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run) => DBGInvoke __disable_fail_point(exception_during_mpp_close_tunnel) @@ -125,7 +125,7 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchang ## ensure build1, build2-probe1, probe2 in the CreatingSets, test the bug where build1 throw exception but not change the build state, thus block the build2-probe1, at last this query hangs. => DBGInvoke __enable_fail_point(exception_mpp_hash_build) mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; set @@tidb_broadcast_join_threshold_count=0; set @@tidb_broadcast_join_threshold_size=0; select t1.id from test.t t1 join test.t t2 on t1.id = t2.id and t1.id <2 join (select id from test.t group by id) t3 on t2.id=t3.id; -ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_mpp_hash_build is triggered. +ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_mpp_hash_build is triggered., e.what() = DB::Exception, => DBGInvoke __disable_fail_point(exception_mpp_hash_build) # Clean up. diff --git a/tests/fullstack-test/mpp/window.test b/tests/fullstack-test/mpp/window.test new file mode 100644 index 00000000000..698d39ef2ea --- /dev/null +++ b/tests/fullstack-test/mpp/window.test @@ -0,0 +1,32 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +mysql> drop table if exists test.t1; +mysql> create table test.t1(c1 int, c2 int); +mysql> insert into test.t1 values(1, 1),(2, 2),(3, 3),(1, 1),(2, 2),(3, 3),(4, 4); +mysql> alter table test.t1 set tiflash replica 1; +func> wait_table test t1 +mysql> use test; set @@tidb_isolation_read_engines='tiflash'; select c1, c2, row_number() over w2, row_number() over w1 from test.t1 window w1 as(partition by c1), w2 as (partition by c1, c2) order by 1, 2, 3, 4; ++------+------+----------------------+----------------------+ +| c1 | c2 | row_number() over w2 | row_number() over w1 | ++------+------+----------------------+----------------------+ +| 1 | 1 | 1 | 1 | +| 1 | 1 | 2 | 2 | +| 2 | 2 | 1 | 1 | +| 2 | 2 | 2 | 2 | +| 3 | 3 | 1 | 1 | +| 3 | 3 | 2 | 2 | +| 4 | 4 | 1 | 1 | ++------+------+----------------------+----------------------+ +mysql> drop table if exists test.t1; diff --git a/tests/fullstack-test2/ddl/alter_table_tiflash_replica_and_mode.test b/tests/fullstack-test2/ddl/alter_table_tiflash_replica_and_mode.test new file mode 100644 index 00000000000..5e43936379b --- /dev/null +++ b/tests/fullstack-test2/ddl/alter_table_tiflash_replica_and_mode.test @@ -0,0 +1,89 @@ +# Copyright 2022 PingCAP, Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# test tiflash replica for normal case +mysql> drop table if exists test.t +mysql> create table test.t(a int) +mysql> alter table test.t set tiflash replica 1 + +func> wait_table test t + +>> DBGInvoke get_tiflash_replica_count("test", "t") +┌─get_tiflash_replica_count(test, t)─┐ +│ 1 │ +└────────────────────────────────────┘ + +# test tiflash mode in normal mode +>> DBGInvoke get_tiflash_mode("test", "t") +┌─get_tiflash_mode(test, t)─┐ +│ │ +└───────────────────────────┘ + +mysql> alter table test.t set tiflash mode fast + +>> DBGInvoke __refresh_schemas() + +# test tiflash mode in fast mode +>> DBGInvoke get_tiflash_mode("test", "t") +┌─get_tiflash_mode(test, t)───┐ +│ fast │ +└─────────────────────────────┘ + +# test replica for partition tables +mysql> drop table if exists test.t +mysql> create table test.t (x int) partition by range (x) (partition p0 values less than (5), partition p1 values less than (10)); +mysql> alter table test.t set tiflash mode fast +mysql> alter table test.t set tiflash replica 1 + +func> wait_table test t + +>> DBGInvoke get_tiflash_replica_count("test", "t") +┌─get_tiflash_replica_count(test, t)─┐ +│ 1 │ +└────────────────────────────────────┘ + +>> DBGInvoke get_tiflash_mode("test", "t") +┌─get_tiflash_mode(test, t)──────────┐ +│ fast │ +└────────────────────────────────────┘ + +>> DBGInvoke get_partition_tables_tiflash_replica_count("test", "t") +┌─get_partition_tables_tiflash_replica_count(test, t)─┐ +│ 1/1/ │ +└─────────────────────────────────────────────────────┘ + +# test tiflash mode for partition tables +>> DBGInvoke get_partition_tables_tiflash_mode("test", "t") +┌─get_partition_tables_tiflash_mode(test, t)─┐ +│ fast/fast/ │ +└────────────────────────────────────────────┘ + +# test replica for add partition tables after set replica +mysql> alter table test.t add partition (partition p2 values less than (2010)); + +>> DBGInvoke __refresh_schemas() + +>> DBGInvoke get_partition_tables_tiflash_replica_count("test", "t") +┌─get_partition_tables_tiflash_replica_count(test, t)─┐ +│ 1/1/1/ │ +└─────────────────────────────────────────────────────┘ + +# test tiflash mode for add partition tables after set replica +>> DBGInvoke get_partition_tables_tiflash_mode("test", "t") +┌─get_partition_tables_tiflash_mode(test, t)─┐ +│ fast/fast/fast/ │ +└────────────────────────────────────────────┘ + + + diff --git a/tests/fullstack-test2/ddl/alter_tiflash_mode.test b/tests/fullstack-test2/ddl/alter_tiflash_mode.test new file mode 100644 index 00000000000..c9f3ef488c4 --- /dev/null +++ b/tests/fullstack-test2/ddl/alter_tiflash_mode.test @@ -0,0 +1,48 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +mysql> drop table if exists test.t +mysql> create table test.t(a int, b int) +mysql> alter table test.t set tiflash replica 1 + +func> wait_table test t + +# check default mode of tiflash table +mysql> select table_schema,table_name,replica_count,available,table_mode from information_schema.tiflash_replica where table_schema='test' and table_name='t'; ++--------------+------------+---------------+-----------+-----------+ +| table_schema | table_name | replica_count | available | table_mode| ++--------------+------------+---------------+-----------+-----------+ +| test | t | 1 | 1 | NORMAL | ++--------------+------------+---------------+-----------+-----------+ + +# check change mode + +mysql> alter table test.t set tiflash mode fast +mysql> select table_schema,table_name,replica_count,available,table_mode from information_schema.tiflash_replica where table_schema='test' and table_name='t'; ++--------------+------------+---------------+-----------+-----------+ +| table_schema | table_name | replica_count | available | table_mode| ++--------------+------------+---------------+-----------+-----------+ +| test | t | 1 | 1 | FAST | ++--------------+------------+---------------+-----------+-----------+ + +# check change mode +mysql> alter table test.t set tiflash mode normal +mysql> select table_schema,table_name,replica_count,available,table_mode from information_schema.tiflash_replica where table_schema='test' and table_name='t'; ++--------------+------------+---------------+-----------+-----------+ +| table_schema | table_name | replica_count | available | table_mode| ++--------------+------------+---------------+-----------+-----------+ +| test | t | 1 | 1 | NORMAL | ++--------------+------------+---------------+-----------+-----------+ + +mysql> drop table if exists test.t \ No newline at end of file diff --git a/tests/fullstack-test2/ddl/multi_alter_with_write.test b/tests/fullstack-test2/ddl/multi_alter_with_write.test new file mode 100644 index 00000000000..3284511d775 --- /dev/null +++ b/tests/fullstack-test2/ddl/multi_alter_with_write.test @@ -0,0 +1,880 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this test focus on the case when multi DDL actions happen closely +#( and these DDL actions will be fetched in the same regular sync schema duration.) +# and there are some corresponding insert(write) actions between these DDL actions. 
+# Considering that these write actions and these schema change will arrive at +# tiflash in a different order, we simulate these different order situation to check +# that our schema module was working correctly. + +# TiDB Timeline : write cmd 1 | alter cmd 1 | write cmd 2 | alter cmd 2 | write cmd 3 + +# stop regular schema sync +=> DBGInvoke __enable_schema_sync_service('false') + +# Enable the failpoint and make it pause before applying the raft cmd to write a row +>> DBGInvoke __init_fail_point() +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# Enable the failpoint to make our query only start when the write action finished +>> DBGInvoke __enable_fail_point(unblock_query_init_after_write) + +# ----------------------------------------------------------------------------- +# Order 1 : write cmd 1 | alter cmd 1 | write cmd 2 | alter cmd 2 | write cmd 3 +# ----------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# make alter cmd 1 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +│ 3 │ 0.20 │ ccc │ 3 │ 0.1 │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# make alter cmd 2 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +└─────┴───────┴─────┴─────┘ + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# --------------------------------------------------------------------------------------------- +# Order 2 : write cmd 1 | alter cmd 1 | write cmd 2 | 
write cmd 3 --> sync schema(alter cmd 2) +# --------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# make alter cmd 1 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +│ 3 │ 0.20 │ ccc │ 3 │ 0.1 │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +# check what happen after write cmd 3 --> call sync schema and get alter cmd 2 happen +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# ----------------------------------------------------------------------------------------------- +# Order 3 : write cmd 1 | alter cmd 1 | alter cmd 2 | write cmd 2 -->sync schema() | write cmd 3 +# ----------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# make alter cmd 1 
take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# make alter cmd 2 take effect +>> DBGInvoke __refresh_schemas() + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# ----------------------------------------------------------------------------------------------- +# Order 4 : write cmd 1 | write cmd 2 --> sync schema(alter cmd 1) | alter cmd 2 | write cmd 3 +# ----------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# check no schema change before write cmd 2 take effect +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +# check what happen after write cmd 2 --> should call sync schema, get the alter cmd 1 happened. 
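+# (write cmd 2 carries a value for the new column e that the local TiFlash schema does
+# not know yet, which is presumably what forces the schema sync before decoding)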
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +│ 3 │ 0.20 │ ccc │ 3 │ 0.1 │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# make alter cmd 2 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +└─────┴───────┴─────┴─────┘ + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# --------------------------------------------------------------------------------------------------------------------- +# Order 5 : write cmd 1 | write cmd 2 --> sync schema(alter cmd 1) | write cmd 3 --> sync schema(alter cmd 2) +# ---------------------------------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +# check what happen after write cmd 2 --> should call sync schema, get the alter cmd 1 happened. +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +│ 3 │ 0.20 │ ccc │ 3 │ 0.1 │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +# check what happen after write cmd 3 --> should call sync schema, get the alter cmd 2 happened. 
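+# (likewise, write cmd 3 has no value for the dropped column b, so applying it forces
+# another schema sync, which picks up alter cmd 2)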
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# ----------------------------------------------------------------------------------------------- +# Order 6 : write cmd 1 | write cmd 2 --> sync schema(alter cmd 1 alter cmd 2) | write cmd 3 +# ----------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 1 │ 4.50 │ abc │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +# check what happen after write cmd 2 --> should call sync schema, get the alter cmd 1 && alter cmd 2 happened. +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# ------------------------------------------------------------------------------- +# Order 7 : alter cmd 1 | write cmd 1 | write cmd 2 | alter cmd 2 | write cmd 3 +# ------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# add a new pre write to make check the alter cmd 1 more convenient. 
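+# (with one row already present, the \N shown later in the new column e confirms that
+# alter cmd 1 reached existing data)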
+mysql> insert into test.t (a, b, c) values (0, 0, ' '); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 0 │ 0.00 │ │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# make alter cmd 1 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 0 │ 0.00 │ │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 0 │ 0.00 │ │ 0 │ \N │ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 0 │ 0.00 │ │ 0 │ \N │ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +│ 3 │ 0.20 │ ccc │ 3 │ 0.1 │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# make alter cmd 2 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 0 │ │ 0 │ \N │ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +└─────┴───────┴─────┴─────┘ + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 0 │ │ 0 │ \N │ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# -------------------------------------------------------------------------------------------------- +# Order 8 : alter cmd 1 | write cmd 1 | write cmd 2 | write cmd 3 --> sync schema(alter cmd 2) +# -------------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# add a new pre write to make check the alter cmd 1 more convenient. 
+mysql> insert into test.t (a, b, c) values (0, 0, ' '); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 0 │ 0.00 │ │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# make alter cmd 1 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 0 │ 0.00 │ │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 0 │ 0.00 │ │ 0 │ \N │ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 0 │ 0.00 │ │ 0 │ \N │ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +│ 3 │ 0.20 │ ccc │ 3 │ 0.1 │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +# check what happen after write cmd 3 --> should call sync schema, get the alter cmd 2 happened. +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 0 │ │ 0 │ \N │ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# -------------------------------------------------------------------------------------------------- +# Order 9 : alter cmd 1 | write cmd 1 | alter cmd 2 | write cmd 2 -->sync schema() | write cmd 3 +# -------------------------------------------------------------------------------------------------- + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# add a new pre write to make check the alter cmd 1 more convenient. 
+mysql> insert into test.t (a, b, c) values (0, 0, ' '); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┐ +│ 0 │ 0.00 │ │ 0 │ +└─────┴──────┴───────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# alter cmd 1 +mysql> alter table test.t add column e decimal(6,1) NULL; + +# make alter cmd 1 take effect +>> DBGInvoke __refresh_schemas() + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 0 │ 0.00 │ │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +# write cmd 1 +mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc'); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 1 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─b────┬─c─────┬─d───┬─e───┐ +│ 0 │ 0.00 │ │ 0 │ \N │ +│ 1 │ 4.50 │ abc │ 0 │ \N │ +└─────┴──────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# write cmd 2 +mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1); + +# alter cmd 2 +mysql> alter table test.t drop column b; + +# make alter cmd 2 take effect +>> DBGInvoke __refresh_schemas() + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 2 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 0 │ │ 0 │ \N │ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# write cmd 3 +mysql> insert into test.t values (4, 'abcd', 10, 0.2); + +# enable pause_query_init make query start until write cmd finish +>> DBGInvoke __enable_fail_point(pause_query_init) + +# make write cmd 3 take effect +>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd) + +=> DBGInvoke query_mapped('select * from \$d.\$t', test, t) +┌─a───┬─c─────┬─d───┬─e───┐ +│ 0 │ │ 0 │ \N │ +│ 1 │ abc │ 0 │ \N │ +│ 3 │ ccc │ 3 │ 0.1 │ +│ 4 │ abcd │ 10 │ 0.2 │ +└─────┴───────┴─────┴─────┘ + +>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd) + +# ------------------------------------------------------------------------------------------------------------------ +# Order 10 : alter cmd 1 | alter cmd 2 | write cmd 1 -->sync schema() | write cmd 2 -->sync schema() | write cmd 3 +# ------------------------------------------------------------------------------------------------------------------ + +mysql> drop table if exists test.t +mysql> create table test.t(a int primary key, b decimal(5,2) not NULL, c varchar(10), d int default 0); + +mysql> alter table test.t set tiflash replica 1; + +func> wait_table test t + +# add a new pre write to make check the alter cmd 1 more convenient. 
+mysql> insert into test.t (a, b, c) values (0, 0, ' ');
+
+# enable pause_query_init so that queries cannot start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─b────┬─c─────┬─d───┐
+│   0 │ 0.00 │       │   0 │
+└─────┴──────┴───────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# write cmd 1
+mysql> insert into test.t (a, b, c) values (1, 4.5, 'abc');
+
+# alter cmd 1
+mysql> alter table test.t add column e decimal(6,1) NULL;
+
+# make alter cmd 1 take effect
+>> DBGInvoke __refresh_schemas()
+
+# write cmd 2
+mysql> insert into test.t values (3, 0.2, 'ccc', 3, 0.1);
+
+# alter cmd 2
+mysql> alter table test.t drop column b;
+
+# make alter cmd 2 take effect
+>> DBGInvoke __refresh_schemas()
+
+# enable pause_query_init so that queries cannot start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 1 and write cmd 2 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│   0 │       │   0 │  \N │
+│   1 │ abc   │   0 │  \N │
+│   3 │ ccc   │   3 │ 0.1 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+# write cmd 3
+mysql> insert into test.t values (4, 'abcd', 10, 0.2);
+
+# enable pause_query_init so that queries cannot start until the write cmd finishes
+>> DBGInvoke __enable_fail_point(pause_query_init)
+
+# make write cmd 3 take effect
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+
+=> DBGInvoke query_mapped('select * from \$d.\$t', test, t)
+┌─a───┬─c─────┬─d───┬─e───┐
+│   0 │       │   0 │  \N │
+│   1 │ abc   │   0 │  \N │
+│   3 │ ccc   │   3 │ 0.1 │
+│   4 │ abcd  │  10 │ 0.2 │
+└─────┴───────┴─────┴─────┘
+
+>> DBGInvoke __enable_fail_point(pause_before_apply_raft_cmd)
+
+
+##
+
+=> DBGInvoke __enable_schema_sync_service('true')
+>> DBGInvoke __disable_fail_point(pause_before_apply_raft_cmd)
+>> DBGInvoke __disable_fail_point(unblock_query_init_after_write)
+>> DBGInvoke __disable_fail_point(pause_query_init)
\ No newline at end of file
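Taken together, these orderings pin down one invariant: a Raft write whose row layout no longer matches TiFlash's local schema must trigger a schema sync before it is applied, which is why write cmd 3 can observe alter cmd 2 even without an explicit `__refresh_schemas()`. Below is a toy Python model of that decode-then-sync retry; the class and the bare column-count check are illustrative assumptions, not TiFlash's actual decoding logic.

```python
# Toy model only: TiFlash's real decoder works on encoded Raft rows and
# schema versions; here a column-count mismatch stands in for
# "this write cannot be decoded with the local schema".

class LocalTable:
    def __init__(self, columns):
        self.columns = list(columns)           # TiFlash-side schema copy

    def sync_schema(self, upstream_columns):
        self.columns = list(upstream_columns)  # pull the latest schema from TiDB

    def apply_write(self, row, upstream_columns):
        if len(row) != len(self.columns):
            # decode failed -> sync schema, then retry the decode once
            self.sync_schema(upstream_columns)
        assert len(row) == len(self.columns), "write still undecodable"
        return dict(zip(self.columns, row))

local = LocalTable(['a', 'b', 'c', 'd', 'e'])
tidb = ['a', 'c', 'd', 'e']                    # alter cmd 2 already dropped b upstream
print(local.apply_write([4, 'abcd', 10, 0.2], tidb))
# {'a': 4, 'c': 'abcd', 'd': 10, 'e': 0.2}
```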
diff --git a/tests/run-test.py b/tests/run-test.py
index 843fe7c79b4..a2bcee0ce99 100644
--- a/tests/run-test.py
+++ b/tests/run-test.py
@@ -29,6 +29,7 @@
 UNFINISHED_1_PREFIX = '\t'
 UNFINISHED_2_PREFIX = '  '
 WORD_PH = '{#WORD}'
+LINE_PH = '{#LINE}'
 CURL_TIDB_STATUS_PREFIX = 'curl_tidb> '
 
 verbose = False
@@ -138,18 +139,22 @@ def match_ph_word(line):
 
 # TODO: Support more place holders, eg: {#NUMBER}
 def compare_line(line, template):
-    while True:
-        i = template.find(WORD_PH)
-        if i < 0:
-            return line == template
-        else:
-            if line[:i] != template[:i]:
-                return False
-            j = match_ph_word(line[i:])
-            if j == 0:
-                return False
-            template = template[i + len(WORD_PH):]
-            line = line[i + j:]
+    l = template.find(LINE_PH)
+    if l >= 0:
+        return True
+    else:
+        while True:
+            i = template.find(WORD_PH)
+            if i < 0:
+                return line == template
+            else:
+                if line[:i] != template[:i]:
+                    return False
+                j = match_ph_word(line[i:])
+                if j == 0:
+                    return False
+                template = template[i + len(WORD_PH):]
+                line = line[i + j:]
 
 
 class MySQLCompare:
@@ -194,11 +199,14 @@ def matched(outputs, matches):
             b = MySQLCompare.parse_excepted_outputs(matches)
             return a == b
         else:
-            if len(outputs) != len(matches):
+            if len(outputs) > len(matches):
                 return False
             for i in range(0, len(outputs)):
                 if not compare_line(outputs[i], matches[i]):
                     return False
+            for i in range(len(outputs), len(matches)):
+                if not compare_line("", matches[i]):
+                    return False
             return True
 
 
@@ -212,11 +220,14 @@ def matched(outputs, matches, fuzz):
             b = parse_table_parts(matches, fuzz)
             return a == b
         else:
-            if len(outputs) != len(matches):
+            if len(outputs) > len(matches):
                 return False
             for i in range(0, len(outputs)):
                 if not compare_line(outputs[i], matches[i]):
                     return False
+            for i in range(len(outputs), len(matches)):
+                if not compare_line("", matches[i]):
+                    return False
             return True
 
 
diff --git a/tests/sanitize/tsan.suppression b/tests/sanitize/tsan.suppression
new file mode 100644
index 00000000000..73824caa2b9
--- /dev/null
+++ b/tests/sanitize/tsan.suppression
@@ -0,0 +1 @@
+race:dbms/src/Common/TiFlashMetrics.h
diff --git a/tests/tidb-ci/new_collation_fullstack/expr.test b/tests/tidb-ci/new_collation_fullstack/expr.test
index 15ada0f335c..1e2135c4f2d 100644
--- a/tests/tidb-ci/new_collation_fullstack/expr.test
+++ b/tests/tidb-ci/new_collation_fullstack/expr.test
@@ -35,6 +35,13 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_s
 |    2 | abc   |
 +------+-------+
 
+mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_storage(tiflash[t]) */ id, value1 from test.t where value1 = 'abc ';
++------+-------+
+| id   | value1|
++------+-------+
+|    1 | abc   |
+|    2 | abc   |
++------+-------+
 
 mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_storage(tiflash[t]) */ id, value from test.t where value like 'aB%';
 +------+-------+
@@ -62,6 +69,13 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_s
 |    3 | def   |
 +------+-------+
 
+mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_storage(tiflash[t]) */ id, value1 from test.t where value1 = 'def ';
++------+-------+
+| id   | value1|
++------+-------+
+|    3 | def   |
++------+-------+
+
 mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_storage(tiflash[t]) */ id, value1 from test.t where value1 in ('Abc','def');
 +------+-------+
 | id   | value1|
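The run-test.py hunks above introduce a `{#LINE}` placeholder next to the existing `{#WORD}`, and relax `matched` so the expected block may contain more template lines than actual output, provided each surplus template accepts an empty line. Below is a self-contained sketch of those semantics; `match_ph_word` here is a simplified stand-in for the real helper, which is not shown in this diff.

```python
WORD_PH = '{#WORD}'
LINE_PH = '{#LINE}'


def match_ph_word(line):
    # Simplified stand-in: the length of the leading non-space token;
    # 0 means {#WORD} has nothing to consume, i.e. no match.
    n = 0
    while n < len(line) and not line[n].isspace():
        n += 1
    return n


def compare_line(line, template):
    # As in the patched compare_line: a template containing {#LINE}
    # accepts any actual line, including a missing (empty) one.
    if template.find(LINE_PH) >= 0:
        return True
    while True:
        i = template.find(WORD_PH)
        if i < 0:
            return line == template
        if line[:i] != template[:i]:
            return False
        j = match_ph_word(line[i:])
        if j == 0:
            return False
        template = template[i + len(WORD_PH):]
        line = line[i + j:]


assert compare_line('Inserted 42 rows', 'Inserted {#WORD} rows')
assert not compare_line('Inserted  rows', 'Inserted {#WORD} rows')  # nothing to consume
assert compare_line('anything at all', '{#LINE}')
assert compare_line('', '{#LINE}')  # why matched() can probe surplus templates with ""
```

The first `{#LINE}` in a template short-circuits the whole comparison, which is what lets the relaxed `matched` walk the trailing, output-less templates with an empty string and still succeed.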