diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index ee2c31bb99..36860f2d44 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -130,104 +130,6 @@ __find_start_point(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_
     return _split_point_t<_Index>{*__res, __index_sum - *__res + 1};
 }
 
-//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges
-//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below:
-//     0   1  1  2   3
-//    ------------------
-//   |--->
-// 0 | 0 | 1  1  1   1
-//   |   |
-// 0 | 0 | 1  1  1   1
-//   |   ---------->
-// 2 | 0   0  0  0 | 1
-//   |             ---->
-// 3 | 0   0  0  0   0 |
-template <typename _Rng1, typename _Rng2, typename _Index, typename _Compare>
-_split_point_t<_Index>
-__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2,
-                      const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp)
-{
-    // ----------------------- EXAMPLE ------------------------
-    // Let's consider the following input data:
-    //    rng1.size() = 10
-    //    rng2.size() = 6
-    //    i_diag = 9
-    // Let's define the following ranges for processing:
-    //    rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9
-    //    rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4
-    //
-    // The goal: required to process only X' items of the merge matrix
-    //           as intersection of rng1[3, ..., 9) and rng2[1, ..., 4)
-    //
-    // --------------------------------------------------------
-    //
-    //         __diag_it_begin(rng1)            __diag_it_end(rng1)
-    //      (init state) (dest state)          (init state, dest state)
-    //            |          |                       |
-    //            V          V                       V
-    //                       +   +   +   +   +   +
-    //    \ rng1  0  1   2   3   4   5   6   7   8   9
-    //   rng2   +--------------------------------------+
-    //    0     |                    ^   ^   ^   X     |     <--- __diag_it_end(rng2) (init state)
-    // +  1     | <----------------- +   +   X'2 ^     |     <--- __diag_it_end(rng2) (dest state)
-    // +  2     | <----------------- +   X'1     |     |
-    // +  3     | <----------------- X'0         |     |     <--- __diag_it_begin(rng2) (dest state)
-    //    4     |                X   ^           |     |
-    //    5     |            X       |           |     |     <--- __diag_it_begin(rng2) (init state)
-    //          +-------AX-----------+-----------+-----+
-    //              AX               |           |
-    //           AX                  |           |
-    //              Run lower_bound:[from = 5,   to = 8)
-    //
-    //  AX - absent items in rng2
-    //
-    //  We have three points on diagonal for call comparison:
-    //      X'0 : call __comp(rng1[5], rng2[3])             // 5 + 3 == 9 - 1 == 8
-    //      X'1 : call __comp(rng1[6], rng2[2])             // 6 + 2 == 9 - 1 == 8
-    //      X'3 : call __comp(rng1[7], rng2[1])             // 7 + 1 == 9 - 1 == 8
-    //   - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1
-
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Taking into account the specified constraints of the range of processed data
-    const auto __index_sum = __i_elem - 1;
-
-    using _IndexSigned = std::make_signed_t<_Index>;
-
-    _IndexSigned idx1_from = __rng1_from;
-    _IndexSigned idx1_to = __rng1_to;
-
-    _IndexSigned idx2_from = __index_sum - (__rng1_to - 1);
-    _IndexSigned idx2_to = __index_sum - __rng1_from + 1;
-
-    const _IndexSigned idx2_from_diff =
-        idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0;
-    const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0;
-
-    idx1_to -= idx2_from_diff;
-    idx1_from += idx2_to_diff;
-
-    idx2_from = __index_sum - (idx1_to - 1);
-    idx2_to = __index_sum - idx1_from + 1;
-
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Run search of split point on diagonal
-
-    using __it_t = oneapi::dpl::counting_iterator<_Index>;
-
-    __it_t __diag_it_begin(idx1_from);
-    __it_t __diag_it_end(idx1_to);
-
-    constexpr int kValue = 1;
-    const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue,
-                                          [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) {
-                                              const auto __zero_or_one =
-                                                  __comp(__rng2[__index_sum - __idx], __rng1[__idx]);
-                                              return __zero_or_one < kValue;
-                                          });
-
-    return _split_point_t<_Index>{*__res, __index_sum - *__res + 1};
-}
-
 // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing
 // to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2)
 template <typename _Rng1, typename _Rng2, typename _Rng3, typename _Index, typename _Compare>
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h
index 0018bb0c3e..7583c61423 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h
@@ -241,7 +241,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
 
     struct nd_range_params
     {
-        std::size_t base_diag_count = 0;
+        std::size_t base_diag_count = 0; // Amount of the base diagonals in all sorting sub-groups (where each has 2 * __n_sorted elements or the data rest)
         std::size_t steps_between_two_base_diags = 0;
         _IndexT chunk = 0;
         _IndexT steps = 0;
@@ -285,27 +285,29 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
         //                 |                           |
         //                 +---------------------------+
 
-        _IndexT i_elem = 0;       // Global diagonal index
-        _IndexT i_elem_local = 0; // Local diagonal index
-        _IndexT offset = 0;       // Offset to the first element in the subrange (i.e. the first element of the first subrange for merge)
-        _IndexT n1 = 0; // Size of the first subrange
-        _IndexT n2 = 0; // Size of the second subrange
+        std::size_t n_sorted = 0; // Size of the sorted subrange
+        _IndexT i_elem = 0;         // Global diagonal index
+        _IndexT i_elem_local = 0;   // Local diagonal index
+        _IndexT offset = 0;         // Offset to the first element in the subrange (i.e. the first element of the first subrange for merge)
+        _IndexT n1 = 0;             // Size of the first subrange
+        _IndexT n2 = 0;             // Size of the second subrange
 
         WorkDataArea(const std::size_t __n, const std::size_t __n_sorted, const std::size_t __linear_id,
                      const std::size_t __chunk)
+            : n_sorted(__n_sorted)
         {
             // Calculate global diagonal index
             i_elem = __linear_id * __chunk;
 
             // Calculate local diagonal index
-            i_elem_local = i_elem % (__n_sorted * 2);
+            i_elem_local = i_elem % (n_sorted * 2);
 
             // Calculate offset to the first element in the subrange (i.e. the first element of the first subrange for merge)
             offset = std::min<_IndexT>(i_elem - i_elem_local, __n);
 
             // Calculate size of the first and the second subranges
-            n1 = std::min<_IndexT>(offset + __n_sorted, __n) - offset;
-            n2 = std::min<_IndexT>(offset + __n_sorted + n1, __n) - (offset + n1);
+            n1 = std::min<_IndexT>(offset + n_sorted, __n) - offset;
+            n2 = std::min<_IndexT>(offset + n_sorted + n1, __n) - (offset + n1);
         }
 
         inline bool
@@ -332,7 +334,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
     // Calculate nd-range params
     template <typename _ExecutionPolicy>
     nd_range_params
-    eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, _IndexT __n_sorted) const
+    eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size) const
     {
         const bool __is_cpu = __exec.queue().get_device().is_cpu();
         const _IndexT __chunk = __is_cpu ? 32 : 4;
@@ -373,7 +375,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
     {
         const _IndexT __n = __rng.size();
 
-        return __exec.queue().submit([&, __event_chain](sycl::handler& __cgh) {
+        const std::size_t __bd_chunk = __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags;
+        const std::size_t __base_diags_amount_in_each_subrange = 2 *  __n_sorted / __bd_chunk;
+
+        return __exec.queue().submit([&__rng, &__temp_buf, &__event_chain, __bd_chunk,
+                                      __base_diags_amount_in_each_subrange, __data_in_temp, __comp, __nd_range_params,
+                                      &__base_diagonals_sp_global_storage, __n, __n_sorted](sycl::handler& __cgh) {
             __cgh.depends_on(__event_chain);
 
             oneapi::dpl::__ranges::__require_access(__cgh, __rng);
@@ -394,52 +401,90 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
                     auto __base_diagonals_sp_global_ptr =
                         _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc);
 
-                    // We should add `1` to __linear_id here to avoid calculation of split-point for 0-diagonal
-                    const WorkDataArea __data_area(__n, __n_sorted, __linear_id + 1,
-                                                   __nd_range_params.chunk *
-                                                       __nd_range_params.steps_between_two_base_diags);
-
-                    _merge_split_point_t __sp{__data_area.n1, __data_area.n2};
-
-                    if (__data_area.is_i_elem_local_inside_merge_matrix())
-                    {
-                        if (__data_in_temp)
-                        {
-                            DropViews __views(__dst, __data_area);
-                            __sp = __find_start_point_w(__data_area, __views, __comp);
-                        }
-                        else
-                        {
-                            DropViews __views(__rng, __data_area);
-                            __sp = __find_start_point_w(__data_area, __views, __comp);
-                        }
-                    }
-
-                    __base_diagonals_sp_global_ptr[__linear_id] = __sp;
+                    //   |                subrange 0             |                subrange 0             |                subrange 0             |                subrange 0             | subrange 5
+                    //   |      contains 2 * __n_sorted values   |      contains 2 * __n_sorted values   |      contains 2 * __n_sorted values   |      contains 2 * __n_sorted values   | the rest...  < Data parts
+                    //   |----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----|----/---      < Steps
+                    //   ^              ^              ^         ^              ^              ^         ^              ^              ^         ^              ^              ^         ^
+                    //   |              |              |         |              |              |         |              |              |         |              |              |         |
+                    // bd00           bd01           bd02      bd10           bd11           bd12       bd20           bd21           bd22      bd30           bd31           bd32      bd40            < Base diagonals
+                    //                  |              |                        |              |                        |              |                        |              |
+                    // __linear_id:     0              1                        2              3                        4              5                        6              7                        < [0, 1, ..., __nd_range_params.base_diag_count)
+                    //                  3              6                        11             14                       19             22                       27             30                       < __chunk_linear_id_for_this_base_diag
+                    // 
+                    // How we save BD's in the storage: [bd01, bd02, bd11, bd12, bd21, bd22, bd31, bd32]
+                    // We didn't save in the storage bd00, bd10, bd20, bd30 and bd40 because they all equal to {0, 0}
+
+                    const std::size_t __this_subrange_idx = __linear_id / __base_diags_amount_in_each_subrange;
+                    const std::size_t __base_diag_in_this_subrange_idx = __linear_id % __base_diags_amount_in_each_subrange;
+
+                    // Calculate the amount of chunks in each subrange
+                    const std::size_t __chunks_en_each_subrange = 2 * __n_sorted / __nd_range_params.chunk;
+
+                    // Calculate current BD (in chunks)
+                    // We should add `1` to __base_diag_in_this_subrange_idx here to avoid calculation of split-point for 0-diagonal
+                    const std::size_t __chunk_linear_id_for_this_base_diag =
+                        __this_subrange_idx * __chunks_en_each_subrange +
+                        (__base_diag_in_this_subrange_idx + 1) * __nd_range_params.steps_between_two_base_diags;
+
+                    const WorkDataArea __data_area(__n, __n_sorted, __chunk_linear_id_for_this_base_diag, __nd_range_params.chunk);
+
+                    __base_diagonals_sp_global_ptr[__linear_id] =
+                        __data_area.is_i_elem_local_inside_merge_matrix()
+                            ? (__data_in_temp
+                                   ? __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp)
+                                   : __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp))
+                            : _merge_split_point_t{__data_area.n1, __data_area.n2};
                 });
         });
     }
 
     template <typename DropViews, typename _Compare, typename _BaseDiagonalsSPStorage>
     inline static _merge_split_point_t
-    __lookup_sp(const std::size_t __global_idx, const nd_range_params& __nd_range_params,
+    __lookup_sp(const std::size_t __linear_id_in_steps_range,                       // [0, 1, 2, ... __nd_range_params.steps)
+                const nd_range_params& __nd_range_params,
                 const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp,
                 _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr)
-    {
-        std::size_t __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags;
-
-        assert(__diagonal_idx < __nd_range_params.base_diag_count);
-
-        const _merge_split_point_t __sp_left =
-            __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0};
-        const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx];
-
-        return __sp_right.first + __sp_right.second > 0
-                   ? (__global_idx % __nd_range_params.steps_between_two_base_diags != 0
-                          ? __find_start_point_in(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2,
-                                                  __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp)
-                          : __sp_left)
-                   : __find_start_point_w(__data_area, __views, __comp);
+    {   
+        //   |                subrange 0              |                subrange 0              |                subrange 0              |                subrange 0              | subrange 5
+        //   |      contains 2 * __n_sorted values    |      contains 2 * __n_sorted values    |      contains 2 * __n_sorted values    |      contains 2 * __n_sorted values    | the rest...  < Data parts
+        //   |----/----/----/----/----/----/----/----/|----/----/----/----/----/----/----/----/|----/----/----/----/----/----/----/----/|----/----/----/----/----/----/----/----/|----/---      < Steps
+        //   ^              ^              ^          ^              ^              ^          ^              ^         ^    ^          ^              ^              ^          ^
+        //   |              |              |          |              |              |          |              |         |    |          |              |              |          |
+        // bd00           bd01           bd02       bd10           bd11           bd12       bd20           bd21        |  bd22       bd30           bd31           bd32       bd40             < Base diagonals
+        //                  ^              ^                         ^              ^                         ^         |    ^                         ^              ^ 
+        //                  0              1                         2              3                         4         |    5                         6              7                         < Indexes in the base diagonal's SP storage
+        //   0    1    2    3    4    5    6    7    8    10   11   12    13   14  15   16   17    19   20    ^   22    |    ^   25    26    28  29   30   31   32   34    35   36   38         < Linear IDs: __linear_id_in_steps_range
+        //                                            9                                       18              |         |    |          27                                       37
+        //                                                                                                    |         |    |
+        //                                                                                               __bd_idx_l     |  __bd_idx_r
+        //                                                                                                              |
+        //                                                                                              __linear_id_in_steps_range
+        // 
+        // How we save BD's in the storage: [bd01, bd02, bd11, bd12, bd21, bd22, bd31, bd32]
+        // We didn't save in the storage bd00, bd10, db20, db30 and db40 because they all equal to {0, 0}
+
+        const std::size_t __diags_amount_in_each_subrange = 2 * __data_area.n_sorted / __nd_range_params.chunk;
+        const std::size_t __this_subrange_idx = __linear_id_in_steps_range / __diags_amount_in_each_subrange;
+
+        const std::size_t __bd_chunk = __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags;
+
+        const std::size_t __base_diags_amount_in_each_subrange = 2 *  __data_area.n_sorted / __bd_chunk;
+        const std::size_t __this_base_diagonal_idx_before      = __this_subrange_idx * __base_diags_amount_in_each_subrange;
+        const std::size_t __this_base_diagonal_idx_local       = __data_area.i_elem_local / __bd_chunk;
+        const bool        __we_are_on_base_diagonal            = __data_area.i_elem_local % __bd_chunk == 0;
+        const std::size_t __this_base_diagonal_idx             = __this_base_diagonal_idx_before + __this_base_diagonal_idx_local;
+
+        assert(__this_base_diagonal_idx_before + __this_base_diagonal_idx_local < __nd_range_params.base_diag_count);
+
+        const _merge_split_point_t __sp_left = __this_base_diagonal_idx_local > 0
+                                                   ? __base_diagonals_sp_global_ptr[__this_base_diagonal_idx - 1]
+                                                   : _merge_split_point_t{0, 0};
+        const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__this_base_diagonal_idx];
+
+        return __we_are_on_base_diagonal
+                   ? __sp_left
+                   : __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, __sp_left.second,
+                                        __sp_right.second, __data_area.i_elem_local, __comp);
     }
 
     // Process parallel merge
@@ -516,16 +561,18 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
                         {
                             DropViews __views(__dst, __data_area);
 
-                            const auto __sp = __lookup_sp(__linear_id /* __global_idx */, __nd_range_params,
-                                                          __data_area, __views, __comp, __base_diagonals_sp_global_ptr);
+                            const auto __sp =
+                                __lookup_sp(__linear_id /* __linear_id_in_steps_range */, __nd_range_params,
+                                            __data_area, __views, __comp, __base_diagonals_sp_global_ptr);
                             __serial_merge_w(__nd_range_params, __data_area, __views, __rng, __sp, __comp);
                         }
                         else
                         {
                             DropViews __views(__rng, __data_area);
 
-                            const auto __sp = __lookup_sp(__linear_id /* __global_idx */, __nd_range_params,
-                                                          __data_area, __views, __comp, __base_diagonals_sp_global_ptr);
+                            const auto __sp =
+                                __lookup_sp(__linear_id /* __linear_id_in_steps_range */, __nd_range_params,
+                                            __data_area, __views, __comp, __base_diagonals_sp_global_ptr);
                             __serial_merge_w(__nd_range_params, __data_area, __views, __dst, __sp, __comp);
                         }
                     }
@@ -549,7 +596,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
         using __value_type = oneapi::dpl::__internal::__value_t<_Range>;
 
         // Calculate nd-range params
-        const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, __n_sorted);
+        const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n);
 
         using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _merge_split_point_t>;
 
@@ -560,10 +607,13 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
 
         // Create container for storages with split-points on base diagonal
         //  - each iteration should have their own container
-        __container_of_temp_storages_t __temp_sp_storages(std::max(__n_iter, (std::int64_t)0));
+        __container_of_temp_storages_t __temp_sp_storages;
 
         for (std::int64_t __i = 0; __i < __n_iter; ++__i)
         {
+            // Check that the amount of elements in the subrange is a multiple of the chunk size
+            assert(2 * __n_sorted % __nd_range_params.chunk == 0);
+
             if (2 * __n_sorted < __get_starting_size_limit_for_large_submitter<__value_type>())
             {
                 // Process parallel merge
@@ -572,20 +622,31 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
             }
             else
             {
+                const std::size_t _amount_of_subgroups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted);
+                const std::size_t __diags_amount_in_each_subrange = 2 * __n_sorted / __nd_range_params.chunk;
+
+                const std::size_t __bd_chunk = __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags;
+                const std::size_t __base_diags_amount_in_each_subrange = 2 *  __n_sorted / __bd_chunk;
+
+                auto __nd_range_params_this = __nd_range_params;
+
+                // Calculate the amount of the base diagonals in all sorting sub-groups (where each has 2 * __n_sorted elements or the data rest)
+                __nd_range_params_this.base_diag_count = _amount_of_subgroups * __base_diags_amount_in_each_subrange;
+
                 // Create storage for save split-points on each base diagonal
                 // - for current iteration
                 auto __p_base_diagonals_sp_storage =
-                    new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params.base_diag_count);
-                __temp_sp_storages[__i].reset(__p_base_diagonals_sp_storage);
+                    new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params_this.base_diag_count);
+                __temp_sp_storages.emplace_back(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_storage));
 
                 // Calculation of split-points on each base diagonal
                 __event_chain =
                     eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf,
-                                                 __comp, __nd_range_params, *__p_base_diagonals_sp_storage);
+                                                 __comp, __nd_range_params_this, *__p_base_diagonals_sp_storage);
 
                 // Process parallel merge with usage of split-points on base diagonals
                 __event_chain = run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf,
-                                                   __comp, __nd_range_params, *__p_base_diagonals_sp_storage);
+                                                   __comp, __nd_range_params_this, *__p_base_diagonals_sp_storage);
             }
 
             __n_sorted *= 2;