From 25de131497658c2d0815b89d1013b64d3952ddb5 Mon Sep 17 00:00:00 2001 From: "Zhai, Xuejun" Date: Thu, 17 Oct 2024 14:40:45 +0800 Subject: [PATCH] Enable all reduce P2P add through iteration & fix mem leak & refacter the mem using to decrease the GPU mem using Signed-off-by: Zhai, Xuejun --- .../intel_gpu/plugin/sub_memory_manager.hpp | 12 + .../src/graph/impls/ocl/sync_tensor.cpp | 729 ++++++++++++++--- .../transformations/fc_all_reduce_test.cpp | 732 +++++++++++++++++- 3 files changed, 1331 insertions(+), 142 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/sub_memory_manager.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/sub_memory_manager.hpp index b02107ca880f99..19567ee766cb96 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/sub_memory_manager.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/sub_memory_manager.hpp @@ -20,9 +20,15 @@ class SubMemoryManager { void* send_buf; std::shared_ptr buf; bool flag; + int32_t last_rec_part; bool last_used; std::shared_ptr stream_ptr; std::vector recv_bufs; + std::vector remote_mems; + std::vector remote_mems_p2p; + std::vector recv_flag; + std::vector recv_flag_concat; + std::vector add_flag; std::vector events; cldnn::memory::ptr output; cldnn::layout layout; @@ -33,9 +39,15 @@ class SubMemoryManager { _num_sub_streams = num_sub_streams; MemoryInfo memory_info; memory_info.flag = false; + memory_info.last_rec_part = -1; memory_info.last_used = false; memory_info.layout = cldnn::layout(); memory_info.recv_bufs.assign(_num_sub_streams, nullptr); + memory_info.remote_mems.assign(_num_sub_streams, nullptr); + memory_info.remote_mems_p2p.assign(_num_sub_streams, nullptr); + memory_info.recv_flag.assign(_num_sub_streams, false); + memory_info.recv_flag_concat.assign(_num_sub_streams, false); + memory_info.add_flag.assign(_num_sub_streams, false); memory_info.events.assign(_num_sub_streams, nullptr); std::vector memorys; memorys.assign(_num_sub_streams, memory_info); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/sync_tensor.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/sync_tensor.cpp index 3b992c6e8f87a4..df07b184a3627c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/sync_tensor.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/sync_tensor.cpp @@ -5,7 +5,7 @@ #define CL_VERSION_3_0 1 #include #include - +#include #include #include #include @@ -125,7 +125,7 @@ static void perf_dump_done(const std::chrono::_V2::system_clock::time_point& sta class gpu_semaphore { public: - gpu_semaphore(int count = 2) : count_(count), total_(count) {} + gpu_semaphore(int count = 1) : count_(count), total_(count) {} void signal() { std::unique_lock lock(mutex_); if (count_ < total_) @@ -406,6 +406,23 @@ class gpu_p2p_helper { } }; +static void dump_cl_buf(cl_command_queue queue, cl_mem clbuf, size_t count, size_t offset) { + cl_int err; + std::vector outBuf(count, 0); + err = clEnqueueReadBuffer(queue, clbuf, CL_TRUE, offset, count * 4, outBuf.data(), 0, NULL, NULL); + CHECK_OCL_ERROR(err, "clEnqueueReadBuffer failed"); + clFinish(queue); + + // std::cout << "The first " << count << "elements in cl_mem = " << clbuf << " are: " << std::endl; + for (int i = 0; i < static_cast(count); i++) { + // printf("%f, ", outBuf[i]); + std::cout << outBuf[i] << ", "; + if (i && i % 16 == 0) + std::cout << std::endl; + } + std::cout << std::endl; +} + class simple_tensor_add { public: simple_tensor_add() {} @@ -507,6 +524,43 @@ class simple_tensor_add { return kernels[type]; } + cl_kernel 
get_or_create_kernel_if_possible_sub(cldnn::stream& stream, kernel_data_type type, size_t offset) { + std::lock_guard lock(mutex); + auto it = kernels.find(type); + if (it != kernels.end()) { + // std::cout << "get_kernel: type = " << static_cast(type) << std::endl; + return it->second; + } +#define ADD_OP_KERNEL_SOURCE_CODE(DATA_TYPE) \ + "kernel void tensor_add_kernel_" #DATA_TYPE "(const global " #DATA_TYPE " *src, global " #DATA_TYPE \ + " *dst, int offset) {" \ + "const int id = get_global_id(0);" \ + "const int idx = id + offset;" \ + "dst[idx] += src[id];" \ + "}" + if (type == kernel_data_type::e_type_fp16) { + const char tensor_add_kernel_fp16[] = ADD_OP_KERNEL_SOURCE_CODE(half); + const char kernel_name[] = "tensor_add_kernel_half"; + kernels[type] = create_kernel(stream, tensor_add_kernel_fp16, kernel_name); + return kernels[type]; + } else if (type == kernel_data_type::e_type_int8) { + const char tensor_add_kernel_int8[] = ADD_OP_KERNEL_SOURCE_CODE(char); + const char kernel_name[] = "tensor_add_kernel_char"; + kernels[type] = create_kernel(stream, tensor_add_kernel_int8, kernel_name); + return kernels[type]; + } else if (type == kernel_data_type::e_type_fp32) { + const char tensor_add_kernel_fp32[] = ADD_OP_KERNEL_SOURCE_CODE(float); + const char kernel_name[] = "tensor_add_kernel_float"; + kernels[type] = create_kernel(stream, tensor_add_kernel_fp32, kernel_name); + return kernels[type]; + } else { + std::cout << "error: unsupported adder kernel data type " << static_cast(type) << std::endl; + // OPENVINO_THROW("error: unsupported adder kernel data type ", static_cast(type)); + } +#undef ADD_OP_KERNEL_SOURCE_CODE + return kernels[type]; + } + event::ptr tensor_add(cldnn::stream& stream, cl_mem src, cl_mem dst, @@ -540,6 +594,45 @@ class simple_tensor_add { return ocl_stream.create_event(cl::Event(ret)); } + event::ptr tensor_add_sub(cldnn::stream& stream, + cl_mem src, + cl_mem dst, + size_t element_count, + kernel_data_type data_type, + size_t offset) { + cl_int err; + auto& ocl_stream = downcast(stream); + if (src == nullptr || dst == nullptr) { + std::cout << "tensor_add: invalid arguments!" 
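// For reference, the fp16 instantiation of the ADD_OP_KERNEL_SOURCE_CODE macro above expands to
// the kernel below (whitespace added for readability). The only difference from the original
// tensor_add kernel is the extra "offset" argument, which lets a rank accumulate a received
// partial chunk into the middle of its local buffer instead of at element 0. The constant name
// here is illustrative, not part of the patch.
static const char expanded_tensor_add_kernel_half[] =
    "kernel void tensor_add_kernel_half(const global half *src, global half *dst, int offset) {"
    "    const int id = get_global_id(0);"
    "    const int idx = id + offset;"
    "    dst[idx] += src[id];"
    "}";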
<< std::endl; + } + OPENVINO_ASSERT(src != nullptr && dst != nullptr, "tensor_add: invalid arguments!"); + + const auto start = perf_dump_start(); + cl_kernel kernel = get_or_create_kernel_if_possible_sub(stream, + data_type, + static_cast(offset)); + perf_dump_done(start, std::string("get_or_create_kernel_if_possible"), false); + + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src); + CHECK_OCL_ERROR(err, "clSetKernelArg src failed"); + + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst); + CHECK_OCL_ERROR(err, "clSetKernelArg dst failed"); + + err = clSetKernelArg(kernel, 2, sizeof(int), &offset); + CHECK_OCL_ERROR(err, "clSetKernelArg dst failed"); + + size_t global_size[] = {element_count}; + auto queue = ocl_stream.get_cl_queue().get(); + cl_event ret; + err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, global_size, nullptr, 0, nullptr, &ret); + CHECK_OCL_ERROR(err, "clEnqueueNDRangeKernel failed"); + // clWaitForEvents(1, &ret); + + perf_dump_done(start, std::string("tensor add host time"), false); + return ocl_stream.create_event(cl::Event(ret)); + } + void finish(cldnn::stream& stream) { auto& ocl_stream = downcast(stream); auto queue = ocl_stream.get_cl_queue().get(); @@ -574,6 +667,21 @@ struct sync_tensor_impl : public typed_primitive_impl { sync_tensor_impl() : parent() {} + ~sync_tensor_impl() { + for (auto& mem : all_gather_remote_dst) { + if (mem) { + release_remote_mems(static_cast(mem)); + } + } + all_gather_remote_dst.clear(); + for (auto& mem : all_reduce_remote_dst) { + if (mem) { + release_remote_mems(static_cast(mem)); + } + } + all_reduce_remote_dst.clear(); + } + explicit sync_tensor_impl(const sync_tensor_node& outer) { set_node_params(outer); } @@ -596,6 +704,7 @@ struct sync_tensor_impl : public typed_primitive_impl { int id, size_t w_size, int32_t w_rank, + int all_reduce_solution = 0, bool validate = true) { // Wait for P2P transferred data are ready std::vector copy_list(w_size, 1); @@ -608,17 +717,13 @@ struct sync_tensor_impl : public typed_primitive_impl { if (idx != static_cast(w_rank) && copy_list[idx]) { auto& remote_ocl_stream = downcast(*sub_mem_mgr->_memorys_table[id][idx].stream_ptr); - auto event = sub_mem_mgr->_memorys_table[id][w_rank].events[idx]; + cldnn::event::ptr event = nullptr; + event = sub_mem_mgr->_memorys_table[id][w_rank].events[idx]; if (event) { event->wait(); remote_ocl_stream.finish(); copy_list[idx] = 0; - // std::lock_guard lock(sub_mem_mgr->_flagMutex); sub_mem_mgr->_memorys_table[id][w_rank].events[idx] = nullptr; - // MUST release remote cl_mem, but it will cause remote map failed. 
- // cl_mem remote_mem = - // static_cast(sub_mem_mgr->_memorys_table[id][idx].remote_mem[w_rank]); - // clReleaseMemObject(remote_mem); } } } @@ -657,14 +762,16 @@ struct sync_tensor_impl : public typed_primitive_impl { } } - void update_internal_buffer(sync_tensor_inst& instance, + bool update_internal_buffer(sync_tensor_inst& instance, std::vector& bufs, cldnn::layout& last_layout, cldnn::layout& layout, size_t w_size, - size_t w_rank) { + size_t w_rank, + int all_reduce_solution) { auto& engine = instance.get_network().get_engine(); size_t required_size = layout.bytes_count(); + bool allocated = false; if (0) { std::lock_guard lock(debug_mutex); std::cout << "Before update_internal_buffer: " << std::endl; @@ -672,8 +779,16 @@ struct sync_tensor_impl : public typed_primitive_impl { } auto need_realloc = [&](size_t idx) { - if (idx == w_rank) - return false; + if (all_reduce_solution == 2) { + if (idx == 1) + return true; + else + return false; + + } else { + if (idx == w_rank) + return false; + } if (bufs[idx] == nullptr || last_layout.bytes_count() == 0) return true; @@ -692,16 +807,42 @@ struct sync_tensor_impl : public typed_primitive_impl { if (!need_realloc(i)) { continue; } - size_t origin_size = bufs[i] != nullptr ? bufs[i]->size() : 0; + // size_t origin_size = bufs[i] != nullptr ? bufs[i]->size() : 0; bufs[i] = nullptr; - bufs[i] = engine.allocate_memory(layout, cldnn::allocation_type::cl_mem, false); - if (debug_enable) { + if (all_reduce_solution == 2) { + auto width = layout.get_shape()[-1]; + auto sub_width = width / w_size; + if (sub_width * w_size != width) + std::cout << "[Warning] the shape of FC output has ODD number!!!" << std::endl; + auto sub_layout = layout; + auto sub_shape = layout.get_shape(); + sub_shape[-1] = sub_width; + sub_layout.set_partial_shape(sub_shape); + bufs[i] = engine.allocate_memory(sub_layout, cldnn::allocation_type::cl_mem, false); + } else { + bufs[i] = engine.allocate_memory(layout, cldnn::allocation_type::cl_mem, false); + } + allocated = true; + } + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "tensor_sync allocate: rank[" << w_rank << "]: layout[" << 1 + << "]=" << layout.to_short_string() << ", last_layout[" << 1 + << "]=" << last_layout.to_short_string() << std::endl; + } + return allocated; + } + + void release_remote_mems(cl_mem remote_mems) { + if (remote_mems) { + size_t data_size = 0; + auto _cl_mem = static_cast(remote_mems); + if (0) { + auto error = clGetMemObjectInfo(_cl_mem, CL_MEM_SIZE, sizeof(size_t), &data_size, NULL); std::lock_guard lock(debug_mutex); - std::cout << "tensor_sync allocate: rank[" << w_rank << "]: layout[" << i - << "]=" << layout.to_short_string() << ", required_size = " << required_size - << ", current_size = " << origin_size << ", to_allocate_size = " << bufs[i]->size() - << std::endl; + std::cout << "clReleaseMemObject...size = " << data_size << ", error: " << error << std::endl; } + clReleaseMemObject(_cl_mem); } } @@ -713,18 +854,28 @@ struct sync_tensor_impl : public typed_primitive_impl { auto w_rank = instance.get_network().get_program()->get_config().subStreamExecConfig.get_rank()[0]; auto w_size = instance.get_network().get_program()->get_config().get_context_for_tp().size(); auto is_all_reduce = instance.get_impl_params()->need_add == true; - auto start = perf_dump_start(); - if (!pass_through_events) { - for (auto e : events) { - e->wait(); - } + if (!is_all_reduce && all_gather_remote_dst.size() == 0) { + all_gather_remote_dst.assign(w_size, nullptr); } + if (is_all_reduce && 
all_reduce_remote_dst.size() == 0) { + all_reduce_remote_dst.assign(w_size, nullptr); + } + + int all_reduce_solution = 2; + if (is_all_reduce) { + const char* all_reduce_add_solution = getenv("OV_TP_ALLREDUCE_ADD_solution"); + if (all_reduce_add_solution) + all_reduce_solution = std::atoi(all_reduce_add_solution); + } + + auto start = perf_dump_start(); perf_dump_done(start, std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor wait events"), true); auto sub_mem_mgr = instance.get_network().get_sub_mem_mgr(); - auto id = sub_mem_mgr->get_memory_id(w_rank); + auto id = 0; + // auto id = sub_mem_mgr->get_memory_id(w_rank); sub_mem_mgr->set_memory_used(id, w_rank); auto start_1 = perf_dump_start(); while (true) { @@ -733,8 +884,12 @@ struct sync_tensor_impl : public typed_primitive_impl { sub_mem_mgr->_use_count[id] = 0; for (size_t i = 0; i < w_size; i++) { sub_mem_mgr->_memorys_table[id][i].flag = false; - for (size_t j = 0; j < w_size; j++) + for (size_t j = 0; j < w_size; j++) { sub_mem_mgr->_memorys_table[id][i].events[j] = nullptr; + sub_mem_mgr->_memorys_table[id][i].recv_flag[j] = false; + sub_mem_mgr->_memorys_table[id][i].recv_flag_concat[j] = false; + sub_mem_mgr->_memorys_table[id][i].add_flag[j] = false; + } } } if (sub_mem_mgr->_use_count[id] == 0) { @@ -755,18 +910,23 @@ struct sync_tensor_impl : public typed_primitive_impl { auto& ocl_stream = downcast(stream); auto local_context = ocl_stream.get_engine().get_cl_context().get(); auto p2p_src_layout = instance.get_output_layout(0); + bool need_update_remote_mems = false; if (is_all_reduce) { OPENVINO_ASSERT(1 == instance.get_output_memorys().size(), "All reduce only has one output!"); - sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank] = instance.get_output_memorys()[0]; + if (all_reduce_solution == 2) + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[0] = instance.get_output_memorys()[0]; + else + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank] = instance.get_output_memorys()[0]; sub_mem_mgr->_memorys_table[id][w_rank].output = instance.get_output_memorys()[0]; // Allocate or reuse buffer for P2P target, same shape with output[0] p2p_src_layout = instance.get_output_layout(0); - update_internal_buffer(instance, - sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs, - sub_mem_mgr->_memorys_table[id][w_rank].layout, - p2p_src_layout, - w_size, - w_rank); + need_update_remote_mems = update_internal_buffer(instance, + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs, + sub_mem_mgr->_memorys_table[id][w_rank].layout, + p2p_src_layout, + w_size, + w_rank, + all_reduce_solution); } else { OPENVINO_ASSERT(2 == instance.get_output_memorys().size(), "All gather need additional buffer for concat result!"); @@ -774,115 +934,414 @@ struct sync_tensor_impl : public typed_primitive_impl { sub_mem_mgr->_memorys_table[id][w_rank].output = instance.get_output_memorys()[0]; // All gather doesn't need intermediate buffer at all. 
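// A minimal sketch of how "solution 2" sizes the per-rank scratch buffer in
// update_internal_buffer() above: instead of a full copy of the FC output, each rank keeps only
// one chunk of the innermost dimension (width / w_size), which is where the GPU-memory saving of
// this change comes from. The helper names below are illustrative, not from the patch;
// OV_TP_ALLREDUCE_ADD_solution is the environment variable read earlier in execute_impl() to
// override the default solution (2).
#include <cstdlib>
#include <vector>

// Shape of the scratch chunk one rank allocates for the iterative (ring) all-reduce.
static std::vector<size_t> sub_chunk_shape(std::vector<size_t> shape, size_t w_size) {
    const size_t width = shape.back();   // innermost (tensor-parallel split) dimension
    shape.back() = width / w_size;       // one chunk per rank; the patch warns if not divisible
    return shape;
}

static int selected_all_reduce_solution() {
    int solution = 2;                    // default used by the patch
    if (const char* env = std::getenv("OV_TP_ALLREDUCE_ADD_solution"))
        solution = std::atoi(env);
    return solution;
}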
p2p_src_layout = instance.get_output_layout(1); + auto tmp = + std::dynamic_pointer_cast(instance.get_output_memorys()[0])->get_buffer().get(); + if (tmp != all_gather_current_dst) { + need_update_remote_mems = true; + all_gather_current_dst = tmp; + } + } + if (all_reduce_solution == 2) { + sub_mem_mgr->_memorys_table[id][w_rank].recv_flag[0] = true; + sub_mem_mgr->_memorys_table[id][w_rank].recv_flag_concat[0] = true; + } else { + sub_mem_mgr->_memorys_table[id][w_rank].flag = true; } - sub_mem_mgr->_memorys_table[id][w_rank].flag = true; - std::vector wait_list(w_size, 1); - auto start_2 = perf_dump_start(); - wait_list[w_rank] = 0; // no need to wait for itself - size_t data_size = 0; - event::ptr sync_event = nullptr; - auto src_p2p_buf = - std::dynamic_pointer_cast(sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank]); - auto src_cl_buf = src_p2p_buf->get_buffer().get(); - while (true) { - int wait_size = 0; - for (int idx = 0; idx < static_cast(w_size); idx++) { - if (idx != w_rank && wait_list[idx] > 0 && sub_mem_mgr->_memorys_table[id][idx].flag) { - cldnn::memory::ptr dst_mem = nullptr; - if (is_all_reduce) { - dst_mem = sub_mem_mgr->_memorys_table[id][idx].recv_bufs[w_rank]; - } else { - dst_mem = sub_mem_mgr->_memorys_table[id][idx].output; + // The mapped remote cl_mem will hold the original cl_mem, it should be released if the original cl_mem has been + // released, else it will cause gpu memory leak. + if (need_update_remote_mems) { + if (debug_enable) { + std::cout << "release_remote_mems: old_layout = " + << sub_mem_mgr->_memorys_table[id][w_rank].layout.to_short_string() + << ", new_layout = " << p2p_src_layout.to_short_string() << std::endl; + } + } + + std::vector sync_events; + if (is_all_reduce && all_reduce_solution == 2) { + while (true) { + size_t wait_all_ouput_ready = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (sub_mem_mgr->_memorys_table[id][idx].recv_flag[0] == true) + wait_all_ouput_ready++; + } + if (wait_all_ouput_ready == w_size) { + break; + } + } + auto split_parts = [](int len, int n) { + int average = len / n; + std::vector parts(n, average); + parts.back() = len - average * (n - 1); + return parts; + }; + + auto output_layout = instance.get_output_layout(0); + ov::element::Type output_element_type = output_layout.data_type; + auto output_element_size = output_element_type.size(); + auto output_shape = output_layout.get_shape(); + auto sub_out_dim_vec = split_parts(output_shape[-1], w_size); + if (0) { + for (size_t iter = 0; iter < w_size; iter++) { + std::cout << "sub_out_dim_vec[" << iter << "] " << sub_out_dim_vec[iter] << std::endl; + } + } + auto output_height = ov::shape_size(output_shape) / output_shape[-1]; + // Prepare CL memory mapping for P2P copying next + { + auto dst_idx = (w_rank + 1) % w_size; + cl_mem dst_cl_buf = nullptr; + cldnn::memory::ptr dst_mem = sub_mem_mgr->_memorys_table[id][dst_idx].recv_bufs[1]; + size_t data_size = dst_mem->size(); + auto dst_cl_buf_remote = std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); + dst_cl_buf = static_cast(sub_mem_mgr->_memorys_table[id][w_rank].remote_mems_p2p[0]); + if (need_update_remote_mems || dst_cl_buf == nullptr) { + if (dst_cl_buf) { + release_remote_mems(dst_cl_buf); } - auto dst_cl_buf_remote = - std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); - - data_size = dst_mem->size(); - auto dst_cl_buf = gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); - auto p2p_data_size = p2p_src_layout.bytes_count(); - { - 
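// The recv_flag / recv_flag_concat / add_flag vectors added to SubMemoryManager act as simple
// per-iteration barriers: each rank sets its own slot for the current step, then spins until
// every rank has set it, exactly like the "wait_all_ouput_ready" loop above. A minimal
// single-process sketch of that pattern, using std::atomic for the illustration (the patch
// itself polls plain bools stored in the shared memory table):
#include <atomic>
#include <cstddef>
#include <vector>

struct step_barrier {                                    // illustrative name, not from the patch
    explicit step_barrier(std::size_t world) : flags(world) {
        for (auto& f : flags)
            f.store(false, std::memory_order_relaxed);
    }
    void arrive(std::size_t rank) { flags[rank].store(true, std::memory_order_release); }
    void wait_all() const {
        std::size_t ready = 0;
        while (ready != flags.size()) {                  // busy-wait until every rank has arrived
            ready = 0;
            for (const auto& f : flags)
                ready += f.load(std::memory_order_acquire) ? 1 : 0;
        }
    }
    std::vector<std::atomic<bool>> flags;
};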
gpu_lock.acquire(); - if (is_all_reduce) { - gpu_p2p_instance.remote_copy(stream, src_cl_buf, dst_cl_buf, p2p_data_size); - } else { - gpu_p2p_instance.remote_copy_rect(stream, - src_cl_buf, - instance.get_output_layout(1), - dst_cl_buf, - instance.get_output_layout(0), - w_rank, - false); - } - gpu_lock.signal(); + dst_cl_buf = gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); + sub_mem_mgr->_memorys_table[id][w_rank].remote_mems_p2p[0] = dst_cl_buf; + } + } + // Start loop for sub buff copy & add + for (int32_t i = 0; i < static_cast(w_size) - 1; i++) { + auto src_mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[0]); + auto src_buf = src_mem->get_buffer().get(); + + int32_t sub_part = (w_rank - i) >= 0 ? (w_rank - i) : ((w_rank - i) + w_size); + int32_t rec_sub_part = (sub_part - 1) >= 0 ? (sub_part - 1) : ((sub_part - 1) + w_size); + auto dst_idx = (w_rank + 1) % w_size; + + auto dst_cl_buf = static_cast(sub_mem_mgr->_memorys_table[id][w_rank].remote_mems_p2p[0]); + + auto& ocl_stream = downcast(stream); + auto queue = ocl_stream.get_cl_queue().get(); + int32_t off_set = 0; + for (int32_t j = 0; j < sub_part; j++) { + off_set = off_set + sub_out_dim_vec[j]; + } + size_t src_rec[3] = {off_set * output_element_size * output_height, 0, 0}; + size_t dst_rec[3] = {0, 0, 0}; + cl_event event; + size_t rect[3] = {sub_out_dim_vec[sub_part] * output_element_size, output_height, 1}; + auto ret = clEnqueueCopyBufferRect(queue, + src_buf, + dst_cl_buf, + src_rec, + dst_rec, + rect, + 0, + 0, + 0, + 0, + 0, + nullptr, + &event); + if (ret != CL_SUCCESS) { + std::cout << "0.clEnqueueCopyBufferRect failed: " << oclErrorCode[ret] << ", idx = " << i + << std::endl; + OPENVINO_THROW("0.clEnqueueCopyBufferRect failed: ", oclErrorCode[ret], ", idx = ", i); + } + ret = clWaitForEvents(1, &event); + CHECK_OCL_ERROR(ret, "clWaitForEvents failed"); + clReleaseEvent(event); + + sub_mem_mgr->_memorys_table[id][dst_idx].recv_flag[i + 1] = true; + + while (true) { + size_t wait_all_ouput_ready = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (sub_mem_mgr->_memorys_table[id][idx].recv_flag[i + 1] == true) + wait_all_ouput_ready++; } - // P2P has been done. 
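// The "sub buff copy & add" loop that starts above implements a ring reduce-scatter: at
// iteration i, rank r writes chunk (r - i) mod w_size of its running sum into its right-hand
// neighbour's scratch buffer, then accumulates the chunk it just received, (r - i - 1) mod
// w_size, into its own partial output. The standalone program below only reproduces that index
// arithmetic so the schedule can be checked on the host; w_size = 4 and the 10-column output are
// made-up values, not from the patch.
#include <cstdio>
#include <vector>

static std::vector<int> split_parts(int len, int n) {    // same splitting rule as the patch
    int average = len / n;
    std::vector<int> parts(n, average);
    parts.back() = len - average * (n - 1);
    return parts;
}

int main() {
    const int w_size = 4;
    const auto chunks = split_parts(10, w_size);         // chunk widths {2, 2, 2, 4}
    for (int i = 0; i < w_size - 1; i++) {
        for (int rank = 0; rank < w_size; rank++) {
            const int send_part = ((rank - i) % w_size + w_size) % w_size;
            const int recv_part = (send_part - 1 + w_size) % w_size;
            const int dst_rank = (rank + 1) % w_size;
            std::printf("iter %d: rank %d sends chunk %d (width %d) to rank %d, adds chunk %d\n",
                        i, rank, send_part, chunks[send_part], dst_rank, recv_part);
        }
    }
    return 0;
}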
- { - std::lock_guard lock(sub_mem_mgr->_flagMutex); - sub_mem_mgr->_memorys_table[id][idx].events[w_rank] = stream.create_user_event(true); + if (wait_all_ouput_ready == w_size) + break; + } + + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "[After Copy Rank] " << w_rank << ", [iter] " << i << std::endl; + for (int idx = 0; idx < static_cast(w_size); idx++) { + auto mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]); + auto buf = mem->get_buffer().get(); + dump_cl_buf(ocl_stream.get_cl_queue().get(), buf, mem->count(), 0); + } + } + + auto dst_mem_add = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[0]); + auto dst_cl_buf_add = dst_mem_add->get_buffer().get(); + auto& adder_instance = get_adder_instance(w_rank); + + auto src_mem_add = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[1]; + auto src_cl_buf_add = std::dynamic_pointer_cast(src_mem_add)->get_buffer().get(); + + sub_mem_mgr->_memorys_table[id][w_rank].last_rec_part = rec_sub_part; + int32_t off_set_add = 0; + for (int32_t j = 0; j < rec_sub_part; j++) + off_set_add = off_set_add + sub_out_dim_vec[j]; + + cldnn::event::ptr sync_add_event; + auto start_add = perf_dump_start(); + sync_add_event = adder_instance.tensor_add_sub( + stream, + src_cl_buf_add, + dst_cl_buf_add, + output_height * sub_out_dim_vec[rec_sub_part], + adder_instance.element_type_to_kernel_data_type(dst_mem_add->get_layout().data_type), + output_height * off_set_add); + sync_add_event->wait(); + auto end_add = perf_dump_start(); + std::chrono::duration duration = end_add - start_add; + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "Add Solution[" << all_reduce_solution << "] " << "Rank[" << w_rank + << "] sync tensor p2p add total cost: " << duration.count() << " ms" << std::endl; + } + sub_mem_mgr->_memorys_table[id][w_rank].add_flag[i + 1] = true; + + while (true) { + size_t wait_all_ouput_ready = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (sub_mem_mgr->_memorys_table[id][idx].add_flag[i + 1] == true) + wait_all_ouput_ready++; + } + if (wait_all_ouput_ready == w_size) + break; + } + + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "[After Add Rank] " << w_rank << std::endl; + for (int idx = 0; idx < 1; idx++) { + auto mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]); + auto buf = mem->get_buffer().get(); + dump_cl_buf(ocl_stream.get_cl_queue().get(), buf, mem->count(), 0); } - // gpu_p2p_instance.destory_remote_mem(dst_cl_buf); - wait_list[idx] = 0; } - wait_size += wait_list[idx]; - } - if (wait_size == 0) { - break; } - auto end_2 = perf_dump_start(); - std::chrono::duration duration = end_2 - start_2; - if (duration.count() > 10000) { - start_2 = perf_dump_start(); - std::cout << "rank[" << w_rank << "]Error: sync_tensor p2p write timeout..." 
<< std::endl; + + // Prepare CL memory mapping for P2P copying instand of concat + { + auto dst_idx = (w_rank + 1) % w_size; + cl_mem dst_cl_buf = nullptr; + cldnn::memory::ptr dst_mem = sub_mem_mgr->_memorys_table[id][dst_idx].output; + size_t data_size = dst_mem->size(); + auto dst_cl_buf_remote = std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); + dst_cl_buf = static_cast(all_reduce_remote_dst[dst_idx]); + if (need_update_remote_mems || dst_cl_buf == nullptr) { + dst_cl_buf = gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); + all_reduce_remote_dst[dst_idx] = dst_cl_buf; + } } - } + // Start loop for sub buff copy (concat) + for (int32_t i = 0; i < static_cast(w_size) - 1; i++) { + auto src_mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[0]); + auto src_buf = src_mem->get_buffer().get(); + + auto dst_idx = (w_rank + 1) % w_size; + cl_mem dst_cl_buf = nullptr; + dst_cl_buf = static_cast(all_reduce_remote_dst[dst_idx]); + + auto sub_part = sub_mem_mgr->_memorys_table[id][w_rank].last_rec_part; + auto& ocl_stream = downcast(stream); + auto queue = ocl_stream.get_cl_queue().get(); + int32_t off_set = 0; + for (int32_t j = 0; j < sub_part; j++) { + off_set = off_set + sub_out_dim_vec[j]; + } + size_t src_rec[3] = {off_set * output_element_size * output_height, 0, 0}; + size_t dst_rec[3] = {off_set * output_element_size * output_height, 0, 0}; + cl_event event; + size_t rect[3] = {sub_out_dim_vec[sub_part] * output_element_size, output_height, 1}; + auto ret = clEnqueueCopyBufferRect(queue, + src_buf, + dst_cl_buf, + src_rec, + dst_rec, + rect, + 0, + 0, + 0, + 0, + 0, + nullptr, + &event); + if (ret != CL_SUCCESS) { + std::cout << "0.clEnqueueCopyBufferRect failed: " << oclErrorCode[ret] << ", idx = " << i + << std::endl; + OPENVINO_THROW("0.clEnqueueCopyBufferRect failed: ", oclErrorCode[ret], ", idx = ", i); + } + ret = clWaitForEvents(1, &event); + CHECK_OCL_ERROR(ret, "clWaitForEvents failed"); + clReleaseEvent(event); + sub_mem_mgr->_memorys_table[id][dst_idx].last_rec_part = sub_part; + + sub_mem_mgr->_memorys_table[id][dst_idx].recv_flag_concat[i + 1] = true; + + while (true) { + size_t wait_all_ouput_ready = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (sub_mem_mgr->_memorys_table[id][idx].recv_flag_concat[i + 1] == true) + wait_all_ouput_ready++; + } + if (wait_all_ouput_ready == w_size) + break; + } - auto str_need_add = instance.get_impl_params()->need_add ? 
std::string("[need_add]") : std::string(""); - perf_dump_done(start_2, - std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor p2p write ") + - std::to_string(data_size) + " bytes" + str_need_add, - true); + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << std::endl; + std::cout << "[Concat Rank] " << w_rank << " [iter] " << i << std::endl; + for (int idx = 0; idx < 1; idx++) { + auto mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]); + auto buf = mem->get_buffer().get(); + dump_cl_buf(ocl_stream.get_cl_queue().get(), buf, mem->count(), 0); + } + } + } + } else { + std::vector wait_list(w_size, 1); + auto start_2 = perf_dump_start(); + wait_list[w_rank] = 0; // no need to wait for itself + size_t data_size = 0; + event::ptr sync_event = nullptr; + auto src_p2p_buf = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank]); + auto src_cl_buf = src_p2p_buf->get_buffer().get(); + + while (true) { + int wait_size = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (idx != w_rank && wait_list[idx] > 0 && sub_mem_mgr->_memorys_table[id][idx].flag) { + cl_mem dst_cl_buf = nullptr; + if (is_all_reduce) { + cldnn::memory::ptr dst_mem = sub_mem_mgr->_memorys_table[id][idx].recv_bufs[w_rank]; + data_size = dst_mem->size(); + auto dst_cl_buf_remote = + std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); + dst_cl_buf = static_cast(sub_mem_mgr->_memorys_table[id][w_rank].remote_mems[idx]); + if (need_update_remote_mems || dst_cl_buf == nullptr) { + if (dst_cl_buf) { + release_remote_mems(dst_cl_buf); + } + dst_cl_buf = + gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); + sub_mem_mgr->_memorys_table[id][w_rank].remote_mems[idx] = dst_cl_buf; + } + } else { + cldnn::memory::ptr dst_mem = sub_mem_mgr->_memorys_table[id][idx].output; + data_size = dst_mem->size(); + auto dst_cl_buf_remote = + std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); + dst_cl_buf = static_cast(all_gather_remote_dst[idx]); + if (need_update_remote_mems || dst_cl_buf == nullptr) { + if (dst_cl_buf) { + release_remote_mems(dst_cl_buf); + } + dst_cl_buf = + gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); + all_gather_remote_dst[idx] = dst_cl_buf; + } + } + auto p2p_data_size = p2p_src_layout.bytes_count(); + { + gpu_lock.acquire(); + if (is_all_reduce) { + gpu_p2p_instance.remote_copy(stream, src_cl_buf, dst_cl_buf, p2p_data_size); + } else { + gpu_p2p_instance.remote_copy_rect(stream, + src_cl_buf, + instance.get_output_layout(1), + dst_cl_buf, + instance.get_output_layout(0), + w_rank, + false); + } + gpu_lock.signal(); + } + // P2P has been done. + { + std::lock_guard lock(sub_mem_mgr->_flagMutex); + sub_mem_mgr->_memorys_table[id][idx].events[w_rank] = stream.create_user_event(true); + } + // gpu_p2p_instance.destory_remote_mem(dst_cl_buf); + wait_list[idx] = 0; + } + wait_size += wait_list[idx]; + } + if (wait_size == 0) { + break; + } + auto end_2 = perf_dump_start(); + std::chrono::duration duration = end_2 - start_2; + if (duration.count() > 10000) { + start_2 = perf_dump_start(); + std::cout << "rank[" << w_rank << "]Error: sync_tensor p2p write timeout..." << std::endl; + } + } - // P2P adopts sync write to avoid the problem of event cannot work across contexts - wait_p2p_done(stream, gpu_p2p_instance, sub_mem_mgr, id, w_size, w_rank, false); + auto str_need_add = instance.get_impl_params()->need_add ? 
std::string("[need_add]") : std::string(""); + perf_dump_done(start_2, + std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor p2p write ") + + std::to_string(data_size) + " bytes" + str_need_add, + true); - std::vector sync_events; - if (is_all_reduce) { - // All_reduce path - auto start_3 = perf_dump_start(); - auto dst_mem = std::dynamic_pointer_cast(instance.output_memory_ptr(0)); - auto dst_cl_buf = dst_mem->get_buffer().get(); - auto& adder_instance = get_adder_instance(w_rank); - // auto data_size = dst_mem->size(); - for (size_t idx = 0; idx < w_size; idx++) { - if (idx != static_cast(w_rank)) { - auto src_mem = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]; - auto src_cl_buf = std::dynamic_pointer_cast(src_mem)->get_buffer().get(); - sync_event = adder_instance.tensor_add( - stream, - src_cl_buf, - dst_cl_buf, - dst_mem->count(), - adder_instance.element_type_to_kernel_data_type(dst_mem->get_layout().data_type)); - sync_events.emplace_back(sync_event); + // P2P adopts sync write to avoid the problem of event cannot work across contexts + wait_p2p_done(stream, gpu_p2p_instance, sub_mem_mgr, id, w_size, w_rank, all_reduce_solution, false); + + if (is_all_reduce) { + // All_reduce path + auto start_3 = perf_dump_start(); + auto dst_mem = std::dynamic_pointer_cast(instance.output_memory_ptr(0)); + auto dst_cl_buf = dst_mem->get_buffer().get(); + auto& adder_instance = get_adder_instance(w_rank); + // auto data_size = dst_mem->size(); + for (size_t idx = 0; idx < w_size; idx++) { + if (idx != static_cast(w_rank)) { + auto src_mem = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]; + auto src_cl_buf = std::dynamic_pointer_cast(src_mem)->get_buffer().get(); + sync_event = adder_instance.tensor_add( + stream, + src_cl_buf, + dst_cl_buf, + dst_mem->count(), + adder_instance.element_type_to_kernel_data_type(dst_mem->get_layout().data_type)); + // sync_event->wait(); + sync_events.emplace_back(sync_event); + } + } + const auto end_add = std::chrono::high_resolution_clock::now(); + const std::chrono::duration elapsed_1 = end_add - start_3; + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "Add Solution[" << all_reduce_solution << "] " << "Rank[" << w_rank + << "] sync tensor p2p add total cost: " << elapsed_1.count() << " ms" << std::endl; } + perf_dump_done( + start_3, + std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor allreduce add"), + true); + } else { + auto src_mem = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank]; + auto src_cl_buf = std::dynamic_pointer_cast(src_mem)->get_buffer().get(); + auto dst_mem = std::dynamic_pointer_cast(instance.output_memory_ptr(0)); + auto dst_cl_buf = dst_mem->get_buffer().get(); + auto sync_event = gpu_p2p_instance.remote_copy_rect(stream, + src_cl_buf, + instance.get_output_layout(1), + dst_cl_buf, + instance.get_output_layout(0), + w_rank, + false); + sync_events.emplace_back(sync_event); } - perf_dump_done(start_3, - std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor allreduce add"), - true); - } else { - auto src_mem = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank]; - auto src_cl_buf = std::dynamic_pointer_cast(src_mem)->get_buffer().get(); - auto dst_mem = std::dynamic_pointer_cast(instance.output_memory_ptr(0)); - auto dst_cl_buf = dst_mem->get_buffer().get(); - auto sync_event = gpu_p2p_instance.remote_copy_rect(stream, - src_cl_buf, - instance.get_output_layout(1), - dst_cl_buf, - instance.get_output_layout(0), - w_rank, - 
false); - sync_events.emplace_back(sync_event); } if (pass_through_events) { @@ -894,6 +1353,14 @@ struct sync_tensor_impl : public typed_primitive_impl { } perf_dump_done(start, std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor total"), true); + const auto end_xj = std::chrono::high_resolution_clock::now(); + const std::chrono::duration elapsed_1 = end_xj - start; + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "Solution[" << all_reduce_solution << "] " << "Rank[" << w_rank + << "] sync tensor p2p add total cost: " << elapsed_1.count() << " ms" << std::endl; + } + // This block MUST be put exactly at the end of this method. { std::lock_guard lock(sub_mem_mgr->_flagMutex); @@ -909,6 +1376,10 @@ struct sync_tensor_impl : public typed_primitive_impl { static std::unique_ptr create(const sync_tensor_node& arg, const kernel_impl_params& impl_param) { return make_unique(); } + + std::vector all_gather_remote_dst; + std::vector all_reduce_remote_dst; + cl_mem all_gather_current_dst; }; namespace detail { diff --git a/src/plugins/intel_gpu/tests/unit/transformations/fc_all_reduce_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/fc_all_reduce_test.cpp index e1998646aaf2e7..631aae306c3c92 100644 --- a/src/plugins/intel_gpu/tests/unit/transformations/fc_all_reduce_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/transformations/fc_all_reduce_test.cpp @@ -40,7 +40,7 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput11) { unsigned long test_size = 2; auto input1 = std::make_shared(ov::element::f32, ov::Shape{test_size, test_size}); std::vector weights(test_size * test_size, 2); - auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{test_size, test_size}, {1, 2, 3, 4}); + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{test_size, test_size}, {5, 6, 7, 8}); std::cout << "\n" << "weights: "; for (size_t i = 0; i < input2->get_vector().size(); i++) { std::cout << input2->get_vector()[i] << ", "; @@ -55,16 +55,21 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput11) { // -------- Loading a model to the device -------- ov::Core core; - ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); // -------- Create an infer request -------- ov::InferRequest infer_request = compiled_model.create_infer_request(); // -------- Prepare input -------- - auto input_generate = ov::test::utils::InputGenerateData(0, 5); - auto tensor = ov::test::utils::create_and_fill_tensor(infer_request.get_input_tensor().get_element_type(), - infer_request.get_input_tensor().get_shape(), - input_generate); + std::vector input_data = {1, 2, 3, 4}; + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + // auto input_generate = ov::test::utils::InputGenerateData(0, 5); + // auto tensor = ov::test::utils::create_and_fill_tensor(infer_request.get_input_tensor().get_element_type(), + // infer_request.get_input_tensor().get_shape(), + // input_generate); std::cout << "\n" << "input_tensor: "; for (size_t i = 0; i < tensor.get_size(); i++) { std::cout << tensor.data()[i] << ", "; @@ -75,9 +80,71 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput11) { // -------- Do inference synchronously -------- infer_request.infer(); - for (int iter = 0; 
iter < 2; iter++) { - infer_request.infer(); + // for (int iter = 0; iter < 2; iter++) { + // infer_request.infer(); + // } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + for (size_t i = 0; i < output_tensor.get_size(); i++) { + std::cout << output_tensor.data()[i] << ", "; + } + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput8) { + { + // -------- Construct model + unsigned long test_size = 4; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{test_size, test_size}); + std::vector weights(test_size * test_size, 2); + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{test_size, test_size}, {17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}); + std::cout << "\n" << "weights: "; + for (size_t i = 0; i < input2->get_vector().size(); i++) { + std::cout << input2->get_vector()[i] << ", "; + } + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::f32); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + //auto model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + std::vector input_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + // auto input_generate = ov::test::utils::InputGenerateData(0, 5); + // auto tensor = ov::test::utils::create_and_fill_tensor(infer_request.get_input_tensor().get_element_type(), + // infer_request.get_input_tensor().get_shape(), + // input_generate); + std::cout << "\n" << "input_tensor: "; + for (size_t i = 0; i < tensor.get_size(); i++) { + std::cout << tensor.data()[i] << ", "; } + std::cout << std::endl; + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + infer_request.infer(); + // for (int iter = 0; iter < 2; iter++) { + // infer_request.infer(); + // } // -------- Process output auto output_tensor = infer_request.get_output_tensor(); @@ -141,7 +208,9 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput16) { // -------- Loading a model to the device -------- ov::Core core; - ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); +// ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); // -------- Create an infer request -------- ov::InferRequest infer_request = compiled_model.create_infer_request(); @@ -160,9 +229,9 
@@ TEST(TransformationTestsF1, FullyConnectedSplitInput16) { // -------- Do inference synchronously -------- infer_request.infer(); - for (int iter = 0; iter < 2; iter++) { - infer_request.infer(); - } + // for (int iter = 0; iter < 2; iter++) { + // infer_request.infer(); + // } // -------- Process output auto output_tensor = infer_request.get_output_tensor(); @@ -218,7 +287,644 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput1024) { // -------- Do inference synchronously -------- infer_request.infer(); - for (int iter = 0; iter < 100; iter++) { + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_64) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 64; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_128) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 128; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, 
ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_256) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 256; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_512) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 512; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = 
ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_1024) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 1024; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << 
output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_2048) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 2048; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_4096) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 4096; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + 
infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_8192) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 8192; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 1000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_16384) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 16384; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", 
+        ov::CompiledModel compiled_model =
+            core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3"));
+
+        // -------- Create an infer request --------
+        ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+        // -------- Prepare input --------
+        auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(),
+                                 infer_request.get_input_tensor().get_shape(),
+                                 input_data.data());
+        infer_request.set_input_tensor(tensor);
+
+        // -------- Do inference synchronously --------
+        for (int iter = 0; iter < 200; iter++) {
+            infer_request.infer();
+        }
+
+        // -------- Process output --------
+        auto output_tensor = infer_request.get_output_tensor();
+        std::cout << "\n"
+                  << "output_tensor: " << output_tensor.get_shape() << std::endl;
+        std::cout << std::endl;
+    }
+}
+
+TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_32768) {
+    {
+        // -------- Construct model --------
+        unsigned long m, k, n;
+        m = k = n = 32768;
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{m, n});
+        std::vector<float> input_data(m * k, 1);
+        std::vector<float> weights(k * n, 2);
+        std::vector<float> result(m * n, 2);
+
+        auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data());
+
+        std::cout << "input_shape: " << m << " * " << k << std::endl;
+        std::cout << "weight_shape: " << k << " * " << n << std::endl;
+        std::cout << "output_shape: " << m << " * " << n << std::endl;
+        std::cout << std::endl;
+
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, input2, no_bias, ov::element::i8);
+        const auto relu = std::make_shared<ov::op::v0::Relu>(fc);
+        auto model = std::make_shared<ov::Model>(ov::NodeVector{relu}, ov::ParameterVector{input1});
+
+        // ov::serialize(model, "./model_fc.xml", "./model_fc.bin");
+
+        // -------- Loading a model to the device --------
+        ov::Core core;
+        ov::CompiledModel compiled_model =
+            core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3"));
+
+        // -------- Create an infer request --------
+        ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+        // -------- Prepare input --------
+        auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(),
+                                 infer_request.get_input_tensor().get_shape(),
+                                 input_data.data());
+        infer_request.set_input_tensor(tensor);
+
+        // -------- Do inference synchronously --------
+        for (int iter = 0; iter < 100; iter++) {
+            infer_request.infer();
+        }
+
+        // -------- Process output --------
+        auto output_tensor = infer_request.get_output_tensor();
+        std::cout << "\n"
+                  << "output_tensor: " << output_tensor.get_shape() << std::endl;
+        std::cout << std::endl;
+    }
+}
+
+TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_1) {
+    {
+        // -------- Construct model --------
+        unsigned long m, k, n;
+        m = k = 32768;
+        n = 65536;
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{m, m});
+        std::vector<float> input_data(m * m, 1);
+        std::vector<float> weights(k * n, 2);
+        // std::vector<float> result(m * n, 2);
+
+        auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{n, k}, weights.data());
+
+        std::cout << "input_shape: " << m << " * " << k << std::endl;
+        std::cout << "weight_shape: " << k << " * " << n << std::endl;
+        std::cout << "output_shape: " << m << " * " << n << std::endl;
+        std::cout << std::endl;
+
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, input2, no_bias, ov::element::i8);
+        const auto relu = std::make_shared<ov::op::v0::Relu>(fc);
+        auto model = std::make_shared<ov::Model>(ov::NodeVector{relu}, ov::ParameterVector{input1});
+
+        // ov::serialize(model, "./model_fc.xml", "./model_fc.bin");
+
+        // -------- Loading a model to the device --------
+        ov::Core core;
+        // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1"));
+        ov::CompiledModel compiled_model =
+            core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3"));
+
+        // -------- Create an infer request --------
+        ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+        // -------- Prepare input --------
+        auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(),
+                                 infer_request.get_input_tensor().get_shape(),
+                                 input_data.data());
+        infer_request.set_input_tensor(tensor);
+
+        // -------- Do inference synchronously --------
+        for (int iter = 0; iter < 2; iter++) {
+            infer_request.infer();
+        }
+
+        // -------- Process output --------
+        auto output_tensor = infer_request.get_output_tensor();
+        std::cout << "\n"
+                  << "output_tensor: " << output_tensor.get_shape() << std::endl;
+        std::cout << std::endl;
+    }
+}
+
+TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_2) {
+    {
+        // -------- Construct model --------
+        unsigned long m, k, n;
+        m = k = n = 65536;
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{m, n});
+        std::vector<float> input_data(m * k, 1);
+        std::vector<float> weights(k * n, 2);
+        // std::vector<float> result(m * n, 2);
+
+        auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data());
+
+        std::cout << "input_shape: " << m << " * " << k << std::endl;
+        std::cout << "weight_shape: " << k << " * " << n << std::endl;
+        std::cout << "output_shape: " << m << " * " << n << std::endl;
+        std::cout << std::endl;
+
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, input2, no_bias, ov::element::i8);
+        const auto relu = std::make_shared<ov::op::v0::Relu>(fc);
+        auto model = std::make_shared<ov::Model>(ov::NodeVector{relu}, ov::ParameterVector{input1});
+
+        // ov::serialize(model, "./model_fc.xml", "./model_fc.bin");
+
+        // -------- Loading a model to the device --------
+        ov::Core core;
+        ov::CompiledModel compiled_model =
+            core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3"));
+
+        // -------- Create an infer request --------
+        ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+        // -------- Prepare input --------
+        auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(),
+                                 infer_request.get_input_tensor().get_shape(),
+                                 input_data.data());
+        infer_request.set_input_tensor(tensor);
+
+        // -------- Do inference synchronously --------
+        for (int iter = 0; iter < 10; iter++) { infer_request.infer(); }