From 25de131497658c2d0815b89d1013b64d3952ddb5 Mon Sep 17 00:00:00 2001 From: "Zhai, Xuejun" Date: Thu, 17 Oct 2024 14:40:45 +0800 Subject: [PATCH] Enable all reduce P2P add through iteration & fix mem leak & refacter the mem using to decrease the GPU mem using Signed-off-by: Zhai, Xuejun --- .../intel_gpu/plugin/sub_memory_manager.hpp | 12 + .../src/graph/impls/ocl/sync_tensor.cpp | 729 ++++++++++++++--- .../transformations/fc_all_reduce_test.cpp | 732 +++++++++++++++++- 3 files changed, 1331 insertions(+), 142 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/sub_memory_manager.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/sub_memory_manager.hpp index b02107ca880f99..19567ee766cb96 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/sub_memory_manager.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/sub_memory_manager.hpp @@ -20,9 +20,15 @@ class SubMemoryManager { void* send_buf; std::shared_ptr buf; bool flag; + int32_t last_rec_part; bool last_used; std::shared_ptr stream_ptr; std::vector recv_bufs; + std::vector remote_mems; + std::vector remote_mems_p2p; + std::vector recv_flag; + std::vector recv_flag_concat; + std::vector add_flag; std::vector events; cldnn::memory::ptr output; cldnn::layout layout; @@ -33,9 +39,15 @@ class SubMemoryManager { _num_sub_streams = num_sub_streams; MemoryInfo memory_info; memory_info.flag = false; + memory_info.last_rec_part = -1; memory_info.last_used = false; memory_info.layout = cldnn::layout(); memory_info.recv_bufs.assign(_num_sub_streams, nullptr); + memory_info.remote_mems.assign(_num_sub_streams, nullptr); + memory_info.remote_mems_p2p.assign(_num_sub_streams, nullptr); + memory_info.recv_flag.assign(_num_sub_streams, false); + memory_info.recv_flag_concat.assign(_num_sub_streams, false); + memory_info.add_flag.assign(_num_sub_streams, false); memory_info.events.assign(_num_sub_streams, nullptr); std::vector memorys; memorys.assign(_num_sub_streams, memory_info); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/sync_tensor.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/sync_tensor.cpp index 3b992c6e8f87a4..df07b184a3627c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/sync_tensor.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/sync_tensor.cpp @@ -5,7 +5,7 @@ #define CL_VERSION_3_0 1 #include #include - +#include #include #include #include @@ -125,7 +125,7 @@ static void perf_dump_done(const std::chrono::_V2::system_clock::time_point& sta class gpu_semaphore { public: - gpu_semaphore(int count = 2) : count_(count), total_(count) {} + gpu_semaphore(int count = 1) : count_(count), total_(count) {} void signal() { std::unique_lock lock(mutex_); if (count_ < total_) @@ -406,6 +406,23 @@ class gpu_p2p_helper { } }; +static void dump_cl_buf(cl_command_queue queue, cl_mem clbuf, size_t count, size_t offset) { + cl_int err; + std::vector outBuf(count, 0); + err = clEnqueueReadBuffer(queue, clbuf, CL_TRUE, offset, count * 4, outBuf.data(), 0, NULL, NULL); + CHECK_OCL_ERROR(err, "clEnqueueReadBuffer failed"); + clFinish(queue); + + // std::cout << "The first " << count << "elements in cl_mem = " << clbuf << " are: " << std::endl; + for (int i = 0; i < static_cast(count); i++) { + // printf("%f, ", outBuf[i]); + std::cout << outBuf[i] << ", "; + if (i && i % 16 == 0) + std::cout << std::endl; + } + std::cout << std::endl; +} + class simple_tensor_add { public: simple_tensor_add() {} @@ -507,6 +524,43 @@ class simple_tensor_add { return kernels[type]; } + cl_kernel 
get_or_create_kernel_if_possible_sub(cldnn::stream& stream, kernel_data_type type, size_t offset) { + std::lock_guard lock(mutex); + auto it = kernels.find(type); + if (it != kernels.end()) { + // std::cout << "get_kernel: type = " << static_cast(type) << std::endl; + return it->second; + } +#define ADD_OP_KERNEL_SOURCE_CODE(DATA_TYPE) \ + "kernel void tensor_add_kernel_" #DATA_TYPE "(const global " #DATA_TYPE " *src, global " #DATA_TYPE \ + " *dst, int offset) {" \ + "const int id = get_global_id(0);" \ + "const int idx = id + offset;" \ + "dst[idx] += src[id];" \ + "}" + if (type == kernel_data_type::e_type_fp16) { + const char tensor_add_kernel_fp16[] = ADD_OP_KERNEL_SOURCE_CODE(half); + const char kernel_name[] = "tensor_add_kernel_half"; + kernels[type] = create_kernel(stream, tensor_add_kernel_fp16, kernel_name); + return kernels[type]; + } else if (type == kernel_data_type::e_type_int8) { + const char tensor_add_kernel_int8[] = ADD_OP_KERNEL_SOURCE_CODE(char); + const char kernel_name[] = "tensor_add_kernel_char"; + kernels[type] = create_kernel(stream, tensor_add_kernel_int8, kernel_name); + return kernels[type]; + } else if (type == kernel_data_type::e_type_fp32) { + const char tensor_add_kernel_fp32[] = ADD_OP_KERNEL_SOURCE_CODE(float); + const char kernel_name[] = "tensor_add_kernel_float"; + kernels[type] = create_kernel(stream, tensor_add_kernel_fp32, kernel_name); + return kernels[type]; + } else { + std::cout << "error: unsupported adder kernel data type " << static_cast(type) << std::endl; + // OPENVINO_THROW("error: unsupported adder kernel data type ", static_cast(type)); + } +#undef ADD_OP_KERNEL_SOURCE_CODE + return kernels[type]; + } + event::ptr tensor_add(cldnn::stream& stream, cl_mem src, cl_mem dst, @@ -540,6 +594,45 @@ class simple_tensor_add { return ocl_stream.create_event(cl::Event(ret)); } + event::ptr tensor_add_sub(cldnn::stream& stream, + cl_mem src, + cl_mem dst, + size_t element_count, + kernel_data_type data_type, + size_t offset) { + cl_int err; + auto& ocl_stream = downcast(stream); + if (src == nullptr || dst == nullptr) { + std::cout << "tensor_add: invalid arguments!" 
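// For reference, the fp16 instantiation of the ADD_OP_KERNEL_SOURCE_CODE macro above expands to
// the kernel below (whitespace added for readability). The only difference from the original
// tensor_add kernel is the extra "offset" argument, which lets a rank accumulate a received
// partial chunk into the middle of its local buffer instead of at element 0. The constant name
// here is illustrative, not part of the patch.
static const char expanded_tensor_add_kernel_half[] =
    "kernel void tensor_add_kernel_half(const global half *src, global half *dst, int offset) {"
    "    const int id = get_global_id(0);"
    "    const int idx = id + offset;"
    "    dst[idx] += src[id];"
    "}";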
<< std::endl; + } + OPENVINO_ASSERT(src != nullptr && dst != nullptr, "tensor_add: invalid arguments!"); + + const auto start = perf_dump_start(); + cl_kernel kernel = get_or_create_kernel_if_possible_sub(stream, + data_type, + static_cast(offset)); + perf_dump_done(start, std::string("get_or_create_kernel_if_possible"), false); + + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src); + CHECK_OCL_ERROR(err, "clSetKernelArg src failed"); + + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst); + CHECK_OCL_ERROR(err, "clSetKernelArg dst failed"); + + err = clSetKernelArg(kernel, 2, sizeof(int), &offset); + CHECK_OCL_ERROR(err, "clSetKernelArg dst failed"); + + size_t global_size[] = {element_count}; + auto queue = ocl_stream.get_cl_queue().get(); + cl_event ret; + err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, global_size, nullptr, 0, nullptr, &ret); + CHECK_OCL_ERROR(err, "clEnqueueNDRangeKernel failed"); + // clWaitForEvents(1, &ret); + + perf_dump_done(start, std::string("tensor add host time"), false); + return ocl_stream.create_event(cl::Event(ret)); + } + void finish(cldnn::stream& stream) { auto& ocl_stream = downcast(stream); auto queue = ocl_stream.get_cl_queue().get(); @@ -574,6 +667,21 @@ struct sync_tensor_impl : public typed_primitive_impl { sync_tensor_impl() : parent() {} + ~sync_tensor_impl() { + for (auto& mem : all_gather_remote_dst) { + if (mem) { + release_remote_mems(static_cast(mem)); + } + } + all_gather_remote_dst.clear(); + for (auto& mem : all_reduce_remote_dst) { + if (mem) { + release_remote_mems(static_cast(mem)); + } + } + all_reduce_remote_dst.clear(); + } + explicit sync_tensor_impl(const sync_tensor_node& outer) { set_node_params(outer); } @@ -596,6 +704,7 @@ struct sync_tensor_impl : public typed_primitive_impl { int id, size_t w_size, int32_t w_rank, + int all_reduce_solution = 0, bool validate = true) { // Wait for P2P transferred data are ready std::vector copy_list(w_size, 1); @@ -608,17 +717,13 @@ struct sync_tensor_impl : public typed_primitive_impl { if (idx != static_cast(w_rank) && copy_list[idx]) { auto& remote_ocl_stream = downcast(*sub_mem_mgr->_memorys_table[id][idx].stream_ptr); - auto event = sub_mem_mgr->_memorys_table[id][w_rank].events[idx]; + cldnn::event::ptr event = nullptr; + event = sub_mem_mgr->_memorys_table[id][w_rank].events[idx]; if (event) { event->wait(); remote_ocl_stream.finish(); copy_list[idx] = 0; - // std::lock_guard lock(sub_mem_mgr->_flagMutex); sub_mem_mgr->_memorys_table[id][w_rank].events[idx] = nullptr; - // MUST release remote cl_mem, but it will cause remote map failed. 
- // cl_mem remote_mem = - // static_cast(sub_mem_mgr->_memorys_table[id][idx].remote_mem[w_rank]); - // clReleaseMemObject(remote_mem); } } } @@ -657,14 +762,16 @@ struct sync_tensor_impl : public typed_primitive_impl { } } - void update_internal_buffer(sync_tensor_inst& instance, + bool update_internal_buffer(sync_tensor_inst& instance, std::vector& bufs, cldnn::layout& last_layout, cldnn::layout& layout, size_t w_size, - size_t w_rank) { + size_t w_rank, + int all_reduce_solution) { auto& engine = instance.get_network().get_engine(); size_t required_size = layout.bytes_count(); + bool allocated = false; if (0) { std::lock_guard lock(debug_mutex); std::cout << "Before update_internal_buffer: " << std::endl; @@ -672,8 +779,16 @@ struct sync_tensor_impl : public typed_primitive_impl { } auto need_realloc = [&](size_t idx) { - if (idx == w_rank) - return false; + if (all_reduce_solution == 2) { + if (idx == 1) + return true; + else + return false; + + } else { + if (idx == w_rank) + return false; + } if (bufs[idx] == nullptr || last_layout.bytes_count() == 0) return true; @@ -692,16 +807,42 @@ struct sync_tensor_impl : public typed_primitive_impl { if (!need_realloc(i)) { continue; } - size_t origin_size = bufs[i] != nullptr ? bufs[i]->size() : 0; + // size_t origin_size = bufs[i] != nullptr ? bufs[i]->size() : 0; bufs[i] = nullptr; - bufs[i] = engine.allocate_memory(layout, cldnn::allocation_type::cl_mem, false); - if (debug_enable) { + if (all_reduce_solution == 2) { + auto width = layout.get_shape()[-1]; + auto sub_width = width / w_size; + if (sub_width * w_size != width) + std::cout << "[Warning] the shape of FC output has ODD number!!!" << std::endl; + auto sub_layout = layout; + auto sub_shape = layout.get_shape(); + sub_shape[-1] = sub_width; + sub_layout.set_partial_shape(sub_shape); + bufs[i] = engine.allocate_memory(sub_layout, cldnn::allocation_type::cl_mem, false); + } else { + bufs[i] = engine.allocate_memory(layout, cldnn::allocation_type::cl_mem, false); + } + allocated = true; + } + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "tensor_sync allocate: rank[" << w_rank << "]: layout[" << 1 + << "]=" << layout.to_short_string() << ", last_layout[" << 1 + << "]=" << last_layout.to_short_string() << std::endl; + } + return allocated; + } + + void release_remote_mems(cl_mem remote_mems) { + if (remote_mems) { + size_t data_size = 0; + auto _cl_mem = static_cast(remote_mems); + if (0) { + auto error = clGetMemObjectInfo(_cl_mem, CL_MEM_SIZE, sizeof(size_t), &data_size, NULL); std::lock_guard lock(debug_mutex); - std::cout << "tensor_sync allocate: rank[" << w_rank << "]: layout[" << i - << "]=" << layout.to_short_string() << ", required_size = " << required_size - << ", current_size = " << origin_size << ", to_allocate_size = " << bufs[i]->size() - << std::endl; + std::cout << "clReleaseMemObject...size = " << data_size << ", error: " << error << std::endl; } + clReleaseMemObject(_cl_mem); } } @@ -713,18 +854,28 @@ struct sync_tensor_impl : public typed_primitive_impl { auto w_rank = instance.get_network().get_program()->get_config().subStreamExecConfig.get_rank()[0]; auto w_size = instance.get_network().get_program()->get_config().get_context_for_tp().size(); auto is_all_reduce = instance.get_impl_params()->need_add == true; - auto start = perf_dump_start(); - if (!pass_through_events) { - for (auto e : events) { - e->wait(); - } + if (!is_all_reduce && all_gather_remote_dst.size() == 0) { + all_gather_remote_dst.assign(w_size, nullptr); } + if (is_all_reduce && 
all_reduce_remote_dst.size() == 0) { + all_reduce_remote_dst.assign(w_size, nullptr); + } + + int all_reduce_solution = 2; + if (is_all_reduce) { + const char* all_reduce_add_solution = getenv("OV_TP_ALLREDUCE_ADD_solution"); + if (all_reduce_add_solution) + all_reduce_solution = std::atoi(all_reduce_add_solution); + } + + auto start = perf_dump_start(); perf_dump_done(start, std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor wait events"), true); auto sub_mem_mgr = instance.get_network().get_sub_mem_mgr(); - auto id = sub_mem_mgr->get_memory_id(w_rank); + auto id = 0; + // auto id = sub_mem_mgr->get_memory_id(w_rank); sub_mem_mgr->set_memory_used(id, w_rank); auto start_1 = perf_dump_start(); while (true) { @@ -733,8 +884,12 @@ struct sync_tensor_impl : public typed_primitive_impl { sub_mem_mgr->_use_count[id] = 0; for (size_t i = 0; i < w_size; i++) { sub_mem_mgr->_memorys_table[id][i].flag = false; - for (size_t j = 0; j < w_size; j++) + for (size_t j = 0; j < w_size; j++) { sub_mem_mgr->_memorys_table[id][i].events[j] = nullptr; + sub_mem_mgr->_memorys_table[id][i].recv_flag[j] = false; + sub_mem_mgr->_memorys_table[id][i].recv_flag_concat[j] = false; + sub_mem_mgr->_memorys_table[id][i].add_flag[j] = false; + } } } if (sub_mem_mgr->_use_count[id] == 0) { @@ -755,18 +910,23 @@ struct sync_tensor_impl : public typed_primitive_impl { auto& ocl_stream = downcast(stream); auto local_context = ocl_stream.get_engine().get_cl_context().get(); auto p2p_src_layout = instance.get_output_layout(0); + bool need_update_remote_mems = false; if (is_all_reduce) { OPENVINO_ASSERT(1 == instance.get_output_memorys().size(), "All reduce only has one output!"); - sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank] = instance.get_output_memorys()[0]; + if (all_reduce_solution == 2) + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[0] = instance.get_output_memorys()[0]; + else + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank] = instance.get_output_memorys()[0]; sub_mem_mgr->_memorys_table[id][w_rank].output = instance.get_output_memorys()[0]; // Allocate or reuse buffer for P2P target, same shape with output[0] p2p_src_layout = instance.get_output_layout(0); - update_internal_buffer(instance, - sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs, - sub_mem_mgr->_memorys_table[id][w_rank].layout, - p2p_src_layout, - w_size, - w_rank); + need_update_remote_mems = update_internal_buffer(instance, + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs, + sub_mem_mgr->_memorys_table[id][w_rank].layout, + p2p_src_layout, + w_size, + w_rank, + all_reduce_solution); } else { OPENVINO_ASSERT(2 == instance.get_output_memorys().size(), "All gather need additional buffer for concat result!"); @@ -774,115 +934,414 @@ struct sync_tensor_impl : public typed_primitive_impl { sub_mem_mgr->_memorys_table[id][w_rank].output = instance.get_output_memorys()[0]; // All gather doesn't need intermediate buffer at all. 
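// A minimal sketch of how "solution 2" sizes the per-rank scratch buffer in
// update_internal_buffer() above: instead of a full copy of the FC output, each rank keeps only
// one chunk of the innermost dimension (width / w_size), which is where the GPU-memory saving of
// this change comes from. The helper names below are illustrative, not from the patch;
// OV_TP_ALLREDUCE_ADD_solution is the environment variable read earlier in execute_impl() to
// override the default solution (2).
#include <cstdlib>
#include <vector>

// Shape of the scratch chunk one rank allocates for the iterative (ring) all-reduce.
static std::vector<size_t> sub_chunk_shape(std::vector<size_t> shape, size_t w_size) {
    const size_t width = shape.back();   // innermost (tensor-parallel split) dimension
    shape.back() = width / w_size;       // one chunk per rank; the patch warns if not divisible
    return shape;
}

static int selected_all_reduce_solution() {
    int solution = 2;                    // default used by the patch
    if (const char* env = std::getenv("OV_TP_ALLREDUCE_ADD_solution"))
        solution = std::atoi(env);
    return solution;
}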
p2p_src_layout = instance.get_output_layout(1); + auto tmp = + std::dynamic_pointer_cast(instance.get_output_memorys()[0])->get_buffer().get(); + if (tmp != all_gather_current_dst) { + need_update_remote_mems = true; + all_gather_current_dst = tmp; + } + } + if (all_reduce_solution == 2) { + sub_mem_mgr->_memorys_table[id][w_rank].recv_flag[0] = true; + sub_mem_mgr->_memorys_table[id][w_rank].recv_flag_concat[0] = true; + } else { + sub_mem_mgr->_memorys_table[id][w_rank].flag = true; } - sub_mem_mgr->_memorys_table[id][w_rank].flag = true; - std::vector wait_list(w_size, 1); - auto start_2 = perf_dump_start(); - wait_list[w_rank] = 0; // no need to wait for itself - size_t data_size = 0; - event::ptr sync_event = nullptr; - auto src_p2p_buf = - std::dynamic_pointer_cast(sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank]); - auto src_cl_buf = src_p2p_buf->get_buffer().get(); - while (true) { - int wait_size = 0; - for (int idx = 0; idx < static_cast(w_size); idx++) { - if (idx != w_rank && wait_list[idx] > 0 && sub_mem_mgr->_memorys_table[id][idx].flag) { - cldnn::memory::ptr dst_mem = nullptr; - if (is_all_reduce) { - dst_mem = sub_mem_mgr->_memorys_table[id][idx].recv_bufs[w_rank]; - } else { - dst_mem = sub_mem_mgr->_memorys_table[id][idx].output; + // The mapped remote cl_mem will hold the original cl_mem, it should be released if the original cl_mem has been + // released, else it will cause gpu memory leak. + if (need_update_remote_mems) { + if (debug_enable) { + std::cout << "release_remote_mems: old_layout = " + << sub_mem_mgr->_memorys_table[id][w_rank].layout.to_short_string() + << ", new_layout = " << p2p_src_layout.to_short_string() << std::endl; + } + } + + std::vector sync_events; + if (is_all_reduce && all_reduce_solution == 2) { + while (true) { + size_t wait_all_ouput_ready = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (sub_mem_mgr->_memorys_table[id][idx].recv_flag[0] == true) + wait_all_ouput_ready++; + } + if (wait_all_ouput_ready == w_size) { + break; + } + } + auto split_parts = [](int len, int n) { + int average = len / n; + std::vector parts(n, average); + parts.back() = len - average * (n - 1); + return parts; + }; + + auto output_layout = instance.get_output_layout(0); + ov::element::Type output_element_type = output_layout.data_type; + auto output_element_size = output_element_type.size(); + auto output_shape = output_layout.get_shape(); + auto sub_out_dim_vec = split_parts(output_shape[-1], w_size); + if (0) { + for (size_t iter = 0; iter < w_size; iter++) { + std::cout << "sub_out_dim_vec[" << iter << "] " << sub_out_dim_vec[iter] << std::endl; + } + } + auto output_height = ov::shape_size(output_shape) / output_shape[-1]; + // Prepare CL memory mapping for P2P copying next + { + auto dst_idx = (w_rank + 1) % w_size; + cl_mem dst_cl_buf = nullptr; + cldnn::memory::ptr dst_mem = sub_mem_mgr->_memorys_table[id][dst_idx].recv_bufs[1]; + size_t data_size = dst_mem->size(); + auto dst_cl_buf_remote = std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); + dst_cl_buf = static_cast(sub_mem_mgr->_memorys_table[id][w_rank].remote_mems_p2p[0]); + if (need_update_remote_mems || dst_cl_buf == nullptr) { + if (dst_cl_buf) { + release_remote_mems(dst_cl_buf); } - auto dst_cl_buf_remote = - std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); - - data_size = dst_mem->size(); - auto dst_cl_buf = gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); - auto p2p_data_size = p2p_src_layout.bytes_count(); - { - 
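// The recv_flag / recv_flag_concat / add_flag vectors added to SubMemoryManager act as simple
// per-iteration barriers: each rank sets its own slot for the current step, then spins until
// every rank has set it, exactly like the "wait_all_ouput_ready" loop above. A minimal
// single-process sketch of that pattern, using std::atomic for the illustration (the patch
// itself polls plain bools stored in the shared memory table):
#include <atomic>
#include <cstddef>
#include <vector>

struct step_barrier {                                    // illustrative name, not from the patch
    explicit step_barrier(std::size_t world) : flags(world) {
        for (auto& f : flags)
            f.store(false, std::memory_order_relaxed);
    }
    void arrive(std::size_t rank) { flags[rank].store(true, std::memory_order_release); }
    void wait_all() const {
        std::size_t ready = 0;
        while (ready != flags.size()) {                  // busy-wait until every rank has arrived
            ready = 0;
            for (const auto& f : flags)
                ready += f.load(std::memory_order_acquire) ? 1 : 0;
        }
    }
    std::vector<std::atomic<bool>> flags;
};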
gpu_lock.acquire(); - if (is_all_reduce) { - gpu_p2p_instance.remote_copy(stream, src_cl_buf, dst_cl_buf, p2p_data_size); - } else { - gpu_p2p_instance.remote_copy_rect(stream, - src_cl_buf, - instance.get_output_layout(1), - dst_cl_buf, - instance.get_output_layout(0), - w_rank, - false); - } - gpu_lock.signal(); + dst_cl_buf = gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); + sub_mem_mgr->_memorys_table[id][w_rank].remote_mems_p2p[0] = dst_cl_buf; + } + } + // Start loop for sub buff copy & add + for (int32_t i = 0; i < static_cast(w_size) - 1; i++) { + auto src_mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[0]); + auto src_buf = src_mem->get_buffer().get(); + + int32_t sub_part = (w_rank - i) >= 0 ? (w_rank - i) : ((w_rank - i) + w_size); + int32_t rec_sub_part = (sub_part - 1) >= 0 ? (sub_part - 1) : ((sub_part - 1) + w_size); + auto dst_idx = (w_rank + 1) % w_size; + + auto dst_cl_buf = static_cast(sub_mem_mgr->_memorys_table[id][w_rank].remote_mems_p2p[0]); + + auto& ocl_stream = downcast(stream); + auto queue = ocl_stream.get_cl_queue().get(); + int32_t off_set = 0; + for (int32_t j = 0; j < sub_part; j++) { + off_set = off_set + sub_out_dim_vec[j]; + } + size_t src_rec[3] = {off_set * output_element_size * output_height, 0, 0}; + size_t dst_rec[3] = {0, 0, 0}; + cl_event event; + size_t rect[3] = {sub_out_dim_vec[sub_part] * output_element_size, output_height, 1}; + auto ret = clEnqueueCopyBufferRect(queue, + src_buf, + dst_cl_buf, + src_rec, + dst_rec, + rect, + 0, + 0, + 0, + 0, + 0, + nullptr, + &event); + if (ret != CL_SUCCESS) { + std::cout << "0.clEnqueueCopyBufferRect failed: " << oclErrorCode[ret] << ", idx = " << i + << std::endl; + OPENVINO_THROW("0.clEnqueueCopyBufferRect failed: ", oclErrorCode[ret], ", idx = ", i); + } + ret = clWaitForEvents(1, &event); + CHECK_OCL_ERROR(ret, "clWaitForEvents failed"); + clReleaseEvent(event); + + sub_mem_mgr->_memorys_table[id][dst_idx].recv_flag[i + 1] = true; + + while (true) { + size_t wait_all_ouput_ready = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (sub_mem_mgr->_memorys_table[id][idx].recv_flag[i + 1] == true) + wait_all_ouput_ready++; } - // P2P has been done. 
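// The "sub buff copy & add" loop that starts above implements a ring reduce-scatter: at
// iteration i, rank r writes chunk (r - i) mod w_size of its running sum into its right-hand
// neighbour's scratch buffer, then accumulates the chunk it just received, (r - i - 1) mod
// w_size, into its own partial output. The standalone program below only reproduces that index
// arithmetic so the schedule can be checked on the host; w_size = 4 and the 10-column output are
// made-up values, not from the patch.
#include <cstdio>
#include <vector>

static std::vector<int> split_parts(int len, int n) {    // same splitting rule as the patch
    int average = len / n;
    std::vector<int> parts(n, average);
    parts.back() = len - average * (n - 1);
    return parts;
}

int main() {
    const int w_size = 4;
    const auto chunks = split_parts(10, w_size);         // chunk widths {2, 2, 2, 4}
    for (int i = 0; i < w_size - 1; i++) {
        for (int rank = 0; rank < w_size; rank++) {
            const int send_part = ((rank - i) % w_size + w_size) % w_size;
            const int recv_part = (send_part - 1 + w_size) % w_size;
            const int dst_rank = (rank + 1) % w_size;
            std::printf("iter %d: rank %d sends chunk %d (width %d) to rank %d, adds chunk %d\n",
                        i, rank, send_part, chunks[send_part], dst_rank, recv_part);
        }
    }
    return 0;
}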
- { - std::lock_guard lock(sub_mem_mgr->_flagMutex); - sub_mem_mgr->_memorys_table[id][idx].events[w_rank] = stream.create_user_event(true); + if (wait_all_ouput_ready == w_size) + break; + } + + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "[After Copy Rank] " << w_rank << ", [iter] " << i << std::endl; + for (int idx = 0; idx < static_cast(w_size); idx++) { + auto mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]); + auto buf = mem->get_buffer().get(); + dump_cl_buf(ocl_stream.get_cl_queue().get(), buf, mem->count(), 0); + } + } + + auto dst_mem_add = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[0]); + auto dst_cl_buf_add = dst_mem_add->get_buffer().get(); + auto& adder_instance = get_adder_instance(w_rank); + + auto src_mem_add = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[1]; + auto src_cl_buf_add = std::dynamic_pointer_cast(src_mem_add)->get_buffer().get(); + + sub_mem_mgr->_memorys_table[id][w_rank].last_rec_part = rec_sub_part; + int32_t off_set_add = 0; + for (int32_t j = 0; j < rec_sub_part; j++) + off_set_add = off_set_add + sub_out_dim_vec[j]; + + cldnn::event::ptr sync_add_event; + auto start_add = perf_dump_start(); + sync_add_event = adder_instance.tensor_add_sub( + stream, + src_cl_buf_add, + dst_cl_buf_add, + output_height * sub_out_dim_vec[rec_sub_part], + adder_instance.element_type_to_kernel_data_type(dst_mem_add->get_layout().data_type), + output_height * off_set_add); + sync_add_event->wait(); + auto end_add = perf_dump_start(); + std::chrono::duration duration = end_add - start_add; + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "Add Solution[" << all_reduce_solution << "] " << "Rank[" << w_rank + << "] sync tensor p2p add total cost: " << duration.count() << " ms" << std::endl; + } + sub_mem_mgr->_memorys_table[id][w_rank].add_flag[i + 1] = true; + + while (true) { + size_t wait_all_ouput_ready = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (sub_mem_mgr->_memorys_table[id][idx].add_flag[i + 1] == true) + wait_all_ouput_ready++; + } + if (wait_all_ouput_ready == w_size) + break; + } + + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "[After Add Rank] " << w_rank << std::endl; + for (int idx = 0; idx < 1; idx++) { + auto mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]); + auto buf = mem->get_buffer().get(); + dump_cl_buf(ocl_stream.get_cl_queue().get(), buf, mem->count(), 0); } - // gpu_p2p_instance.destory_remote_mem(dst_cl_buf); - wait_list[idx] = 0; } - wait_size += wait_list[idx]; - } - if (wait_size == 0) { - break; } - auto end_2 = perf_dump_start(); - std::chrono::duration duration = end_2 - start_2; - if (duration.count() > 10000) { - start_2 = perf_dump_start(); - std::cout << "rank[" << w_rank << "]Error: sync_tensor p2p write timeout..." 
<< std::endl; + + // Prepare CL memory mapping for P2P copying instand of concat + { + auto dst_idx = (w_rank + 1) % w_size; + cl_mem dst_cl_buf = nullptr; + cldnn::memory::ptr dst_mem = sub_mem_mgr->_memorys_table[id][dst_idx].output; + size_t data_size = dst_mem->size(); + auto dst_cl_buf_remote = std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); + dst_cl_buf = static_cast(all_reduce_remote_dst[dst_idx]); + if (need_update_remote_mems || dst_cl_buf == nullptr) { + dst_cl_buf = gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); + all_reduce_remote_dst[dst_idx] = dst_cl_buf; + } } - } + // Start loop for sub buff copy (concat) + for (int32_t i = 0; i < static_cast(w_size) - 1; i++) { + auto src_mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[0]); + auto src_buf = src_mem->get_buffer().get(); + + auto dst_idx = (w_rank + 1) % w_size; + cl_mem dst_cl_buf = nullptr; + dst_cl_buf = static_cast(all_reduce_remote_dst[dst_idx]); + + auto sub_part = sub_mem_mgr->_memorys_table[id][w_rank].last_rec_part; + auto& ocl_stream = downcast(stream); + auto queue = ocl_stream.get_cl_queue().get(); + int32_t off_set = 0; + for (int32_t j = 0; j < sub_part; j++) { + off_set = off_set + sub_out_dim_vec[j]; + } + size_t src_rec[3] = {off_set * output_element_size * output_height, 0, 0}; + size_t dst_rec[3] = {off_set * output_element_size * output_height, 0, 0}; + cl_event event; + size_t rect[3] = {sub_out_dim_vec[sub_part] * output_element_size, output_height, 1}; + auto ret = clEnqueueCopyBufferRect(queue, + src_buf, + dst_cl_buf, + src_rec, + dst_rec, + rect, + 0, + 0, + 0, + 0, + 0, + nullptr, + &event); + if (ret != CL_SUCCESS) { + std::cout << "0.clEnqueueCopyBufferRect failed: " << oclErrorCode[ret] << ", idx = " << i + << std::endl; + OPENVINO_THROW("0.clEnqueueCopyBufferRect failed: ", oclErrorCode[ret], ", idx = ", i); + } + ret = clWaitForEvents(1, &event); + CHECK_OCL_ERROR(ret, "clWaitForEvents failed"); + clReleaseEvent(event); + sub_mem_mgr->_memorys_table[id][dst_idx].last_rec_part = sub_part; + + sub_mem_mgr->_memorys_table[id][dst_idx].recv_flag_concat[i + 1] = true; + + while (true) { + size_t wait_all_ouput_ready = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (sub_mem_mgr->_memorys_table[id][idx].recv_flag_concat[i + 1] == true) + wait_all_ouput_ready++; + } + if (wait_all_ouput_ready == w_size) + break; + } - auto str_need_add = instance.get_impl_params()->need_add ? 
std::string("[need_add]") : std::string(""); - perf_dump_done(start_2, - std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor p2p write ") + - std::to_string(data_size) + " bytes" + str_need_add, - true); + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << std::endl; + std::cout << "[Concat Rank] " << w_rank << " [iter] " << i << std::endl; + for (int idx = 0; idx < 1; idx++) { + auto mem = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]); + auto buf = mem->get_buffer().get(); + dump_cl_buf(ocl_stream.get_cl_queue().get(), buf, mem->count(), 0); + } + } + } + } else { + std::vector wait_list(w_size, 1); + auto start_2 = perf_dump_start(); + wait_list[w_rank] = 0; // no need to wait for itself + size_t data_size = 0; + event::ptr sync_event = nullptr; + auto src_p2p_buf = std::dynamic_pointer_cast( + sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank]); + auto src_cl_buf = src_p2p_buf->get_buffer().get(); + + while (true) { + int wait_size = 0; + for (int idx = 0; idx < static_cast(w_size); idx++) { + if (idx != w_rank && wait_list[idx] > 0 && sub_mem_mgr->_memorys_table[id][idx].flag) { + cl_mem dst_cl_buf = nullptr; + if (is_all_reduce) { + cldnn::memory::ptr dst_mem = sub_mem_mgr->_memorys_table[id][idx].recv_bufs[w_rank]; + data_size = dst_mem->size(); + auto dst_cl_buf_remote = + std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); + dst_cl_buf = static_cast(sub_mem_mgr->_memorys_table[id][w_rank].remote_mems[idx]); + if (need_update_remote_mems || dst_cl_buf == nullptr) { + if (dst_cl_buf) { + release_remote_mems(dst_cl_buf); + } + dst_cl_buf = + gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); + sub_mem_mgr->_memorys_table[id][w_rank].remote_mems[idx] = dst_cl_buf; + } + } else { + cldnn::memory::ptr dst_mem = sub_mem_mgr->_memorys_table[id][idx].output; + data_size = dst_mem->size(); + auto dst_cl_buf_remote = + std::dynamic_pointer_cast(dst_mem)->get_buffer().get(); + dst_cl_buf = static_cast(all_gather_remote_dst[idx]); + if (need_update_remote_mems || dst_cl_buf == nullptr) { + if (dst_cl_buf) { + release_remote_mems(dst_cl_buf); + } + dst_cl_buf = + gpu_p2p_instance.map_remote_mem(local_context, dst_cl_buf_remote, data_size); + all_gather_remote_dst[idx] = dst_cl_buf; + } + } + auto p2p_data_size = p2p_src_layout.bytes_count(); + { + gpu_lock.acquire(); + if (is_all_reduce) { + gpu_p2p_instance.remote_copy(stream, src_cl_buf, dst_cl_buf, p2p_data_size); + } else { + gpu_p2p_instance.remote_copy_rect(stream, + src_cl_buf, + instance.get_output_layout(1), + dst_cl_buf, + instance.get_output_layout(0), + w_rank, + false); + } + gpu_lock.signal(); + } + // P2P has been done. + { + std::lock_guard lock(sub_mem_mgr->_flagMutex); + sub_mem_mgr->_memorys_table[id][idx].events[w_rank] = stream.create_user_event(true); + } + // gpu_p2p_instance.destory_remote_mem(dst_cl_buf); + wait_list[idx] = 0; + } + wait_size += wait_list[idx]; + } + if (wait_size == 0) { + break; + } + auto end_2 = perf_dump_start(); + std::chrono::duration duration = end_2 - start_2; + if (duration.count() > 10000) { + start_2 = perf_dump_start(); + std::cout << "rank[" << w_rank << "]Error: sync_tensor p2p write timeout..." << std::endl; + } + } - // P2P adopts sync write to avoid the problem of event cannot work across contexts - wait_p2p_done(stream, gpu_p2p_instance, sub_mem_mgr, id, w_size, w_rank, false); + auto str_need_add = instance.get_impl_params()->need_add ? 
std::string("[need_add]") : std::string(""); + perf_dump_done(start_2, + std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor p2p write ") + + std::to_string(data_size) + " bytes" + str_need_add, + true); - std::vector sync_events; - if (is_all_reduce) { - // All_reduce path - auto start_3 = perf_dump_start(); - auto dst_mem = std::dynamic_pointer_cast(instance.output_memory_ptr(0)); - auto dst_cl_buf = dst_mem->get_buffer().get(); - auto& adder_instance = get_adder_instance(w_rank); - // auto data_size = dst_mem->size(); - for (size_t idx = 0; idx < w_size; idx++) { - if (idx != static_cast(w_rank)) { - auto src_mem = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]; - auto src_cl_buf = std::dynamic_pointer_cast(src_mem)->get_buffer().get(); - sync_event = adder_instance.tensor_add( - stream, - src_cl_buf, - dst_cl_buf, - dst_mem->count(), - adder_instance.element_type_to_kernel_data_type(dst_mem->get_layout().data_type)); - sync_events.emplace_back(sync_event); + // P2P adopts sync write to avoid the problem of event cannot work across contexts + wait_p2p_done(stream, gpu_p2p_instance, sub_mem_mgr, id, w_size, w_rank, all_reduce_solution, false); + + if (is_all_reduce) { + // All_reduce path + auto start_3 = perf_dump_start(); + auto dst_mem = std::dynamic_pointer_cast(instance.output_memory_ptr(0)); + auto dst_cl_buf = dst_mem->get_buffer().get(); + auto& adder_instance = get_adder_instance(w_rank); + // auto data_size = dst_mem->size(); + for (size_t idx = 0; idx < w_size; idx++) { + if (idx != static_cast(w_rank)) { + auto src_mem = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[idx]; + auto src_cl_buf = std::dynamic_pointer_cast(src_mem)->get_buffer().get(); + sync_event = adder_instance.tensor_add( + stream, + src_cl_buf, + dst_cl_buf, + dst_mem->count(), + adder_instance.element_type_to_kernel_data_type(dst_mem->get_layout().data_type)); + // sync_event->wait(); + sync_events.emplace_back(sync_event); + } + } + const auto end_add = std::chrono::high_resolution_clock::now(); + const std::chrono::duration elapsed_1 = end_add - start_3; + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "Add Solution[" << all_reduce_solution << "] " << "Rank[" << w_rank + << "] sync tensor p2p add total cost: " << elapsed_1.count() << " ms" << std::endl; } + perf_dump_done( + start_3, + std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor allreduce add"), + true); + } else { + auto src_mem = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank]; + auto src_cl_buf = std::dynamic_pointer_cast(src_mem)->get_buffer().get(); + auto dst_mem = std::dynamic_pointer_cast(instance.output_memory_ptr(0)); + auto dst_cl_buf = dst_mem->get_buffer().get(); + auto sync_event = gpu_p2p_instance.remote_copy_rect(stream, + src_cl_buf, + instance.get_output_layout(1), + dst_cl_buf, + instance.get_output_layout(0), + w_rank, + false); + sync_events.emplace_back(sync_event); } - perf_dump_done(start_3, - std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor allreduce add"), - true); - } else { - auto src_mem = sub_mem_mgr->_memorys_table[id][w_rank].recv_bufs[w_rank]; - auto src_cl_buf = std::dynamic_pointer_cast(src_mem)->get_buffer().get(); - auto dst_mem = std::dynamic_pointer_cast(instance.output_memory_ptr(0)); - auto dst_cl_buf = dst_mem->get_buffer().get(); - auto sync_event = gpu_p2p_instance.remote_copy_rect(stream, - src_cl_buf, - instance.get_output_layout(1), - dst_cl_buf, - instance.get_output_layout(0), - w_rank, - 
false); - sync_events.emplace_back(sync_event); } if (pass_through_events) { @@ -894,6 +1353,14 @@ struct sync_tensor_impl : public typed_primitive_impl { } perf_dump_done(start, std::string("rank[") + std::to_string(w_rank) + std::string("] sync_tensor total"), true); + const auto end_xj = std::chrono::high_resolution_clock::now(); + const std::chrono::duration elapsed_1 = end_xj - start; + if (0) { + std::lock_guard lock(debug_mutex); + std::cout << "Solution[" << all_reduce_solution << "] " << "Rank[" << w_rank + << "] sync tensor p2p add total cost: " << elapsed_1.count() << " ms" << std::endl; + } + // This block MUST be put exactly at the end of this method. { std::lock_guard lock(sub_mem_mgr->_flagMutex); @@ -909,6 +1376,10 @@ struct sync_tensor_impl : public typed_primitive_impl { static std::unique_ptr create(const sync_tensor_node& arg, const kernel_impl_params& impl_param) { return make_unique(); } + + std::vector all_gather_remote_dst; + std::vector all_reduce_remote_dst; + cl_mem all_gather_current_dst; }; namespace detail { diff --git a/src/plugins/intel_gpu/tests/unit/transformations/fc_all_reduce_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/fc_all_reduce_test.cpp index e1998646aaf2e7..631aae306c3c92 100644 --- a/src/plugins/intel_gpu/tests/unit/transformations/fc_all_reduce_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/transformations/fc_all_reduce_test.cpp @@ -40,7 +40,7 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput11) { unsigned long test_size = 2; auto input1 = std::make_shared(ov::element::f32, ov::Shape{test_size, test_size}); std::vector weights(test_size * test_size, 2); - auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{test_size, test_size}, {1, 2, 3, 4}); + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{test_size, test_size}, {5, 6, 7, 8}); std::cout << "\n" << "weights: "; for (size_t i = 0; i < input2->get_vector().size(); i++) { std::cout << input2->get_vector()[i] << ", "; @@ -55,16 +55,21 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput11) { // -------- Loading a model to the device -------- ov::Core core; - ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); // -------- Create an infer request -------- ov::InferRequest infer_request = compiled_model.create_infer_request(); // -------- Prepare input -------- - auto input_generate = ov::test::utils::InputGenerateData(0, 5); - auto tensor = ov::test::utils::create_and_fill_tensor(infer_request.get_input_tensor().get_element_type(), - infer_request.get_input_tensor().get_shape(), - input_generate); + std::vector input_data = {1, 2, 3, 4}; + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + // auto input_generate = ov::test::utils::InputGenerateData(0, 5); + // auto tensor = ov::test::utils::create_and_fill_tensor(infer_request.get_input_tensor().get_element_type(), + // infer_request.get_input_tensor().get_shape(), + // input_generate); std::cout << "\n" << "input_tensor: "; for (size_t i = 0; i < tensor.get_size(); i++) { std::cout << tensor.data()[i] << ", "; @@ -75,9 +80,71 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput11) { // -------- Do inference synchronously -------- infer_request.infer(); - for (int iter = 0; 
iter < 2; iter++) { - infer_request.infer(); + // for (int iter = 0; iter < 2; iter++) { + // infer_request.infer(); + // } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + for (size_t i = 0; i < output_tensor.get_size(); i++) { + std::cout << output_tensor.data()[i] << ", "; + } + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput8) { + { + // -------- Construct model + unsigned long test_size = 4; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{test_size, test_size}); + std::vector weights(test_size * test_size, 2); + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{test_size, test_size}, {17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}); + std::cout << "\n" << "weights: "; + for (size_t i = 0; i < input2->get_vector().size(); i++) { + std::cout << input2->get_vector()[i] << ", "; + } + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::f32); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + //auto model = std::make_shared(ov::NodeVector{fc}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + std::vector input_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + // auto input_generate = ov::test::utils::InputGenerateData(0, 5); + // auto tensor = ov::test::utils::create_and_fill_tensor(infer_request.get_input_tensor().get_element_type(), + // infer_request.get_input_tensor().get_shape(), + // input_generate); + std::cout << "\n" << "input_tensor: "; + for (size_t i = 0; i < tensor.get_size(); i++) { + std::cout << tensor.data()[i] << ", "; } + std::cout << std::endl; + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + infer_request.infer(); + // for (int iter = 0; iter < 2; iter++) { + // infer_request.infer(); + // } // -------- Process output auto output_tensor = infer_request.get_output_tensor(); @@ -141,7 +208,9 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput16) { // -------- Loading a model to the device -------- ov::Core core; - ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); +// ov::CompiledModel compiled_model = core.compile_model(model, "GPU"); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); // -------- Create an infer request -------- ov::InferRequest infer_request = compiled_model.create_infer_request(); @@ -160,9 +229,9 
@@ TEST(TransformationTestsF1, FullyConnectedSplitInput16) { // -------- Do inference synchronously -------- infer_request.infer(); - for (int iter = 0; iter < 2; iter++) { - infer_request.infer(); - } + // for (int iter = 0; iter < 2; iter++) { + // infer_request.infer(); + // } // -------- Process output auto output_tensor = infer_request.get_output_tensor(); @@ -218,7 +287,644 @@ TEST(TransformationTestsF1, FullyConnectedSplitInput1024) { // -------- Do inference synchronously -------- infer_request.infer(); - for (int iter = 0; iter < 100; iter++) { + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_64) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 64; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_128) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 128; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, 
ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_256) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 256; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_512) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 512; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = 
ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_1024) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 1024; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << 
output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_2048) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 2048; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_4096) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 4096; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + 
infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 2000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_8192) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 8192; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3")); + + // -------- Create an infer request -------- + ov::InferRequest infer_request = compiled_model.create_infer_request(); + + // -------- Prepare input -------- + auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(), + infer_request.get_input_tensor().get_shape(), + input_data.data()); + + infer_request.set_input_tensor(tensor); + + // -------- Do inference synchronously -------- + // infer_request.infer(); + for (int iter = 0; iter < 1000; iter++) { + infer_request.infer(); + } + + // -------- Process output + auto output_tensor = infer_request.get_output_tensor(); + std::cout << "\n" + << "output_tensor: " << output_tensor.get_shape() << std::endl; + std::cout << std::endl; + } +} + +TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_16384) { + { + // -------- Construct model + unsigned long m, k, n; + m = k = n = 16384; + auto input1 = std::make_shared(ov::element::f32, ov::Shape{m, n}); + std::vector input_data(m * k, 1); + std::vector weights(k * n, 2); + std::vector result(m * n, 2); + + auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data()); + + std::cout << "input_shape: " << m << " * " << k << std::endl; + std::cout << "weight_shape: " << k << " * " << n << std::endl; + std::cout << "output_shape: " << m << " * " << n << std::endl; + + std::cout << std::endl; + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, input2, no_bias, ov::element::i8); + const auto relu = std::make_shared(fc); + auto model = std::make_shared(ov::NodeVector{relu}, ov::ParameterVector{input1}); + + // ov::serialize(model, "./model_fc.xml", "./model_fc.bin"); + + // -------- Loading a model to the device -------- + ov::Core core; + // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", 
+        ov::CompiledModel compiled_model =
+            core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3"));
+
+        // -------- Create an infer request --------
+        ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+        // -------- Prepare input --------
+        auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(),
+                                 infer_request.get_input_tensor().get_shape(),
+                                 input_data.data());
+        infer_request.set_input_tensor(tensor);
+
+        // -------- Do inference synchronously --------
+        for (int iter = 0; iter < 200; iter++) {
+            infer_request.infer();
+        }
+
+        // -------- Process output --------
+        auto output_tensor = infer_request.get_output_tensor();
+        std::cout << "\n"
+                  << "output_tensor: " << output_tensor.get_shape() << std::endl;
+        std::cout << std::endl;
+    }
+}
+
+TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_32768) {
+    {
+        // -------- Construct model --------
+        unsigned long m, k, n;
+        m = k = n = 32768;
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{m, n});
+        std::vector<float> input_data(m * k, 1);
+        std::vector<float> weights(k * n, 2);
+        std::vector<float> result(m * n, 2);
+
+        auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data());
+
+        std::cout << "input_shape: " << m << " * " << k << std::endl;
+        std::cout << "weight_shape: " << k << " * " << n << std::endl;
+        std::cout << "output_shape: " << m << " * " << n << std::endl;
+        std::cout << std::endl;
+
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, input2, no_bias, ov::element::i8);
+        const auto relu = std::make_shared<ov::op::v0::Relu>(fc);
+        auto model = std::make_shared<ov::Model>(ov::NodeVector{relu}, ov::ParameterVector{input1});
+
+        // ov::serialize(model, "./model_fc.xml", "./model_fc.bin");
+
+        // -------- Loading a model to the device --------
+        ov::Core core;
+        ov::CompiledModel compiled_model =
+            core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3"));
+
+        // -------- Create an infer request --------
+        ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+        // -------- Prepare input --------
+        auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(),
+                                 infer_request.get_input_tensor().get_shape(),
+                                 input_data.data());
+        infer_request.set_input_tensor(tensor);
+
+        // -------- Do inference synchronously --------
+        for (int iter = 0; iter < 100; iter++) {
+            infer_request.infer();
+        }
+
+        // -------- Process output --------
+        auto output_tensor = infer_request.get_output_tensor();
+        std::cout << "\n"
+                  << "output_tensor: " << output_tensor.get_shape() << std::endl;
+        std::cout << std::endl;
+    }
+}
+
+TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_1) {
+    {
+        // -------- Construct model --------
+        unsigned long m, k, n;
+        m = k = 32768;
+        n = 65536;
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{m, m});
+        std::vector<float> input_data(m * m, 1);
+        std::vector<float> weights(k * n, 2);
+        // std::vector<float> result(m * n, 2);
+
+        auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{n, k}, weights.data());
+
+        std::cout << "input_shape: " << m << " * " << k << std::endl;
+        std::cout << "weight_shape: " << k << " * " << n << std::endl;
+        std::cout << "output_shape: " << m << " * " << n << std::endl;
+        std::cout << std::endl;
+
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, input2, no_bias, ov::element::i8);
+        const auto relu = std::make_shared<ov::op::v0::Relu>(fc);
+        auto model = std::make_shared<ov::Model>(ov::NodeVector{relu}, ov::ParameterVector{input1});
+
+        // ov::serialize(model, "./model_fc.xml", "./model_fc.bin");
+
+        // -------- Loading a model to the device --------
+        ov::Core core;
+        // ov::CompiledModel compiled_model = core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1"));
+        ov::CompiledModel compiled_model =
+            core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3"));
+
+        // -------- Create an infer request --------
+        ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+        // -------- Prepare input --------
+        auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(),
+                                 infer_request.get_input_tensor().get_shape(),
+                                 input_data.data());
+        infer_request.set_input_tensor(tensor);
+
+        // -------- Do inference synchronously --------
+        for (int iter = 0; iter < 2; iter++) {
+            infer_request.infer();
+        }
+
+        // -------- Process output --------
+        auto output_tensor = infer_request.get_output_tensor();
+        std::cout << "\n"
+                  << "output_tensor: " << output_tensor.get_shape() << std::endl;
+        std::cout << std::endl;
+    }
+}
+
+TEST(TransformationTestsF1, FullyConnectedSplitInput_xj_2) {
+    {
+        // -------- Construct model --------
+        unsigned long m, k, n;
+        m = k = n = 65536;
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{m, n});
+        std::vector<float> input_data(m * k, 1);
+        std::vector<float> weights(k * n, 2);
+        // std::vector<float> result(m * n, 2);
+
+        auto input2 = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{k, n}, weights.data());
+
+        std::cout << "input_shape: " << m << " * " << k << std::endl;
+        std::cout << "weight_shape: " << k << " * " << n << std::endl;
+        std::cout << "output_shape: " << m << " * " << n << std::endl;
+        std::cout << std::endl;
+
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, input2, no_bias, ov::element::i8);
+        const auto relu = std::make_shared<ov::op::v0::Relu>(fc);
+        auto model = std::make_shared<ov::Model>(ov::NodeVector{relu}, ov::ParameterVector{input1});
+
+        // ov::serialize(model, "./model_fc.xml", "./model_fc.bin");
+
+        // -------- Loading a model to the device --------
+        ov::Core core;
+        ov::CompiledModel compiled_model =
+            core.compile_model(model, "GPU", ov::device::priorities("GPU.0,GPU.1,GPU.2,GPU.3"));
+
+        // -------- Create an infer request --------
+        ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+        // -------- Prepare input --------
+        auto tensor = ov::Tensor(infer_request.get_input_tensor().get_element_type(),
+                                 infer_request.get_input_tensor().get_shape(),
+                                 input_data.data());
+        infer_request.set_input_tensor(tensor);
+
+        // -------- Do inference synchronously --------
+        for (int iter = 0; iter < 10; iter++) { infer_request.infer(); }