From 3ff094330638b01b74d08c0736adc58408466e4c Mon Sep 17 00:00:00 2001
From: Dmitry Matveev
Date: Tue, 15 Oct 2024 12:23:13 +0100
Subject: [PATCH] NPUW: Function memory management (#27043)

### Details:
- Optimize out temporary results (activations) when possible

### Tickets:
- *ticket-id*
---
 .../src/plugin/npuw/compiled_model.cpp        |   5 -
 .../src/plugin/npuw/compiled_model.hpp        |   2 +
 .../plugin/npuw/just_sync_infer_request.cpp   | 211 +++++++++++++++---
 .../plugin/npuw/just_sync_infer_request.hpp   |  64 +++++-
 4 files changed, 238 insertions(+), 44 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 2fe90eb82c41bb..a312a806cac4bc 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -516,11 +516,6 @@ std::string ov::npuw::CompiledModel::global_mem_device() const {
 }
 
 std::string ov::npuw::CompiledModel::funcall_mem_device(const std::size_t idx) const {
-    // FIXME: currently we allocate intermediate tensors for EVERY submodel.
-    // It's not feasible to allocate them in L0 due to high memory consumption.
-    // Until we make such memory reusable, hard-coding those tensors to CPU.
-    return "CPU";
-
     // Force globally set device if set
     const std::string device_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
     if (!device_alloc.empty()) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index 4152d08275ba6d..038c1bb176b029 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -46,6 +46,8 @@ class CompiledModel : public ov::ICompiledModel {
     // FIXME: This class has many friends..
     friend class IBaseInferRequest;
     friend class JustInferRequest;
+    friend class MemAccessSim;
+    friend class FuncMemMgr;
 
     bool compile_for_success(std::size_t id);
     bool compile_for_device(std::size_t id, const std::string& device_to_try);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
index fbbabf083bccd8..c4e2c3ee98b676 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -20,8 +20,173 @@
 #include "util.hpp"
 #include "weights_bank.hpp"
 
+ov::npuw::MemAccessSim::MemAccessSim(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model) {
+    LOG_VERB("Running memory access simulation...");
+    LOG_BLOCK();
+
+    // Initialize the read list
+    m_read_list.resize(compiled_model->m_compiled_submodels.size());
+
+    // Initialize read counters for tensors in the graph:
+    // 1. Interconnect
+    for (const auto& kvp : compiled_model->m_submodels_input_to_prev_output) {
+        const auto& read_to = kvp.first;     // who reads
+        const auto& read_from = kvp.second;  // reads what
+
+        if (read_to == CompiledModel::NO_LINK || read_from == CompiledModel::NO_LINK) {
+            continue;
+        }
+
+        // Record # of reads for this particular Source
+        m_remaining_reads[read_from]++;
+
+        // Record a read request for this particular Subgraph (who reads the Source)
+        m_read_list[read_to.first].push_back(read_from);
+    }
+    // 2. Global model's outputs
+    for (auto&& read_from : compiled_model->m_outputs_to_submodels_outputs) {
+        m_remaining_reads[read_from]++;
+    }
+
+    LOG_VERB("Done");
+}
+
+const ov::npuw::MemAccessSim::ReadList& ov::npuw::MemAccessSim::read_list(std::size_t idx) const {
+    return m_read_list.at(idx);
+}
+
+std::size_t ov::npuw::MemAccessSim::remaining_reads(const LinkFrom& from) {
+    return m_remaining_reads.at(from);
+}
+
+void ov::npuw::MemAccessSim::register_read(const LinkFrom& from) {
+    m_remaining_reads.at(from)--;
+}
+
+ov::npuw::FuncMemMgr::FuncMemMgr(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
+    : m_sim(compiled_model),
+      m_model(compiled_model) {}
+
+void ov::npuw::FuncMemMgr::set_alloc(AllocFcn&& fcn) {
+    m_alloc = std::move(fcn);
+}
+
+void ov::npuw::FuncMemMgr::assign_memory() {
+    LOG_VERB("Assigning function memory...");
+    LOG_BLOCK();
+
+    const auto num_submodels = m_model->m_compiled_submodels.size();
+
+    // Walk over the subgraphs, pre-allocate and pre-assign tensors to the subgraphs'
+    // outputs.
+    for (std::size_t idx = 0u; idx < num_submodels; idx++) {
+        LOG_VERB("Process Subgraph[" << idx << "]");
+        LOG_BLOCK();
+        const auto& comp_model_desc = m_model->m_compiled_submodels[idx];
+        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+            // no model & no funcall - optimized out, do nothing
+            continue;
+        }
+
+        // Simulate subgraph execution: poll its input list first
+        const auto& read_list = m_sim.read_list(idx);
+
+        // Now, get the outputs for the subgraph. If it is "regular", there's
+        // nothing to do - this subgraph owns its outputs on its own.
+        // If it is a function, though - look up in the function's memory storage.
+        if (comp_model_desc.replaced_by) {
+            const auto real_idx = comp_model_desc.replaced_by.value();
+            const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];
+
+            const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
+            for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
+                const LinkFrom this_out = LinkFrom{idx, out_idx};
+                assign(this_out);
+            }
+        }
+
+        // Here happens the imaginary execution... Hocus pocus, done - that's a
+        // simulation after all.
+        // After the execution, mark that the read_list was read.
+        for (auto&& from : read_list) {
+            m_sim.register_read(from);
+        }
+        LOG_VERB("Done");
+    }
+
+    // Report memory residency
+    for (auto&& m : m_memory) {
+        LOG_VERB("Function " << m.first.first << "/out port " << m.first.second << " : maximum memory residency "
+                             << m.second.size() << " tensor(s)");
+    }
+
+    LOG_VERB("Done");
+}
+
+void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) {
+    // This method is the center of the function memory management.
+    // The logic is simple:
+    // - Look for an output tensor to reuse
+    // - If there's one, assign it to this allocation
+    // - If there's none, allocate a new tensor
+    // - How a tensor to reuse is picked:
+    //   1. It should exist
+    //   2. Its "remaining reads" count should be 0 (all planned reads
+    //      happened at this point).
+    // The tensor storage is organized like this:
+    // - Function: Here we use .replaced_by as a function identifier; taken from `from`
+    // - Output index: taken from `from`
+    // - A vector of resident tensors
+
+    LOG_VERB("Assigning tensor for Subgraph[" << from.first << "]/" << from.second << "...");
+    LOG_BLOCK();
+
+    const auto& comp_model_desc = m_model->m_compiled_submodels[from.first];
+    NPUW_ASSERT(comp_model_desc.replaced_by.has_value());
+
+    const auto real_idx = comp_model_desc.replaced_by.value();
+
+    FO func_output = {real_idx, from.second};
+    auto& assigned_memory = m_memory[func_output];
+    auto asgn_iter = std::find_if(assigned_memory.begin(), assigned_memory.end(), [&](Assignment& a) {
+        return m_sim.remaining_reads(a.from) == 0u;
+    });
+    if (asgn_iter != assigned_memory.end()) {
+        // Reassign this memory slot to the new "from"
+        asgn_iter->from = from;
+        m_table[from] = asgn_iter->ptr;
+    } else {
+        // No free space at this point - allocate a new tensor
+        const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];
+        const auto& proto_comp_model = proto_comp_model_desc.compiled_model;
+
+        const auto& oport = proto_comp_model->outputs()[from.second];
+        ov::Shape oshape = oport.get_shape();
+
+        if (proto_comp_model_desc.spatial) {
+            oshape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
+        }
+        const auto& device = m_model->funcall_mem_device(real_idx);
+        TensorPtr new_tensor = m_alloc(oport.get_element_type(), oshape, device);
+        NPUW_ASSERT(new_tensor);
+
+        assigned_memory.push_back(Assignment{new_tensor, from});
+        m_table[from] = new_tensor;
+    }
+    LOG_VERB("Done");
+}
+
+ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
+    return m_table.at(from);
+}
+
 ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
-    : IBaseInferRequest(compiled_model) {
+    : IBaseInferRequest(compiled_model),
+      m_func_mem_mgr(compiled_model) {
+    using namespace std::placeholders;
+    m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
+    m_func_mem_mgr.assign_memory();
+
     m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
     if (m_use_function_pipelining) {
         LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name
@@ -67,27 +232,20 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
                 for (auto&& p : proto_comp_model_desc.spatial->params) {
                     const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
                     m_spatial_io[real_idx].input_tails[p.idx] =
-                        allocTensor(iport, m_npuw_model->funcall_mem_device(real_idx));
+                        allocOut(iport, m_npuw_model->funcall_mem_device(real_idx));
                 }
                 const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
                 for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
                     const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
                     m_spatial_io[real_idx].output_tails[out_idx] =
-                        allocTensor(oport, m_npuw_model->funcall_mem_device(real_idx));
+                        allocOut(oport, m_npuw_model->funcall_mem_device(real_idx));
                 }
             }  // if(spatial)
 
             for (size_t out_idx = 0; out_idx < num_outputs; out_idx++) {
-                const auto& port = proto_comp_model->outputs()[out_idx];
-                ov::Shape shape = port.get_shape();
-
-                // If the subgraph is spatial, promote the output size to the full vector size
-                if (proto_comp_model_desc.spatial) {
-                    shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
-                }
-                m_funcall_result[LinkFrom{i, out_idx}] =
-                    allocTensor(port.get_element_type(), shape, m_npuw_model->funcall_mem_device(real_idx));
+                const auto from = LinkFrom{i, out_idx};
+                m_funcall_result[from] = m_func_mem_mgr.get_tensor(from);
             }
 
             if (real_idx != i) {
                 // If this function call is NOT the function body, do nothing here - the original
@@ -152,7 +310,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
     for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
         const auto& port = m_npuw_model->inputs()[i];
-        ov::SoPtr<ov::ITensor> allocated = allocTensor(port, m_npuw_model->global_mem_device());
+        ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
         m_input_tensors.push_back(allocated);
         m_input_allocated.insert(allocated->data());
         m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
@@ -174,7 +332,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
         const auto& tensor = funcall_result_iter != m_funcall_result.end()
                                  ? funcall_result_iter->second  // Function calls have their tensors allocated, so just use one
-                                 : allocTensor(port, m_npuw_model->global_mem_device());
+                                 : allocOut(port, m_npuw_model->global_mem_device());
 
         m_output_tensors.push_back(tensor);
         m_port_to_tensor[port] = TensorStorage{tensor, true};
@@ -920,27 +1078,22 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
     }  // if (replaced_by)
 }
 
-ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::element::Type type,
-                                                               const ov::Shape& shape,
-                                                               const std::string& device) {
+ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type,
+                                                         const ov::Shape& shape,
+                                                         const std::string& device) {
     if (device == "CPU" || ov::shape_size(shape) == 0) {
         return ov::get_tensor_impl(ov::Tensor(type, shape));
    }
 
-    ov::SoPtr<ov::ITensor> remote_tensor;
-    ov::Tensor allocated_tensor;
-    {
-        std::lock_guard<std::mutex> guard(m_alloc_mutex);
-        m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
-        remote_tensor = m_remote_ctx->create_host_tensor(type, shape);
-        allocated_tensor = ov::make_tensor(remote_tensor);
-    }
-    return ov::get_tensor_impl(allocated_tensor);
+    std::lock_guard<std::mutex> guard(m_alloc_mutex);
+    auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
+    auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
+    return ov::get_tensor_impl(ov::make_tensor(remote_tensor));
 }
 
-ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::Output<const ov::Node>& node,
-                                                               const std::string& device) {
-    return allocTensor(node.get_element_type(), node.get_shape(), device);
+ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output<const ov::Node>& node,
+                                                         const std::string& device) {
+    return allocMem(node.get_element_type(), node.get_shape(), device);
 }
 
 void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
index 7335b54c30062e..88838d8b39d75f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
@@ -22,6 +22,56 @@ namespace npuw {
 class CompiledModel;
 class AsyncInferRequest;
 
+using LinkFrom = std::pair<std::size_t, std::size_t>;  // FIXME: This is a third, if not fourth, definition of such structure
+
+using TensorPtr = ov::SoPtr<ov::ITensor>;
+
+class MemAccessSim {
+public:
+    explicit MemAccessSim(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+
+    using ReadList = std::list<LinkFrom>;
+    const ReadList& read_list(std::size_t idx) const;
+
+    std::size_t remaining_reads(const LinkFrom& from);
+    void register_read(const LinkFrom& from);
+
+private:
+    std::map<LinkFrom, std::size_t> m_remaining_reads;
+    std::vector<ReadList> m_read_list;
+};
+
+class FuncMemMgr {
+    MemAccessSim m_sim;
+    std::shared_ptr<ov::npuw::CompiledModel> m_model;
+
+    void assign(const LinkFrom& from);
+
+    // Function ID -> Output port number
+    using FO = std::pair<std::size_t, std::size_t>;
+    struct Assignment {
+        TensorPtr ptr;
+        LinkFrom from;
+    };
+    std::map<FO, std::vector<Assignment>> m_memory;  // Dynamic assignment table
+    std::map<LinkFrom, TensorPtr> m_table;           // Static allocation/assignment table
+
+public:
+    explicit FuncMemMgr(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+
+    using AllocFcn = std::function<TensorPtr(const ov::element::Type&, const ov::Shape&, const std::string&)>;
+    void set_alloc(AllocFcn&& fcn);
+    void assign_memory();
+
+    TensorPtr get_tensor(const LinkFrom& from);
+
+private:
+    AllocFcn m_alloc;
+};
+
 class JustInferRequest final : public IBaseInferRequest {
 public:
     explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
@@ -64,15 +114,11 @@ class JustInferRequest final : public IBaseInferRequest {
     void connect_subrequests();
     void recreate_subrequests(std::size_t idx);
 
-    ov::SoPtr<ov::ITensor> allocTensor(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
-    ov::SoPtr<ov::ITensor> allocTensor(const ov::Output<const ov::Node>& node, const std::string& device);
+    TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
+    TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device);
 
-    using LinkFrom = std::pair<std::size_t, std::size_t>;  // FIXME: This is a third, if not fourth, definitiion of such structure
-    using TensorPtr = ov::SoPtr<ov::ITensor>;
-    std::map<LinkFrom, TensorPtr> m_funcall_result;
+    FuncMemMgr m_func_mem_mgr;                       // Owns memory
+    std::map<LinkFrom, TensorPtr> m_funcall_result;  // Provides a convenient link
 
     bool is_pipelined(std::size_t idx) const;
     bool m_use_function_pipelining = false;
@@ -103,8 +149,6 @@ class JustInferRequest final : public IBaseInferRequest {
     std::vector<GlobalIO> m_subrequests_gio;
 
     std::mutex m_alloc_mutex;
-    std::shared_ptr<ov::IRemoteContext> m_remote_ctx = nullptr;
-
     std::unordered_set<void*> m_input_allocated;
 };
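Below is a standalone sketch (not part of the patch) of the reuse policy that `FuncMemMgr::assign` implements: a producer is handed an already-allocated tensor as soon as all planned reads of that tensor's previous producer have been simulated, and a new tensor is allocated only when no such slot exists. `Producer`, `Buffer`, `Slot`, and the hard-coded read counts are illustrative stand-ins, not types from the plugin.

```cpp
// Minimal illustration of the "reuse when remaining reads == 0" policy, reduced to ints.
#include <cstddef>
#include <iostream>
#include <map>
#include <memory>
#include <utility>
#include <vector>

using Producer = std::pair<std::size_t, std::size_t>;  // {subgraph idx, output idx}
using Buffer = std::shared_ptr<int>;                    // stand-in for a real tensor

struct Slot {
    Buffer buf;
    Producer owner;  // who wrote this buffer last
};

int main() {
    // Planned number of reads per producer (what MemAccessSim precomputes up front)
    std::map<Producer, std::size_t> remaining_reads = {{{0, 0}, 1}, {{1, 0}, 1}, {{2, 0}, 1}};

    std::vector<Slot> pool;                 // resident buffers for one function/output pair
    std::map<Producer, Buffer> assignment;  // producer -> buffer it will write into

    auto assign = [&](const Producer& p) {
        // Reuse a slot whose previous owner has no reads left; otherwise allocate a new one.
        for (auto& slot : pool) {
            if (remaining_reads[slot.owner] == 0) {
                slot.owner = p;
                assignment[p] = slot.buf;
                return;
            }
        }
        pool.push_back(Slot{std::make_shared<int>(0), p});
        assignment[p] = pool.back().buf;
    };

    // Simulate three function calls in a chain: call N reads the output of call N-1.
    for (std::size_t idx = 0; idx < 3; idx++) {
        assign({idx, 0});
        if (idx > 0) {
            remaining_reads[{idx - 1, 0}]--;  // the "imaginary execution" consumes the input
        }
    }
    std::cout << "calls: 3, resident buffers: " << pool.size() << "\n";  // prints 2
}
```

For a simple chain of calls this settles at two resident buffers that ping-pong between producer and consumer, which mirrors the "maximum memory residency" figure `assign_memory()` logs via LOG_VERB.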