From 3ff094330638b01b74d08c0736adc58408466e4c Mon Sep 17 00:00:00 2001
From: Dmitry Matveev
Date: Tue, 15 Oct 2024 12:23:13 +0100
Subject: [PATCH] NPUW: Function memory management (#27043)

### Details:
- Optimize out temporary results (activations) when possible

### Tickets:
- *ticket-id*
---
 .../src/plugin/npuw/compiled_model.cpp        |   5 -
 .../src/plugin/npuw/compiled_model.hpp        |   2 +
 .../plugin/npuw/just_sync_infer_request.cpp   | 211 +++++++++++++++---
 .../plugin/npuw/just_sync_infer_request.hpp   |  64 +++++-
 4 files changed, 238 insertions(+), 44 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 2fe90eb82c41bb..a312a806cac4bc 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -516,11 +516,6 @@ std::string ov::npuw::CompiledModel::global_mem_device() const {
 }
 
 std::string ov::npuw::CompiledModel::funcall_mem_device(const std::size_t idx) const {
-    // FIXME: currently we allocate intermediate tensors for EVERY submodel.
-    // It's not feasible to allocate them in L0 due to high memory consumption.
-    // Until we make such memory reusable, hard-coding those tensors to CPU.
-    return "CPU";
-
     // Force globally set device if set
     const std::string device_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
     if (!device_alloc.empty()) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index 4152d08275ba6d..038c1bb176b029 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -46,6 +46,8 @@ class CompiledModel : public ov::ICompiledModel {
     // FIXME: This class has many friends..
     friend class IBaseInferRequest;
     friend class JustInferRequest;
+    friend class MemAccessSim;
+    friend class FuncMemMgr;
 
     bool compile_for_success(std::size_t id);
     bool compile_for_device(std::size_t id, const std::string& device_to_try);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
index fbbabf083bccd8..c4e2c3ee98b676 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -20,8 +20,173 @@
 #include "util.hpp"
 #include "weights_bank.hpp"
 
+ov::npuw::MemAccessSim::MemAccessSim(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model) {
+    LOG_VERB("Running memory access simulation...");
+    LOG_BLOCK();
+
+    // Initialize the read list
+    m_read_list.resize(compiled_model->m_compiled_submodels.size());
+
+    // Initialize read counters for tensors in the graph:
+    // 1. Interconnect
+    for (const auto& kvp : compiled_model->m_submodels_input_to_prev_output) {
+        const auto& read_to = kvp.first;     // who reads
+        const auto& read_from = kvp.second;  // reads what
+
+        if (read_to == CompiledModel::NO_LINK || read_from == CompiledModel::NO_LINK) {
+            continue;
+        }
+
+        // Record # of reads for this particular Source
+        m_remaining_reads[read_from]++;
+
+        // Record a read request for this particular Subgraph (who reads the Source)
+        m_read_list[read_to.first].push_back(read_from);
+    }
+    // 2. Global model's outputs
+    for (auto&& read_from : compiled_model->m_outputs_to_submodels_outputs) {
+        m_remaining_reads[read_from]++;
+    }
+
+    LOG_VERB("Done");
+}
+
+const ov::npuw::MemAccessSim::ReadList& ov::npuw::MemAccessSim::read_list(std::size_t idx) const {
+    return m_read_list.at(idx);
+}
+
+std::size_t ov::npuw::MemAccessSim::remaining_reads(const LinkFrom& from) {
+    return m_remaining_reads.at(from);
+}
+
+void ov::npuw::MemAccessSim::register_read(const LinkFrom& from) {
+    m_remaining_reads.at(from)--;
+}
+
+ov::npuw::FuncMemMgr::FuncMemMgr(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
+    : m_sim(compiled_model),
+      m_model(compiled_model) {}
+
+void ov::npuw::FuncMemMgr::set_alloc(AllocFcn&& fcn) {
+    m_alloc = std::move(fcn);
+}
+
+void ov::npuw::FuncMemMgr::assign_memory() {
+    LOG_VERB("Assigning function memory...");
+    LOG_BLOCK();
+
+    const auto num_submodels = m_model->m_compiled_submodels.size();
+
+    // Walk over the subgraphs, pre-allocate and pre-assign tensors to the subgraphs'
+    // outputs.
+    for (std::size_t idx = 0u; idx < num_submodels; idx++) {
+        LOG_VERB("Process Subgraph[" << idx << "]");
+        LOG_BLOCK();
+        const auto& comp_model_desc = m_model->m_compiled_submodels[idx];
+        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+            // no model & no funcall - optimized out, do nothing
+            continue;
+        }
+
+        // Simulate subgraph execution: poll its input list first
+        const auto& read_list = m_sim.read_list(idx);
+
+        // Now, get the outputs for the subgraph. If it is "regular", there's
+        // nothing to do - this subgraph owns its outputs on its own.
+        // If it is a function, though - look up in the function's memory storage.
+        if (comp_model_desc.replaced_by) {
+            const auto real_idx = comp_model_desc.replaced_by.value();
+            const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];
+
+            const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
+            for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
+                const LinkFrom this_out = LinkFrom{idx, out_idx};
+                assign(this_out);
+            }
+        }
+
+        // Here happens the imaginary execution... Hocus pocus, done - that's a
+        // simulation after all.
+        // After the execution, mark that the read_list was read.
+        for (auto&& from : read_list) {
+            m_sim.register_read(from);
+        }
+        LOG_VERB("Done");
+    }
+
+    // Report memory residency
+    for (auto&& m : m_memory) {
+        LOG_VERB("Function " << m.first.first << "/out port " << m.first.second << " : maximum memory residency "
+                             << m.second.size() << " tensor(s)");
+    }
+
+    LOG_VERB("Done");
+}
+
+void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) {
+    // This method is the center of the function memory management.
+    // The logic is simple:
+    // - Look for an output tensor to reuse
+    // - If there's one, assign it to this allocation
+    // - If there's none, allocate a new tensor
+    // - How a tensor to reuse is picked:
+    //   1. It should exist
+    //   2. Its "remaining reads" count should be 0 (all planned reads
+    //      happened at this point).
+    // The tensor storage is organized like this:
+    // - Function: Here we use .replaced_by as a function identifier; taken from `from`
+    // - Output index: taken from `from`
+    // - A vector of resident tensors
+
+    LOG_VERB("Assigning tensor for Subgraph[" << from.first << "]/" << from.second << "...");
+    LOG_BLOCK();
+
+    const auto& comp_model_desc = m_model->m_compiled_submodels[from.first];
+    NPUW_ASSERT(comp_model_desc.replaced_by.has_value());
+
+    const auto real_idx = comp_model_desc.replaced_by.value();
+
+    FO func_output = {real_idx, from.second};
+    auto& assigned_memory = m_memory[func_output];
+    auto asgn_iter = std::find_if(assigned_memory.begin(), assigned_memory.end(), [&](Assignment& a) {
+        return m_sim.remaining_reads(a.from) == 0u;
+    });
+    if (asgn_iter != assigned_memory.end()) {
+        // Reassign this memory slot to the new "from"
+        asgn_iter->from = from;
+        m_table[from] = asgn_iter->ptr;
+    } else {
+        // No free space at this point - allocate a new tensor
+        const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];
+        const auto& proto_comp_model = proto_comp_model_desc.compiled_model;
+
+        const auto& oport = proto_comp_model->outputs()[from.second];
+        ov::Shape oshape = oport.get_shape();
+
+        if (proto_comp_model_desc.spatial) {
+            oshape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
+        }
+        const auto& device = m_model->funcall_mem_device(real_idx);
+        TensorPtr new_tensor = m_alloc(oport.get_element_type(), oshape, device);
+        NPUW_ASSERT(new_tensor);
+
+        assigned_memory.push_back(Assignment{new_tensor, from});
+        m_table[from] = new_tensor;
+    }
+    LOG_VERB("Done");
+}
+
+ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
+    return m_table.at(from);
+}
+
 ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
-    : IBaseInferRequest(compiled_model) {
+    : IBaseInferRequest(compiled_model),
+      m_func_mem_mgr(compiled_model) {
+    using namespace std::placeholders;
+    m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
+    m_func_mem_mgr.assign_memory();
+
     m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
     if (m_use_function_pipelining) {
         LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name
@@ -67,27 +232,20 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
                 for (auto&& p : proto_comp_model_desc.spatial->params) {
                     const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
                     m_spatial_io[real_idx].input_tails[p.idx] =
-                        allocTensor(iport, m_npuw_model->funcall_mem_device(real_idx));
+                        allocOut(iport, m_npuw_model->funcall_mem_device(real_idx));
                 }
                 const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
                 for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
                     const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
                     m_spatial_io[real_idx].output_tails[out_idx] =
-                        allocTensor(oport, m_npuw_model->funcall_mem_device(real_idx));
+                        allocOut(oport, m_npuw_model->funcall_mem_device(real_idx));
                 }
             }  // if(spatial)
 
             for (size_t out_idx = 0; out_idx < num_outputs; out_idx++) {
-                const auto& port = proto_comp_model->outputs()[out_idx];
-                ov::Shape shape = port.get_shape();
-
-                // If the subgraph is spatial, promote the output size to the full vector size
-                if (proto_comp_model_desc.spatial) {
-                    shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
-                }
-                m_funcall_result[LinkFrom{i, out_idx}] =
-                    allocTensor(port.get_element_type(), shape, m_npuw_model->funcall_mem_device(real_idx));
+                const auto from = LinkFrom{i, out_idx};
+                m_funcall_result[from] = m_func_mem_mgr.get_tensor(from);
             }
 
             if (real_idx != i) {
                 // If this function call is NOT the function body, do nothing here - the original
@@ -152,7 +310,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
     for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
         const auto& port = m_npuw_model->inputs()[i];
-        ov::SoPtr<ov::ITensor> allocated = allocTensor(port, m_npuw_model->global_mem_device());
+        ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
         m_input_tensors.push_back(allocated);
         m_input_allocated.insert(allocated->data());
         m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
@@ -174,7 +332,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
         const auto& tensor = funcall_result_iter != m_funcall_result.end()
                                  ? funcall_result_iter->second  // Function calls have their tensors allocated, so just use one
-                                 : allocTensor(port, m_npuw_model->global_mem_device());
+                                 : allocOut(port, m_npuw_model->global_mem_device());
 
         m_output_tensors.push_back(tensor);
         m_port_to_tensor[port] = TensorStorage{tensor, true};
@@ -920,27 +1078,22 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
     }  // if (replaced_by)
 }
 
-ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::element::Type type,
-                                                               const ov::Shape& shape,
-                                                               const std::string& device) {
+ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type,
+                                                         const ov::Shape& shape,
+                                                         const std::string& device) {
     if (device == "CPU" || ov::shape_size(shape) == 0) {
         return ov::get_tensor_impl(ov::Tensor(type, shape));
    }
 
-    ov::SoPtr<ov::ITensor> remote_tensor;
-    ov::Tensor allocated_tensor;
-    {
-        std::lock_guard<std::mutex> guard(m_alloc_mutex);
-        m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
-        remote_tensor = m_remote_ctx->create_host_tensor(type, shape);
-        allocated_tensor = ov::make_tensor(remote_tensor);
-    }
-    return ov::get_tensor_impl(allocated_tensor);
+    std::lock_guard<std::mutex> guard(m_alloc_mutex);
+    auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
+    auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
+    return ov::get_tensor_impl(ov::make_tensor(remote_tensor));
 }
 
-ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::Output<const ov::Node>& node,
-                                                               const std::string& device) {
-    return allocTensor(node.get_element_type(), node.get_shape(), device);
+ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output<const ov::Node>& node,
+                                                         const std::string& device) {
+    return allocMem(node.get_element_type(), node.get_shape(), device);
 }
 
 void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
index 7335b54c30062e..88838d8b39d75f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
@@ -22,6 +22,56 @@ namespace npuw {
 class CompiledModel;
 class AsyncInferRequest;
 
+using LinkFrom = std::pair<std::size_t, std::size_t>;  // FIXME: This is a third, if not fourth, definition of such structure
+
+using TensorPtr = ov::SoPtr<ov::ITensor>;
+
+class MemAccessSim {
+public:
+    explicit MemAccessSim(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+
+    using ReadList = std::list<LinkFrom>;
+    const ReadList& read_list(std::size_t idx) const;
+
+    std::size_t remaining_reads(const LinkFrom& from);
+    void register_read(const LinkFrom& from);
+
+private:
+    std::map<LinkFrom, std::size_t> m_remaining_reads;
+    std::vector<ReadList> m_read_list;
+};
+
+class FuncMemMgr {
+    MemAccessSim m_sim;
+    std::shared_ptr<ov::npuw::CompiledModel> m_model;
+
+    void assign(const LinkFrom& from);
+
+    // Function ID -> Output port number
+    using FO = std::pair<std::size_t, std::size_t>;
+    struct Assignment {
+        TensorPtr ptr;
+        LinkFrom from;
+    };
+    std::map<FO, std::vector<Assignment>> m_memory;  // Dynamic assignment table
+    std::map<LinkFrom, TensorPtr> m_table;           // Static allocation/assignment table
+
+public:
+    explicit FuncMemMgr(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+
+    using AllocFcn = std::function<TensorPtr(const ov::element::Type&, const ov::Shape&, const std::string&)>;
+    void set_alloc(AllocFcn&& fcn);
+    void assign_memory();
+
+    TensorPtr get_tensor(const LinkFrom& from);
+
+private:
+    AllocFcn m_alloc;
+};
+
 class JustInferRequest final : public IBaseInferRequest {
 public:
     explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
@@ -64,15 +114,11 @@ class JustInferRequest final : public IBaseInferRequest {
     void connect_subrequests();
     void recreate_subrequests(std::size_t idx);
 
-    ov::SoPtr<ov::ITensor> allocTensor(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
-    ov::SoPtr<ov::ITensor> allocTensor(const ov::Output<const ov::Node>& node, const std::string& device);
+    TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
+    TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device);
 
-    using LinkFrom = std::pair<std::size_t, std::size_t>;  // FIXME: This is a third, if not fourth, definitiion of such structure
-    using TensorPtr = ov::SoPtr<ov::ITensor>;
-    std::map<LinkFrom, TensorPtr> m_funcall_result;
+    FuncMemMgr m_func_mem_mgr;                       // Owns memory
+    std::map<LinkFrom, TensorPtr> m_funcall_result;  // Provides a convenient link
 
     bool is_pipelined(std::size_t idx) const;
     bool m_use_function_pipelining = false;
@@ -103,8 +149,6 @@ class JustInferRequest final : public IBaseInferRequest {
     std::vector<GlobalIO> m_subrequests_gio;
 
     std::mutex m_alloc_mutex;
-    std::shared_ptr<ov::IRemoteContext> m_remote_ctx = nullptr;
-
     std::unordered_set<void*> m_input_allocated;
 };
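Below is a standalone sketch (not part of the patch) of the reuse policy that `FuncMemMgr::assign` implements: a producer is handed an already-allocated tensor as soon as all planned reads of that tensor's previous producer have been simulated, and a new tensor is allocated only when no such slot exists. `Producer`, `Buffer`, `Slot`, and the hard-coded read counts are illustrative stand-ins, not types from the plugin.

```cpp
// Minimal illustration of the "reuse when remaining reads == 0" policy, reduced to ints.
#include <cstddef>
#include <iostream>
#include <map>
#include <memory>
#include <utility>
#include <vector>

using Producer = std::pair<std::size_t, std::size_t>;  // {subgraph idx, output idx}
using Buffer = std::shared_ptr<int>;                    // stand-in for a real tensor

struct Slot {
    Buffer buf;
    Producer owner;  // who wrote this buffer last
};

int main() {
    // Planned number of reads per producer (what MemAccessSim precomputes up front)
    std::map<Producer, std::size_t> remaining_reads = {{{0, 0}, 1}, {{1, 0}, 1}, {{2, 0}, 1}};

    std::vector<Slot> pool;                 // resident buffers for one function/output pair
    std::map<Producer, Buffer> assignment;  // producer -> buffer it will write into

    auto assign = [&](const Producer& p) {
        // Reuse a slot whose previous owner has no reads left; otherwise allocate a new one.
        for (auto& slot : pool) {
            if (remaining_reads[slot.owner] == 0) {
                slot.owner = p;
                assignment[p] = slot.buf;
                return;
            }
        }
        pool.push_back(Slot{std::make_shared<int>(0), p});
        assignment[p] = pool.back().buf;
    };

    // Simulate three function calls in a chain: call N reads the output of call N-1.
    for (std::size_t idx = 0; idx < 3; idx++) {
        assign({idx, 0});
        if (idx > 0) {
            remaining_reads[{idx - 1, 0}]--;  // the "imaginary execution" consumes the input
        }
    }
    std::cout << "calls: 3, resident buffers: " << pool.size() << "\n";  // prints 2
}
```

For a simple chain of calls this settles at two resident buffers that ping-pong between producer and consumer, which mirrors the "maximum memory residency" figure `assign_memory()` logs via LOG_VERB.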