NPUW: Unfold infer requests #27319

Draft · wants to merge 5 commits into master
@@ -56,6 +56,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);
@@ -279,6 +279,14 @@ static constexpr ov::Property<bool> parallel_compilation{"NPUW_PARALLEL_COMPILE"
*/
static constexpr ov::Property<bool> funcall_async{"NPUW_FUNCALL_ASYNC"};

/**
* @brief
* Type: boolean
* Create individual infer requests for partitions, even repeating ones.
* Default value: false.
*/
static constexpr ov::Property<bool> unfold_ireqs{"NPUW_UNFOLD_IREQS"};

namespace accuracy {
/**
* @brief
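For context, a minimal usage sketch (not part of this diff) of how the new flag could be passed through compile_model(). The "NPU_USE_NPUW" switch and the string values are assumptions about the surrounding plugin configuration, not something this PR defines:

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");
    // Assumed configuration: route the model through NPUW and ask for
    // unfolded (per-subgraph) infer requests.
    auto compiled = core.compile_model(model, "NPU",
                                       ov::AnyMap{{"NPU_USE_NPUW", "YES"},
                                                  {"NPUW_UNFOLD_IREQS", "YES"}});
    auto request = compiled.create_infer_request();
    return 0;
}
```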
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -38,6 +38,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_FUNCALL_FOR_ALL>();
desc.add<NPUW_PARALLEL_COMPILE>();
desc.add<NPUW_FUNCALL_ASYNC>();
desc.add<NPUW_UNFOLD_IREQS>();
desc.add<NPUW_WEIGHTS_BANK>();
desc.add<NPUW_WEIGHTS_BANK_ALLOC>();
desc.add<NPUW_CACHE_DIR>();
18 changes: 12 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -21,6 +21,7 @@
#include "openvino/util/common_util.hpp"
#include "partitioning/patterns/opt.hpp"
#include "plugin.hpp"
#include "unfold_sync_infer_request.hpp"
#include "util.hpp"

// required for get_properties_per_device()
@@ -708,16 +709,20 @@ void ov::npuw::CompiledModel::dump_on_fail(std::size_t id, const std::string& de
}
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_just_sync_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr);
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infer_request() const {
// Synchronous infer request implementation may vary based on the
// selected strategy
auto* non_const_this = const_cast<ov::npuw::CompiledModel*>(this); // because of const in API
return non_const_this->create_just_sync_infer_request();
auto non_const_this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(non_const_this->shared_from_this());

std::shared_ptr<ov::ISyncInferRequest> result;
if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) {
result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr));
} else {
result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr));
}
NPUW_ASSERT(result);
return result;
}

std::shared_ptr<ov::IAsyncInferRequest> ov::npuw::CompiledModel::create_infer_request() const {
@@ -934,6 +939,7 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel {
// FIXME: This class has many friends..
friend class IBaseInferRequest;
friend class JustInferRequest;
friend class UnfoldInferRequest;
friend class MemAccessSim;
friend class FuncMemMgr;

@@ -66,7 +67,6 @@

std::shared_ptr<const ::intel_npu::Plugin> get_npuw_plugin() const;

std::shared_ptr<ov::ISyncInferRequest> create_just_sync_infer_request();
std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;

std::string submodel_device(const std::size_t idx) const;
@@ -180,9 +180,15 @@ ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
return m_table.at(from);
}

ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
bool real_work)
: IBaseInferRequest(compiled_model),
m_func_mem_mgr(compiled_model) {
if (!real_work) {
// FIXME: Fragile base class
return;
}

using namespace std::placeholders;
m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
m_func_mem_mgr.assign_memory();
@@ -597,7 +603,7 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {
LOG_BLOCK();
if (!is_spatial_param(sub_in_idx)) {
// Input parameter is non-spatial, do normal handling
if (do_copy || m_input_allocated.count(g_tnsr->data()) == 0) {
if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) {
LOG_DEBUG("Will be copied");
copy_list.emplace_back(g_tnsr, s_port);
} else {
@@ -73,15 +73,15 @@ class FuncMemMgr {
AllocFcn m_alloc;
};

class JustInferRequest final : public IBaseInferRequest {
class JustInferRequest : public IBaseInferRequest {
public:
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model, bool real_work = true);

// Query APIs
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
std::vector<ov::ProfilingInfo> get_profiling_info() const override;

private:
protected:
////////////////////////////////////
// implement IBaseInferRequest
void prepare_for_infer() override;
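To make that subclassing possible, the PR drops `final` from JustInferRequest, relaxes `private:` to `protected:`, and adds the `real_work` flag so a derived request can skip the base constructor's own setup. A reduced sketch of the pattern, with hypothetical class names rather than the actual ones:

```cpp
// Sketch of the constructor-delegation pattern used above: the base class
// performs its heavy setup only when real_work is true; a derived class
// passes false and installs its own setup instead. This is what the
// "fragile base class" FIXME warns about: the early return must stay in
// sync with whatever the derived constructor re-implements.
struct BaseRequest {
    explicit BaseRequest(bool real_work = true) {
        if (!real_work) {
            return;  // the derived constructor takes over from here
        }
        // ... base-specific allocation and subrequest wiring ...
    }
    virtual ~BaseRequest() = default;
};

struct UnfoldedRequest : BaseRequest {
    UnfoldedRequest() : BaseRequest(/*real_work=*/false) {
        // ... simpler, unfolded wiring done here instead ...
    }
};
```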
238 changes: 238 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
@@ -0,0 +1,238 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/core/parallel.hpp"

#include "unfold_sync_infer_request.hpp"
#include "compiled_model.hpp"
#include "logging.hpp"

ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
: ov::npuw::JustInferRequest(compiled_model, false) {
// Create infer requests
// Preallocate funcall tensors & substitute function call requests
for (std::size_t i = 0; i < m_num_submodels; i++) {
LOG_INFO("Creating infer request for Subgraph[" << i << "]...");
LOG_BLOCK();
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];

if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
// no model & no funcall - optimized out, do nothing
LOG_INFO("OPTIMIZED OUT");
continue;
}

if (comp_model_desc.replaced_by) {
// Pre-allocate output tensors for this function call
const auto real_idx = comp_model_desc.replaced_by.value();
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
if (proto_comp_model_desc.spatial) {
NPUW_ASSERT(false && "Spatial is not supported in unfold");
}
} // if(replaced_by)

const auto real_idx = comp_model_desc.replaced_by.value_or(i);
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request();
m_subrequest_devices[i] = *proto_comp_model_desc.device_it;
LOG_INFO("DONE");
} // for(submodels)

// Preallocate input tensors. Note - there may be
// multiple subrequest consumers on the same input tensor
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
m_input_tensors.push_back(allocated);
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
} // for(inputs)

// Preallocate output tensors
LOG_INFO("Preallocating output tensors...");
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
LOG_BLOCK();
const auto& port = m_npuw_model->outputs()[i];
LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port);

// FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);

LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);
const auto& tensor = allocOut(port, m_npuw_model->global_mem_device());

m_output_tensors.push_back(tensor);
m_port_to_tensor[port] = TensorStorage{tensor, true};
}

LOG_INFO("Connecting subrequests...");
LOG_BLOCK();
for (const auto& kvp : m_npuw_model->m_submodels_input_to_prev_output) {
const auto& subm_idx_to = kvp.first.first;
const auto& port_idx_to = kvp.first.second;
const auto& subm_idx_from = kvp.second.first;
const auto& port_idx_from = kvp.second.second;

LOG_DEBUG("Subgraph[" << subm_idx_from << "]/" << port_idx_from << " --> "
<< "Subgraph[" << subm_idx_to << "]/" << port_idx_to);
NPUW_ASSERT(m_subrequests[subm_idx_from]); // prod request is created
NPUW_ASSERT(m_subrequests[subm_idx_to]); // cons request is created
NPUW_ASSERT(m_subrequests[subm_idx_from]._ptr != m_subrequests[subm_idx_to]._ptr);

const auto& iport = m_subrequests[subm_idx_to]->get_compiled_model()->inputs()[port_idx_to];
const auto& oport = m_subrequests[subm_idx_from]->get_compiled_model()->outputs()[port_idx_from];
const auto& tensor = m_subrequests[subm_idx_from]->get_tensor(oport);
LOG_DEBUG("Set Subgraph[" << subm_idx_to << "]/" << iport << " to Subgraph[" << subm_idx_from << "]/" << oport);
m_subrequests[subm_idx_to]->set_tensor(iport, tensor);
} // for(map)
LOG_INFO("Done");

// Build the parameter/result mapping {{{
m_subrequests_gio.resize(m_subrequests.size());

// Parameters: stage 1...
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i);
if (to_submodel != CompiledModel::NO_LINK) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[i] = in_idx;
}
} // for(inputs)

// Parameters: stage 2...
for (auto&& it : m_npuw_model->m_param_subscribers) {
const auto param_idx = it.first;
for (auto&& to_submodel : it.second) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx;
}
}

// Results
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
std::size_t sub_idx{}, out_idx{};
std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
m_subrequests_gio.at(sub_idx).global_results[i] = out_idx;
}
// }}}

for (size_t i = 0; i < m_num_submodels; i++) {
LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]...");
LOG_BLOCK();
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
continue; // Optimized out
}
unpack_closure(i, m_subrequests[i]);
LOG_VERB("Done");
}
}

void ov::npuw::UnfoldInferRequest::prepare(std::size_t idx) {
if (idx >= m_subrequests.size()) {
return;
}
auto& subr = m_subrequests.at(idx);
const bool do_copy = needs_copy(idx);

std::vector<std::pair<ov::SoPtr<ov::ITensor>, ov::Output<const ov::Node>>> copy_list;

// bind_global_parameters(), a simplified way
const auto& iodesc = m_subrequests_gio.at(idx);
for (auto&& it : iodesc.global_params) {
std::size_t param_idx{}, sub_in_idx{};
std::tie(param_idx, sub_in_idx) = it;
const auto& g_port = m_npuw_model->inputs()[param_idx];
const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor;
const auto& s_port = subr->get_inputs()[sub_in_idx];

if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) {
copy_list.emplace_back(g_tnsr, s_port);
} else {
subr->set_tensor(s_port, g_tnsr);
}
}

// bind_global_results, a simplified way
for (auto&& it : iodesc.global_results) {
std::size_t result_idx{}, sub_out_idx{};
std::tie(result_idx, sub_out_idx) = it;
const auto& g_port = m_npuw_model->outputs()[result_idx];
const auto& s_port = subr->get_outputs()[sub_out_idx];
subr->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor);
}

// run copy, if required
// NB: parallel_for was removed here as it causes more overhead for our (usually)
// small chunks copied. In the proper app-level pipeline, there must be no copy at all
for (auto &&it : copy_list) {
ov::SoPtr<ov::ITensor> dst = subr->get_tensor(it.second);
it.first->copy_to(dst._ptr);
}

// run host gather, if required
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
if (comp_model_desc.host_gather.dst_idx != -1) {
const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx];
const auto gather = subr->get_tensor(gport);

const auto& vocab =
comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx];
const auto lookup = subr->get_tensor(lport);
ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather);
}
}

void ov::npuw::UnfoldInferRequest::infer() {
const bool do_async = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();

auto wait_and_clear = [](RqPtrs &rqs) {
for (auto &&r : rqs) {
r->wait();
}
rqs.clear();
};

if (do_async) {
std::size_t past_repl_id = 0u;
RqPtrs previous_requests;

prepare(0);
for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
auto& subr = m_subrequests[idx];
if (!subr) {
continue;
}
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
const auto this_repl_id = comp_model_desc.replaced_by.value_or(idx);
if (this_repl_id != past_repl_id) {
// For non-repeating blocks, the above value_or returns idx
// For repeating blocks, it returns the function group id
// If either is not equal to the past_repl_id, make a barrier here
wait_and_clear(previous_requests);
past_repl_id = this_repl_id;
}
subr->start_async();
previous_requests.push_back(subr);
prepare(idx + 1);
}
wait_and_clear(previous_requests);
} else {
prepare(0);
for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
auto& subr = m_subrequests[idx];
if (!subr) {
prepare(idx + 1);
continue;
}
subr->start_async();
prepare(idx + 1);
subr->wait();
}
} // (async)
}
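The async branch of infer() follows a simple pipelining rule: keep starting subrequests while they belong to the same repeated block (the `replaced_by` id), and place a barrier whenever the block id changes. A standalone sketch of that scheduling rule, with hypothetical callbacks standing in for start_async()/wait() and the prepare()-ahead overlap omitted:

```cpp
#include <cstddef>
#include <functional>
#include <vector>

// Reduced model of the NPUW_FUNCALL_ASYNC scheduling above: subrequests that
// share a block id run back to back; a wait-for-all barrier is inserted
// whenever the block id changes.
void pipeline(const std::vector<std::size_t>& block_ids,
              const std::function<void(std::size_t)>& start_async,
              const std::function<void(std::size_t)>& wait) {
    std::vector<std::size_t> in_flight;
    std::size_t past_block = 0u;
    for (std::size_t idx = 0; idx < block_ids.size(); idx++) {
        if (block_ids[idx] != past_block) {
            for (auto r : in_flight) {
                wait(r);  // barrier between different blocks
            }
            in_flight.clear();
            past_block = block_ids[idx];
        }
        start_async(idx);
        in_flight.push_back(idx);
    }
    for (auto r : in_flight) {
        wait(r);  // final barrier
    }
}
```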