diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index 7b0dab3d16da3c..edd5b1367c217f 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -56,6 +56,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
 DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
 DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime);
 DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
+DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
 DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
 DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
 DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index 67dce9621bfb4e..d7761979339eb5 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -279,6 +279,14 @@ static constexpr ov::Property<bool> parallel_compilation{"NPUW_PARALLEL_COMPILE"
  */
 static constexpr ov::Property<bool> funcall_async{"NPUW_FUNCALL_ASYNC"};
 
+/**
+ * @brief
+ * Type: boolean
+ * Create individual infer requests for partitions, even repeating ones.
+ * Default value: false.
+ */
+static constexpr ov::Property<bool> unfold_ireqs{"NPUW_UNFOLD_IREQS"};
+
 namespace accuracy {
 /**
  * @brief
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index 6a519a0f754a32..a4478ba3c9dcd2 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -38,6 +38,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add();
     desc.add();
     desc.add();
+    desc.add<NPUW_UNFOLD_IREQS>();
     desc.add();
     desc.add();
     desc.add();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index b52dd40ea59364..a87c42b037efb3 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -21,6 +21,7 @@
 #include "openvino/util/common_util.hpp"
 #include "partitioning/patterns/opt.hpp"
 #include "plugin.hpp"
+#include "unfold_sync_infer_request.hpp"
 #include "util.hpp"
 
 // required for get_properties_per_device()
@@ -708,16 +709,20 @@ void ov::npuw::CompiledModel::dump_on_fail(std::size_t id, const std::string& de
     }
 }
 
-std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_just_sync_infer_request() {
-    auto this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(shared_from_this());
-    return std::make_shared<JustInferRequest>(this_sptr);
-}
-
 std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infer_request() const {
     // Synchronous infer request implementation may vary based on the
     // selected strategy
     auto* non_const_this = const_cast<ov::npuw::CompiledModel*>(this);  // because of const in API
-    return non_const_this->create_just_sync_infer_request();
+    auto non_const_this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(non_const_this->shared_from_this());
+
+    std::shared_ptr<ov::ISyncInferRequest> result;
+    if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) {
+        result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr));
+    } else {
+        result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr));
+    }
+    NPUW_ASSERT(result);
+    return result;
 }
 
 std::shared_ptr<ov::IAsyncInferRequest> ov::npuw::CompiledModel::create_infer_request() const {
@@ -934,6 +939,7 @@ void ov::npuw::CompiledModel::implement_properties() {
         BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
         BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
         BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
+        BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
         BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
         BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
         BIND(npuw::cache_dir, NPUW_CACHE_DIR),
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index 6199ac66c0c64e..e16bf7deb4497a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel {
     // FIXME: This class has many friends..
     friend class IBaseInferRequest;
     friend class JustInferRequest;
+    friend class UnfoldInferRequest;
     friend class MemAccessSim;
     friend class FuncMemMgr;
 
@@ -66,7 +67,6 @@ class CompiledModel : public ov::ICompiledModel {
 
     std::shared_ptr<const ::intel_npu::Plugin> get_npuw_plugin() const;
 
-    std::shared_ptr<ov::ISyncInferRequest> create_just_sync_infer_request();
     std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
 
     std::string submodel_device(const std::size_t idx) const;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
index 4a9a3e06a0aa16..7aa05019da7504 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -180,9 +180,15 @@ ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
     return m_table.at(from);
 }
 
-ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
+ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
+                                             bool real_work)
     : IBaseInferRequest(compiled_model),
       m_func_mem_mgr(compiled_model) {
+    if (!real_work) {
+        // FIXME: Fragile base class
+        return;
+    }
+
     using namespace std::placeholders;
     m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
     m_func_mem_mgr.assign_memory();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
index d219f170a8e6bb..f085f060d4a09e 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
@@ -73,15 +73,15 @@ class FuncMemMgr {
     AllocFcn m_alloc;
 };
 
-class JustInferRequest final : public IBaseInferRequest {
+class JustInferRequest : public IBaseInferRequest {
 public:
-    explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+    explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model, bool real_work = true);
 
     // Query APIs
     std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
     std::vector<ov::ProfilingInfo> get_profiling_info() const override;
 
-private:
+protected:
     ////////////////////////////////////
     // implement IBaseInferRequest
     void prepare_for_infer() override;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
new file mode 100644
index 00000000000000..114e037d2d570d
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
@@ -0,0 +1,182 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "unfold_sync_infer_request.hpp"
+
+#include "compiled_model.hpp"
+#include "logging.hpp"
+
+ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
+    : ov::npuw::JustInferRequest(compiled_model, false) {
+    // Create infer requests
+    // Preallocate funcall tensors & substitute function call requests
+    for (std::size_t i = 0; i < m_num_submodels; i++) {
+        LOG_INFO("Creating infer request for Subgraph[" << i << "]...");
+        LOG_BLOCK();
+        auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
+
+        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+            // no model & no funcall - optimized out, do nothing
+            LOG_INFO("OPTIMIZED OUT");
+            continue;
+        }
+
+        if (comp_model_desc.replaced_by) {
+            // Pre-allocate output tensors for this function call
+            const auto real_idx = comp_model_desc.replaced_by.value();
+            auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
+            if (proto_comp_model_desc.spatial) {
+                NPUW_ASSERT(false && "Spatial is not supported in unfold");
+            }
+        }  // if(replaced_by)
+
+        const auto real_idx = comp_model_desc.replaced_by.value_or(i);
+        auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
+        m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request();
+        m_subrequest_devices[i] = *proto_comp_model_desc.device_it;
+        LOG_INFO("DONE");
+    }  // for(submodels)
+
+    // Preallocate input tensors. Note - there may be
+    // multiple subrequest consumers on the same input tensor
+    LOG_INFO("Preallocating input tensors...");
+    for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
+        const auto& port = m_npuw_model->inputs()[i];
+        ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
+        m_input_tensors.push_back(allocated);
+        m_input_allocated.insert(allocated->data());
+        m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
+    }  // for(inputs)
+
+    // Preallocate output tensors
+    LOG_INFO("Preallocating output tensors...");
+    for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
+        LOG_BLOCK();
+        const auto& port = m_npuw_model->outputs()[i];
+        LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port);
+
+        // FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom
+        const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
+
+        LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);
+        const auto& tensor = allocOut(port, m_npuw_model->global_mem_device());
+
+        m_output_tensors.push_back(tensor);
+        m_port_to_tensor[port] = TensorStorage{tensor, true};
+    }
+
+    LOG_INFO("Connecting subrequests...");
+    LOG_BLOCK();
+    for (const auto& kvp : m_npuw_model->m_submodels_input_to_prev_output) {
+        const auto& subm_idx_to = kvp.first.first;
+        const auto& port_idx_to = kvp.first.second;
+        const auto& subm_idx_from = kvp.second.first;
+        const auto& port_idx_from = kvp.second.second;
+
+        LOG_DEBUG("Subgraph[" << subm_idx_from << "]/" << port_idx_from << " --> "
+                              << "Subgraph[" << subm_idx_to << "]/" << port_idx_to);
+        NPUW_ASSERT(m_subrequests[subm_idx_from]);  // prod request is created
+        NPUW_ASSERT(m_subrequests[subm_idx_to]);    // cons request is created
+        NPUW_ASSERT(m_subrequests[subm_idx_from]._ptr != m_subrequests[subm_idx_to]._ptr);
+
+        const auto& iport = m_subrequests[subm_idx_to]->get_compiled_model()->inputs()[port_idx_to];
+        const auto& oport = m_subrequests[subm_idx_from]->get_compiled_model()->outputs()[port_idx_from];
+        const auto& tensor = m_subrequests[subm_idx_from]->get_tensor(oport);
+        LOG_DEBUG("Set Subgraph[" << subm_idx_to << "]/" << iport << " to Subgraph[" << subm_idx_from << "]/" << oport);
+        m_subrequests[subm_idx_to]->set_tensor(iport, tensor);
+    }  // for(map)
+    LOG_INFO("Done");
+
+    // Build the parameter/result mapping {{{
+    m_subrequests_gio.resize(m_subrequests.size());
+
+    // Parameters: stage 1...
+    for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
+        const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i);
+        if (to_submodel != CompiledModel::NO_LINK) {
+            std::size_t sub_idx{}, in_idx{};
+            std::tie(sub_idx, in_idx) = to_submodel;
+            m_subrequests_gio.at(sub_idx).global_params[i] = in_idx;
+        }
+    }  // for(inputs)
+
+    // Parameters: stage 2...
+    for (auto&& it : m_npuw_model->m_param_subscribers) {
+        const auto param_idx = it.first;
+        for (auto&& to_submodel : it.second) {
+            std::size_t sub_idx{}, in_idx{};
+            std::tie(sub_idx, in_idx) = to_submodel;
+            m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx;
+        }
+    }
+
+    // Results
+    for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
+        std::size_t sub_idx{}, out_idx{};
+        std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
+        m_subrequests_gio.at(sub_idx).global_results[i] = out_idx;
+    }
+    // }}}
+
+    for (size_t i = 0; i < m_num_submodels; i++) {
+        LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]...");
+        LOG_BLOCK();
+        auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
+        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+            continue;  // Optimized out
+        }
+        unpack_closure(i, m_subrequests[i]);
+        LOG_VERB("Done");
+    }
+}
+
+void ov::npuw::UnfoldInferRequest::infer() {
+    for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
+        auto& subr = m_subrequests[idx];
+        if (!subr) {
+            continue;
+        }
+
+        // bind_global_parameters(), a simplified way
+        const auto& iodesc = m_subrequests_gio.at(idx);
+        for (auto&& it : iodesc.global_params) {
+            std::size_t param_idx{}, sub_in_idx{};
+            std::tie(param_idx, sub_in_idx) = it;
+            const auto& g_port = m_npuw_model->inputs()[param_idx];
+            const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor;
+            const auto& s_port = subr->get_inputs()[sub_in_idx];
+            subr->set_tensor(s_port, g_tnsr);
+        }
+
+        // bind_global_results(), a simplified way
+        for (auto&& it : iodesc.global_results) {
+            std::size_t result_idx{}, sub_out_idx{};
+            std::tie(result_idx, sub_out_idx) = it;
+            const auto& g_port = m_npuw_model->outputs()[result_idx];
+            const auto& s_port = subr->get_outputs()[sub_out_idx];
+            subr->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor);
+        }
+
+        // run host gather, if required
+        auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
+        if (comp_model_desc.host_gather.dst_idx != -1) {
+            const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx];
+            const auto gather = subr->get_tensor(gport);
+
+            const auto& vocab =
+                comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
+            const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx];
+            const auto lookup = subr->get_tensor(lport);
+            ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather);
+        }
+    }
+
+    for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
+        auto& subr = m_subrequests[idx];
+        if (!subr) {
+            continue;
+        }
+        subr->infer();
+    }
+}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp
new file mode 100644
index 00000000000000..9e57a33239435a
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp
@@ -0,0 +1,27 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "just_sync_infer_request.hpp"
+
+namespace ov {
+namespace npuw {
+
+class UnfoldInferRequest final : public JustInferRequest {
+public:
+    explicit UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+
+private:
+    void infer() override;
+};
+
+}  // namespace npuw
+}  // namespace ov
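
Usage note (not part of the patch): a minimal sketch of how the new NPUW_UNFOLD_IREQS option could be enabled from application code. It assumes the NPUW path itself is switched on via NPU_USE_NPUW, that the target device is named "NPU", that boolean NPUW keys accept "YES"/"NO" string values, and that "model.xml" is a placeholder model path; none of these details come from this diff.

// Hypothetical usage sketch; device name, NPU_USE_NPUW and string values are assumptions.
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder model path

    // Ask NPUW to create one infer request per partition ("unfolded" mode)
    // instead of reusing infer requests for repeating function-call blocks.
    ov::AnyMap config = {{"NPU_USE_NPUW", "YES"}, {"NPUW_UNFOLD_IREQS", "YES"}};
    auto compiled = core.compile_model(model, "NPU", config);

    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}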