NPUW: Unfold infer requests #27319

Draft · wants to merge 5 commits into master
@@ -56,6 +56,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);
@@ -279,6 +279,14 @@ static constexpr ov::Property<bool> parallel_compilation{"NPUW_PARALLEL_COMPILE"
*/
static constexpr ov::Property<bool> funcall_async{"NPUW_FUNCALL_ASYNC"};

/**
* @brief
* Type: boolean
* Create individual infer requests for partitions, even repeating ones.
* Default value: false.
*/
static constexpr ov::Property<bool> unfold_ireqs{"NPUW_UNFOLD_IREQS"};

namespace accuracy {
/**
* @brief
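For context, a minimal usage sketch (not part of this diff) of how the new flag could be passed through compile_model(). The "NPU_USE_NPUW" switch and the string values are assumptions about the surrounding plugin configuration, not something this PR defines:

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");
    // Assumed configuration: route the model through NPUW and ask for
    // unfolded (per-subgraph) infer requests.
    auto compiled = core.compile_model(model, "NPU",
                                       ov::AnyMap{{"NPU_USE_NPUW", "YES"},
                                                  {"NPUW_UNFOLD_IREQS", "YES"}});
    auto request = compiled.create_infer_request();
    return 0;
}
```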
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -38,6 +38,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_FUNCALL_FOR_ALL>();
desc.add<NPUW_PARALLEL_COMPILE>();
desc.add<NPUW_FUNCALL_ASYNC>();
desc.add<NPUW_UNFOLD_IREQS>();
desc.add<NPUW_WEIGHTS_BANK>();
desc.add<NPUW_WEIGHTS_BANK_ALLOC>();
desc.add<NPUW_CACHE_DIR>();
18 changes: 12 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -21,6 +21,7 @@
#include "openvino/util/common_util.hpp"
#include "partitioning/patterns/opt.hpp"
#include "plugin.hpp"
#include "unfold_sync_infer_request.hpp"
#include "util.hpp"

// required for get_properties_per_device()
@@ -708,16 +709,20 @@ void ov::npuw::CompiledModel::dump_on_fail(std::size_t id, const std::string& de
}
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_just_sync_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr);
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infer_request() const {
// Synchronous infer request implementation may vary based on the
// selected strategy
auto* non_const_this = const_cast<ov::npuw::CompiledModel*>(this); // because of const in API
return non_const_this->create_just_sync_infer_request();
auto non_const_this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(non_const_this->shared_from_this());

std::shared_ptr<ov::ISyncInferRequest> result;
if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) {
result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr));
} else {
result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr));
}
NPUW_ASSERT(result);
return result;
}

std::shared_ptr<ov::IAsyncInferRequest> ov::npuw::CompiledModel::create_infer_request() const {
@@ -934,6 +939,7 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel {
// FIXME: This class has many friends..
friend class IBaseInferRequest;
friend class JustInferRequest;
friend class UnfoldInferRequest;
friend class MemAccessSim;
friend class FuncMemMgr;

@@ -66,7 +67,6 @@

std::shared_ptr<const ::intel_npu::Plugin> get_npuw_plugin() const;

std::shared_ptr<ov::ISyncInferRequest> create_just_sync_infer_request();
std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;

std::string submodel_device(const std::size_t idx) const;
@@ -180,9 +180,15 @@ ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
return m_table.at(from);
}

ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
bool real_work)
: IBaseInferRequest(compiled_model),
m_func_mem_mgr(compiled_model) {
if (!real_work) {
// FIXME: Fragile base class
return;
}

using namespace std::placeholders;
m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
m_func_mem_mgr.assign_memory();
@@ -597,7 +603,7 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {
LOG_BLOCK();
if (!is_spatial_param(sub_in_idx)) {
// Input parameter is non-spatial, do normal handling
if (do_copy || m_input_allocated.count(g_tnsr->data()) == 0) {
if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) {
LOG_DEBUG("Will be copied");
copy_list.emplace_back(g_tnsr, s_port);
} else {
@@ -73,15 +73,15 @@ class FuncMemMgr {
AllocFcn m_alloc;
};

class JustInferRequest final : public IBaseInferRequest {
class JustInferRequest : public IBaseInferRequest {
public:
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model, bool real_work = true);

// Query APIs
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
std::vector<ov::ProfilingInfo> get_profiling_info() const override;

private:
protected:
////////////////////////////////////
// implement IBaseInferRequest
void prepare_for_infer() override;
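To make that subclassing possible, the PR drops `final` from JustInferRequest, relaxes `private:` to `protected:`, and adds the `real_work` flag so a derived request can skip the base constructor's own setup. A reduced sketch of the pattern, with hypothetical class names rather than the actual ones:

```cpp
// Sketch of the constructor-delegation pattern used above: the base class
// performs its heavy setup only when real_work is true; a derived class
// passes false and installs its own setup instead. This is what the
// "fragile base class" FIXME warns about: the early return must stay in
// sync with whatever the derived constructor re-implements.
struct BaseRequest {
    explicit BaseRequest(bool real_work = true) {
        if (!real_work) {
            return;  // the derived constructor takes over from here
        }
        // ... base-specific allocation and subrequest wiring ...
    }
    virtual ~BaseRequest() = default;
};

struct UnfoldedRequest : BaseRequest {
    UnfoldedRequest() : BaseRequest(/*real_work=*/false) {
        // ... simpler, unfolded wiring done here instead ...
    }
};
```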
238 changes: 238 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
@@ -0,0 +1,238 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/core/parallel.hpp"

#include "unfold_sync_infer_request.hpp"
#include "compiled_model.hpp"
#include "logging.hpp"

ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
: ov::npuw::JustInferRequest(compiled_model, false) {
// Create infer requests
// Preallocate funcall tensors & substitute function call requests
for (std::size_t i = 0; i < m_num_submodels; i++) {
LOG_INFO("Creating infer request for Subgraph[" << i << "]...");
LOG_BLOCK();
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];

if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
// no model & no funcall - optimized out, do nothing
LOG_INFO("OPTIMIZED OUT");
continue;
}

if (comp_model_desc.replaced_by) {
// Pre-allocate output tensors for this function call
const auto real_idx = comp_model_desc.replaced_by.value();
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
if (proto_comp_model_desc.spatial) {
NPUW_ASSERT(false && "Spatial is not supported in unfold");
}
} // if(replaced_by)

const auto real_idx = comp_model_desc.replaced_by.value_or(i);
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request();
m_subrequest_devices[i] = *proto_comp_model_desc.device_it;
LOG_INFO("DONE");
} // for(submodels)

// Preallocate input tensors. Note - there may be
// multiple subrequest consumers on the same input tensor
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
m_input_tensors.push_back(allocated);
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
} // for(inputs)

// Preallocate output tensors
LOG_INFO("Preallocating output tensors...");
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
LOG_BLOCK();
const auto& port = m_npuw_model->outputs()[i];
LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port);

// FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);

LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);
const auto& tensor = allocOut(port, m_npuw_model->global_mem_device());

m_output_tensors.push_back(tensor);
m_port_to_tensor[port] = TensorStorage{tensor, true};
}

LOG_INFO("Connecting subrequests...");
LOG_BLOCK();
for (const auto& kvp : m_npuw_model->m_submodels_input_to_prev_output) {
const auto& subm_idx_to = kvp.first.first;
const auto& port_idx_to = kvp.first.second;
const auto& subm_idx_from = kvp.second.first;
const auto& port_idx_from = kvp.second.second;

LOG_DEBUG("Subgraph[" << subm_idx_from << "]/" << port_idx_from << " --> "
<< "Subgraph[" << subm_idx_to << "]/" << port_idx_to);
NPUW_ASSERT(m_subrequests[subm_idx_from]); // prod request is created
NPUW_ASSERT(m_subrequests[subm_idx_to]); // cons request is created
NPUW_ASSERT(m_subrequests[subm_idx_from]._ptr != m_subrequests[subm_idx_to]._ptr);

const auto& iport = m_subrequests[subm_idx_to]->get_compiled_model()->inputs()[port_idx_to];
const auto& oport = m_subrequests[subm_idx_from]->get_compiled_model()->outputs()[port_idx_from];
const auto& tensor = m_subrequests[subm_idx_from]->get_tensor(oport);
LOG_DEBUG("Set Subgraph[" << subm_idx_to << "]/" << iport << " to Subgraph[" << subm_idx_from << "]/" << oport);
m_subrequests[subm_idx_to]->set_tensor(iport, tensor);
} // for(map)
LOG_INFO("Done");

// Build the parameter/result mapping {{{
m_subrequests_gio.resize(m_subrequests.size());

// Parameters: stage 1...
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i);
if (to_submodel != CompiledModel::NO_LINK) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[i] = in_idx;
}
} // for(inputs)

// Parameters: stage 2...
for (auto&& it : m_npuw_model->m_param_subscribers) {
const auto param_idx = it.first;
for (auto&& to_submodel : it.second) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx;
}
}

// Results
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
std::size_t sub_idx{}, out_idx{};
std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
m_subrequests_gio.at(sub_idx).global_results[i] = out_idx;
}
// }}}

for (size_t i = 0; i < m_num_submodels; i++) {
LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]...");
LOG_BLOCK();
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
continue; // Optimized out
}
unpack_closure(i, m_subrequests[i]);
LOG_VERB("Done");
}
}

void ov::npuw::UnfoldInferRequest::prepare(std::size_t idx) {
if (idx >= m_subrequests.size()) {
return;
}
auto& subr = m_subrequests.at(idx);
const bool do_copy = needs_copy(idx);

std::vector<std::pair<ov::SoPtr<ov::ITensor>, ov::Output<const ov::Node>>> copy_list;

// bind_global_parameters(), a simplified way
const auto& iodesc = m_subrequests_gio.at(idx);
for (auto&& it : iodesc.global_params) {
std::size_t param_idx{}, sub_in_idx{};
std::tie(param_idx, sub_in_idx) = it;
const auto& g_port = m_npuw_model->inputs()[param_idx];
const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor;
const auto& s_port = subr->get_inputs()[sub_in_idx];

if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) {
copy_list.emplace_back(g_tnsr, s_port);
} else {
subr->set_tensor(s_port, g_tnsr);
}
}

// bind_global_results, a simplified way
for (auto&& it : iodesc.global_results) {
std::size_t result_idx{}, sub_out_idx{};
std::tie(result_idx, sub_out_idx) = it;
const auto& g_port = m_npuw_model->outputs()[result_idx];
const auto& s_port = subr->get_outputs()[sub_out_idx];
subr->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor);
}

// run copy, if required
// NB: parallel_for was removed here as it causes more overhead for our (usually)
// small chunks copied. In the proper app-level pipeline, there must be no copy at all
for (auto &&it : copy_list) {
ov::SoPtr<ov::ITensor> dst = subr->get_tensor(it.second);
it.first->copy_to(dst._ptr);
}

// run host gather, if required
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
if (comp_model_desc.host_gather.dst_idx != -1) {
const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx];
const auto gather = subr->get_tensor(gport);

const auto& vocab =
comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx];
const auto lookup = subr->get_tensor(lport);
ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather);
}
}

void ov::npuw::UnfoldInferRequest::infer() {
const bool do_async = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();

auto wait_and_clear = [](RqPtrs &rqs) {
for (auto &&r : rqs) {
r->wait();
}
rqs.clear();
};

if (do_async) {
std::size_t past_repl_id = 0u;
RqPtrs previous_requests;

prepare(0);
for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
auto& subr = m_subrequests[idx];
if (!subr) {
continue;
}
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
const auto this_repl_id = comp_model_desc.replaced_by.value_or(idx);
if (this_repl_id != past_repl_id) {
// For non-repeating blocks, the above value_or returns idx
// For repeating blocks, it returns the function group id
// If either is not equal to the past_repl_id, make a barrier here
wait_and_clear(previous_requests);
past_repl_id = this_repl_id;
}
subr->start_async();
previous_requests.push_back(subr);
prepare(idx + 1);
}
wait_and_clear(previous_requests);
} else {
prepare(0);
for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
auto& subr = m_subrequests[idx];
if (!subr) {
prepare(idx + 1);
continue;
}
subr->start_async();
prepare(idx + 1);
subr->wait();
}
} // (async)
}
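The async branch of infer() follows a simple pipelining rule: keep starting subrequests while they belong to the same repeated block (the `replaced_by` id), and place a barrier whenever the block id changes. A standalone sketch of that scheduling rule, with hypothetical callbacks standing in for start_async()/wait() and the prepare()-ahead overlap omitted:

```cpp
#include <cstddef>
#include <functional>
#include <vector>

// Reduced model of the NPUW_FUNCALL_ASYNC scheduling above: subrequests that
// share a block id run back to back; a wait-for-all barrier is inserted
// whenever the block id changes.
void pipeline(const std::vector<std::size_t>& block_ids,
              const std::function<void(std::size_t)>& start_async,
              const std::function<void(std::size_t)>& wait) {
    std::vector<std::size_t> in_flight;
    std::size_t past_block = 0u;
    for (std::size_t idx = 0; idx < block_ids.size(); idx++) {
        if (block_ids[idx] != past_block) {
            for (auto r : in_flight) {
                wait(r);  // barrier between different blocks
            }
            in_flight.clear();
            past_block = block_ids[idx];
        }
        start_async(idx);
        in_flight.push_back(idx);
    }
    for (auto r : in_flight) {
        wait(r);  // final barrier
    }
}
```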