NPUW: Unfold infer requests mode
dmatveev committed Oct 29, 2024
1 parent 744475b commit cad54a9
Showing 9 changed files with 242 additions and 11 deletions.
@@ -56,6 +56,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);
@@ -279,6 +279,14 @@ static constexpr ov::Property<bool> parallel_compilation{"NPUW_PARALLEL_COMPILE"
*/
static constexpr ov::Property<bool> funcall_async{"NPUW_FUNCALL_ASYNC"};

/**
* @brief
* Type: boolean
* Create individual infer requests for partitions, even repeating ones.
* Default value: false.
*/
static constexpr ov::Property<bool> unfold_ireqs{"NPUW_UNFOLD_IREQS"};

namespace accuracy {
/**
* @brief
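For context, here is a minimal usage sketch for the new flag (not part of this commit): it is passed together with the other NPUW properties when compiling for the NPU device. The "NPU_USE_NPUW" enabling key, the "YES" string values, and the model path are assumptions for illustration, not taken from this diff.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // hypothetical model path

    // Assumption: NPUW is enabled via "NPU_USE_NPUW"; with the new flag set,
    // create_sync_infer_request() builds an UnfoldInferRequest instead of a
    // JustInferRequest (see compiled_model.cpp below).
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},
                                        {"NPUW_UNFOLD_IREQS", "YES"}});

    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}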
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -38,6 +38,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_FUNCALL_FOR_ALL>();
desc.add<NPUW_PARALLEL_COMPILE>();
desc.add<NPUW_FUNCALL_ASYNC>();
desc.add<NPUW_UNFOLD_IREQS>();
desc.add<NPUW_WEIGHTS_BANK>();
desc.add<NPUW_WEIGHTS_BANK_ALLOC>();
desc.add<NPUW_CACHE_DIR>();
18 changes: 12 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -21,6 +21,7 @@
#include "openvino/util/common_util.hpp"
#include "partitioning/patterns/opt.hpp"
#include "plugin.hpp"
#include "unfold_sync_infer_request.hpp"
#include "util.hpp"

// required for get_properties_per_device()
@@ -708,16 +709,20 @@ void ov::npuw::CompiledModel::dump_on_fail(std::size_t id, const std::string& de
}
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_just_sync_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr);
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infer_request() const {
// Synchronous infer request implementation may vary based on the
// selected strategy
auto* non_const_this = const_cast<ov::npuw::CompiledModel*>(this); // because of const in API
return non_const_this->create_just_sync_infer_request();
auto non_const_this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(non_const_this->shared_from_this());

std::shared_ptr<ov::ISyncInferRequest> result;
if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) {
result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr));
} else {
result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr));
}
NPUW_ASSERT(result);
return result;
}

std::shared_ptr<ov::IAsyncInferRequest> ov::npuw::CompiledModel::create_infer_request() const {
@@ -934,6 +939,7 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel {
// FIXME: This class has many friends..
friend class IBaseInferRequest;
friend class JustInferRequest;
friend class UnfoldInferRequest;
friend class MemAccessSim;
friend class FuncMemMgr;

@@ -66,7 +67,6 @@

std::shared_ptr<const ::intel_npu::Plugin> get_npuw_plugin() const;

std::shared_ptr<ov::ISyncInferRequest> create_just_sync_infer_request();
std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;

std::string submodel_device(const std::size_t idx) const;
@@ -180,9 +180,15 @@ ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
return m_table.at(from);
}

ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
bool real_work)
: IBaseInferRequest(compiled_model),
m_func_mem_mgr(compiled_model) {
if (!real_work) {
// FIXME: Fragile base class
return;
}

using namespace std::placeholders;
m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
m_func_mem_mgr.assign_memory();
@@ -73,15 +73,15 @@ class FuncMemMgr {
AllocFcn m_alloc;
};

class JustInferRequest final : public IBaseInferRequest {
class JustInferRequest : public IBaseInferRequest {
public:
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model, bool real_work = true);

// Query APIs
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
std::vector<ov::ProfilingInfo> get_profiling_info() const override;

private:
protected:
////////////////////////////////////
// implement IBaseInferRequest
void prepare_for_infer() override;
182 changes: 182 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
@@ -0,0 +1,182 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "unfold_sync_infer_request.hpp"

#include "compiled_model.hpp"
#include "logging.hpp"

ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
: ov::npuw::JustInferRequest(compiled_model, false) {
// Create infer requests
// Preallocate funcall tensors & substitute function call requests
for (std::size_t i = 0; i < m_num_submodels; i++) {
LOG_INFO("Creating infer request for Subgraph[" << i << "]...");
LOG_BLOCK();
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];

if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
// no model & no funcall - optimized out, do nothing
LOG_INFO("OPTIMIZED OUT");
continue;
}

if (comp_model_desc.replaced_by) {
// Pre-allocate output tensors for this function call
const auto real_idx = comp_model_desc.replaced_by.value();
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
if (proto_comp_model_desc.spatial) {
NPUW_ASSERT(false && "Spatial is not supported in unfold");
}
} // if(replaced_by)

const auto real_idx = comp_model_desc.replaced_by.value_or(i);
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request();
m_subrequest_devices[i] = *proto_comp_model_desc.device_it;
LOG_INFO("DONE");
} // for(submodels)

// Preallocate input tensors. Note - there may be
// multiple subrequest consumers on the same input tensor
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
m_input_tensors.push_back(allocated);
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
} // for(inputs)

// Preallocate output tensors
LOG_INFO("Preallocating output tensors...");
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
LOG_BLOCK();
const auto& port = m_npuw_model->outputs()[i];
LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port);

// FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);

LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);
const auto& tensor = allocOut(port, m_npuw_model->global_mem_device());

m_output_tensors.push_back(tensor);
m_port_to_tensor[port] = TensorStorage{tensor, true};
}

LOG_INFO("Connecting subrequests...");
LOG_BLOCK();
for (const auto& kvp : m_npuw_model->m_submodels_input_to_prev_output) {
const auto& subm_idx_to = kvp.first.first;
const auto& port_idx_to = kvp.first.second;
const auto& subm_idx_from = kvp.second.first;
const auto& port_idx_from = kvp.second.second;

LOG_DEBUG("Subgraph[" << subm_idx_from << "]/" << port_idx_from << " --> "
<< "Subgraph[" << subm_idx_to << "]/" << port_idx_to);
NPUW_ASSERT(m_subrequests[subm_idx_from]); // prod request is created
NPUW_ASSERT(m_subrequests[subm_idx_to]); // cons request is created
NPUW_ASSERT(m_subrequests[subm_idx_from]._ptr != m_subrequests[subm_idx_to]._ptr);

const auto& iport = m_subrequests[subm_idx_to]->get_compiled_model()->inputs()[port_idx_to];
const auto& oport = m_subrequests[subm_idx_from]->get_compiled_model()->outputs()[port_idx_from];
const auto& tensor = m_subrequests[subm_idx_from]->get_tensor(oport);
LOG_DEBUG("Set Subgraph[" << subm_idx_to << "]/" << iport << " to Subgraph[" << subm_idx_from << "]/" << oport);
m_subrequests[subm_idx_to]->set_tensor(iport, tensor);
} // for(map)
LOG_INFO("Done");

// Build the parameter/result mapping {{{
m_subrequests_gio.resize(m_subrequests.size());

// Parameters: stage 1...
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i);
if (to_submodel != CompiledModel::NO_LINK) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[i] = in_idx;
}
} // for(inputs)

// Parameters: stage 2...
for (auto&& it : m_npuw_model->m_param_subscribers) {
const auto param_idx = it.first;
for (auto&& to_submodel : it.second) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx;
}
}

// Results
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
std::size_t sub_idx{}, out_idx{};
std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
m_subrequests_gio.at(sub_idx).global_results[i] = out_idx;
}
// }}}

for (size_t i = 0; i < m_num_submodels; i++) {
LOG_VERB("Trying to preemptively set tensors for Subgraph[" << i << "]...");
LOG_BLOCK();
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
continue; // Optimized out
}
unpack_closure(i, m_subrequests[i]);
LOG_VERB("Done");
}
}

void ov::npuw::UnfoldInferRequest::infer() {
for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
auto& subr = m_subrequests[idx];
if (!subr) {
continue;
}

// bind_global_parameters(), a simplified way
const auto& iodesc = m_subrequests_gio.at(idx);
for (auto&& it : iodesc.global_params) {
std::size_t param_idx{}, sub_in_idx{};
std::tie(param_idx, sub_in_idx) = it;
const auto& g_port = m_npuw_model->inputs()[param_idx];
const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor;
const auto& s_port = subr->get_inputs()[sub_in_idx];
subr->set_tensor(s_port, g_tnsr);
}

// bind_global_results, a simplified way
for (auto&& it : iodesc.global_results) {
std::size_t result_idx{}, sub_out_idx{};
std::tie(result_idx, sub_out_idx) = it;
const auto& g_port = m_npuw_model->outputs()[result_idx];
const auto& s_port = subr->get_outputs()[sub_out_idx];
subr->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor);
}

// run host gather, if required
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
if (comp_model_desc.host_gather.dst_idx != -1) {
const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx];
const auto gather = subr->get_tensor(gport);

const auto& vocab =
comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx];
const auto lookup = subr->get_tensor(lport);
ov::npuw::util::gather(ov::get_tensor_impl(vocab), lookup, gather);
}
}

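// With every global input/output tensor bound above, run the subrequests one by one in subgraph order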
for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
auto& subr = m_subrequests[idx];
if (!subr) {
continue;
}
subr->infer();
}
}
@@ -0,0 +1,27 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <limits>
#include <map>
#include <mutex>
#include <optional>
#include <vector>

#include "just_sync_infer_request.hpp"

namespace ov {
namespace npuw {

class UnfoldInferRequest final : public JustInferRequest {
public:
explicit UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);

private:
void infer() override;
};

} // namespace npuw
} // namespace ov
