NPUW: Function memory management (openvinotoolkit#27043)
### Details:
 - Optimize out temporary results (activations) when possible (see the sketch after the file summary below)

### Tickets:
 - *ticket-id*
dmatveev authored Oct 15, 2024
1 parent 296ab9a commit 3ff0943
Showing 4 changed files with 238 additions and 44 deletions.
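
Before the per-file diffs, a minimal standalone sketch of the optimization named in the Details section: give every producer's output a pending-read counter, and let a later producer take over an existing tensor once that counter drops to zero. All names here (Producer, Slot, the pool) are illustrative only, not the plugin's API; the actual policy lives in FuncMemMgr::assign() in the diff below.

#include <cstddef>
#include <iostream>
#include <map>
#include <memory>
#include <utility>
#include <vector>

using Producer = std::pair<std::size_t, std::size_t>; // {subgraph index, output port}
struct Tensor {};                                      // stand-in for a real buffer

int main() {
    // How many consumers still have to read each producer's output.
    std::map<Producer, std::size_t> remaining_reads = {
        {{0, 0}, 1}, // Subgraph[0]/out 0 is read once (by Subgraph[1])
        {{1, 0}, 1}, // Subgraph[1]/out 0 is read once (by Subgraph[2])
    };

    struct Slot {
        std::shared_ptr<Tensor> ptr;
        Producer owner; // who currently writes into this buffer
    };
    std::vector<Slot> pool;

    // Reuse a slot whose pending reads are all done; otherwise allocate anew.
    auto assign = [&](const Producer& p) {
        for (auto& slot : pool) {
            if (remaining_reads[slot.owner] == 0) {
                slot.owner = p;
                return slot.ptr;
            }
        }
        pool.push_back(Slot{std::make_shared<Tensor>(), p});
        return pool.back().ptr;
    };

    auto t0 = assign({0, 0});  // nothing to reuse yet: fresh allocation
    remaining_reads[{0, 0}]--; // ...Subgraph[1] performs its read...
    auto t1 = assign({1, 0});  // reads exhausted, so the same buffer is reused
    std::cout << "reused: " << (t0 == t1) << "\n"; // prints "reused: 1"
}

This is the shape of the two classes the commit adds: MemAccessSim owns the counters, FuncMemMgr owns the pool.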
5 changes: 0 additions & 5 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -516,11 +516,6 @@ std::string ov::npuw::CompiledModel::global_mem_device() const {
}

std::string ov::npuw::CompiledModel::funcall_mem_device(const std::size_t idx) const {
// FIXME: currently we allocate intermediate tensors for EVERY submodel.
// It's not feasible to allocate them in L0 due to high memory consumption.
// Until we make such memory reusable, hard-coding those tensors to CPU.
return "CPU";

// Force globally set device if set
const std::string device_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
if (!device_alloc.empty()) {
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -46,6 +46,8 @@ class CompiledModel : public ov::ICompiledModel {
// FIXME: This class has many friends...
friend class IBaseInferRequest;
friend class JustInferRequest;
friend class MemAccessSim;
friend class FuncMemMgr;

bool compile_for_success(std::size_t id);
bool compile_for_device(std::size_t id, const std::string& device_to_try);
211 changes: 182 additions & 29 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -20,8 +20,173 @@
#include "util.hpp"
#include "weights_bank.hpp"

ov::npuw::MemAccessSim::MemAccessSim(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model) {
LOG_VERB("Running memory access simulation...");
LOG_BLOCK();

// Initialize the read list
m_read_list.resize(compiled_model->m_compiled_submodels.size());

// Initialize read counters for tensors in the graph:
// 1. Interconnect
for (const auto& kvp : compiled_model->m_submodels_input_to_prev_output) {
const auto& read_to = kvp.first; // who reads
const auto& read_from = kvp.second; // reads what

if (read_to == CompiledModel::NO_LINK || read_from == CompiledModel::NO_LINK) {
continue;
}

// Record # of reads for this particular Source
m_remaining_reads[read_from]++;

// Record a read request for this particular Subgraph (who reads the Source)
m_read_list[read_to.first].push_back(read_from);
}
// 2. Global model's outputs
for (auto&& read_from : compiled_model->m_outputs_to_submodels_outputs) {
m_remaining_reads[read_from]++;
}

LOG_VERB("Done");
}

const ov::npuw::MemAccessSim::ReadList& ov::npuw::MemAccessSim::read_list(std::size_t idx) const {
return m_read_list.at(idx);
}

std::size_t ov::npuw::MemAccessSim::remaining_reads(const LinkFrom& from) {
return m_remaining_reads.at(from);
}

void ov::npuw::MemAccessSim::register_read(const LinkFrom& from) {
m_remaining_reads.at(from)--;
}
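
As a toy illustration (with simplified, assumed types), the constructor above effectively folds the submodel interconnect map into one pending-read counter per producing link plus a per-consumer read list:

#include <cstddef>
#include <list>
#include <map>
#include <utility>
#include <vector>

int main() {
    using Link = std::pair<std::size_t, std::size_t>; // {subgraph index, port}
    // Consumer input -> producer output, as in m_submodels_input_to_prev_output.
    const std::map<Link, Link> input_to_prev_output = {
        {{1, 0}, {0, 0}}, // Subgraph[1] input 0 reads Subgraph[0] output 0
        {{2, 0}, {1, 0}}, // Subgraph[2] input 0 reads Subgraph[1] output 0
    };
    std::map<Link, std::size_t> remaining_reads;
    std::vector<std::list<Link>> read_list(3);
    for (const auto& kvp : input_to_prev_output) {
        remaining_reads[kvp.second]++;                    // one read per consumer edge
        read_list[kvp.first.first].push_back(kvp.second); // who reads what
    }
    // Result: remaining_reads[{0,0}] == 1 and remaining_reads[{1,0}] == 1;
    // read_list[1] == {{0,0}}, read_list[2] == {{1,0}}.
}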

ov::npuw::FuncMemMgr::FuncMemMgr(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
: m_sim(compiled_model),
m_model(compiled_model) {}

void ov::npuw::FuncMemMgr::set_alloc(AllocFcn&& fcn) {
m_alloc = std::move(fcn);
}

void ov::npuw::FuncMemMgr::assign_memory() {
LOG_VERB("Assigning function memory...");
LOG_BLOCK();

const auto num_submodels = m_model->m_compiled_submodels.size();

// Walk over the subgraphs, pre-allocate and pre-assign tensors to the subgraphs
// outputs.
for (std::size_t idx = 0u; idx < num_submodels; idx++) {
LOG_VERB("Process Subgraph[" << idx << "]");
LOG_BLOCK();
const auto& comp_model_desc = m_model->m_compiled_submodels[idx];
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
// no model & no funcall - optimized out, do nothing
continue;
}

// Simulate subgraph execution: poll its input list first
const auto& read_list = m_sim.read_list(idx);

// Now, get the outputs for the subgraph. If it is "regular", there's
// nothing to do - this subgraph owns its outputs on its own.
// If it is a function, though - look up in the function's memory storage.
if (comp_model_desc.replaced_by) {
const auto real_idx = comp_model_desc.replaced_by.value();
const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];

const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
const LinkFrom this_out = LinkFrom{idx, out_idx};
assign(this_out);
}
}

// Here happens the imaginary execution... Hocus pocus, done - that's a
// simulation after all
// After the execution, mark that the read_list was read.
for (auto&& from : read_list) {
m_sim.register_read(from);
}
LOG_VERB("Done");
}

// Report memory residency
for (auto&& m : m_memory) {
LOG_VERB("Function " << m.first.first << "/out port " << m.first.second << " : maximum memory residency "
<< m.second.size() << " tensor(s)");
}

LOG_VERB("Done");
}

void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) {
// This method is the center of the function memory management.
// The logic is simple:
// - Look for an output tensor to reuse
// - If there's one, assign it to this allocation
// - If there's none, allocate a new tensor
// - How a tensor to reuse is picked:
// 1. It should exist
// 2. Its "remaining reads" count should be 0 (all planned reads
// happened at this point).
// The tensor storage is organized like this:
// - Function: Here we use .replaced_by as a function identifier; taken from `from`
// - Output index: taken from `from`
// - A vector of resident tensors

LOG_VERB("Assinging tensor for Subgraph[" << from.first << "]/" << from.second << "...");
LOG_BLOCK();

const auto& comp_model_desc = m_model->m_compiled_submodels[from.first];
NPUW_ASSERT(comp_model_desc.replaced_by.has_value());

const auto real_idx = comp_model_desc.replaced_by.value();

FO func_output = {real_idx, from.second};
auto& assigned_memory = m_memory[func_output];
auto asgn_iter = std::find_if(assigned_memory.begin(), assigned_memory.end(), [&](Assignment& a) {
return m_sim.remaining_reads(a.from) == 0u;
});
if (asgn_iter != assigned_memory.end()) {
// Reassign this memory slot to the new "from"
asgn_iter->from = from;
m_table[from] = asgn_iter->ptr;
} else {
// No free space at this point - allocate a new tensor
const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];
const auto& proto_comp_model = proto_comp_model_desc.compiled_model;

const auto& oport = proto_comp_model->outputs()[from.second];
ov::Shape oshape = oport.get_shape();

if (proto_comp_model_desc.spatial) {
oshape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
}
const auto& device = m_model->funcall_mem_device(real_idx);
TensorPtr new_tensor = m_alloc(oport.get_element_type(), oshape, device);
NPUW_ASSERT(new_tensor);

assigned_memory.push_back(Assignment{new_tensor, from});
m_table[from] = new_tensor;
}
LOG_VERB("Done");
}
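
For intuition, a worked trace of assign() under an assumed schedule: one function with a single output is called three times, and each call's output is read exactly once by the next subgraph. assign({0,0}) finds no slot with zero remaining reads and allocates tensor A. By the time assign({1,0}) runs, the read of {0,0} has been registered, so A's pending reads are zero and its slot is simply reassigned. assign({2,0}) reuses A the same way. Maximum residency for that function output stays at one tensor instead of three, which is what the residency log in assign_memory() reports.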

ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
return m_table.at(from);
}

ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
: IBaseInferRequest(compiled_model) {
: IBaseInferRequest(compiled_model),
m_func_mem_mgr(compiled_model) {
using namespace std::placeholders;
m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
m_func_mem_mgr.assign_memory();

m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
if (m_use_function_pipelining) {
LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name
@@ -67,27 +232,20 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
for (auto&& p : proto_comp_model_desc.spatial->params) {
const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
m_spatial_io[real_idx].input_tails[p.idx] =
allocTensor(iport, m_npuw_model->funcall_mem_device(real_idx));
allocOut(iport, m_npuw_model->funcall_mem_device(real_idx));
}
const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
m_spatial_io[real_idx].output_tails[out_idx] =
allocTensor(oport, m_npuw_model->funcall_mem_device(real_idx));
allocOut(oport, m_npuw_model->funcall_mem_device(real_idx));
}
}
} // if(spatial)

for (size_t out_idx = 0; out_idx < num_outputs; out_idx++) {
const auto& port = proto_comp_model->outputs()[out_idx];
ov::Shape shape = port.get_shape();

// If the subgraph is spatial, promote the output size to the full vector size
if (proto_comp_model_desc.spatial) {
shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
}
m_funcall_result[LinkFrom{i, out_idx}] =
allocTensor(port.get_element_type(), shape, m_npuw_model->funcall_mem_device(real_idx));
const auto from = LinkFrom{i, out_idx};
m_funcall_result[from] = m_func_mem_mgr.get_tensor(from);
}
if (real_idx != i) {
// If this function call is NOT the function body, do nothing here - the original
@@ -152,7 +310,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
ov::SoPtr<ov::ITensor> allocated = allocTensor(port, m_npuw_model->global_mem_device());
ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
m_input_tensors.push_back(allocated);
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
@@ -174,7 +332,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
const auto& tensor =
funcall_result_iter != m_funcall_result.end()
? funcall_result_iter->second // Function calls have their tensors allocated, so just use one
: allocTensor(port, m_npuw_model->global_mem_device());
: allocOut(port, m_npuw_model->global_mem_device());

m_output_tensors.push_back(tensor);
m_port_to_tensor[port] = TensorStorage{tensor, true};
@@ -920,27 +1078,22 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
} // if (replaced_by)
}

ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
if (device == "CPU" || ov::shape_size(shape) == 0) {
return ov::get_tensor_impl(ov::Tensor(type, shape));
}

ov::SoPtr<ov::ITensor> remote_tensor;
ov::Tensor allocated_tensor;
{
std::lock_guard<std::mutex> guard(m_alloc_mutex);
m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
remote_tensor = m_remote_ctx->create_host_tensor(type, shape);
allocated_tensor = ov::make_tensor(remote_tensor);
}
return ov::get_tensor_impl(allocated_tensor);
std::lock_guard<std::mutex> guard(m_alloc_mutex);
auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
return ov::get_tensor_impl(ov::make_tensor(remote_tensor));
}

ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::Output<const ov::Node>& node,
const std::string& device) {
return allocTensor(node.get_element_type(), node.get_shape(), device);
ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output<const ov::Node>& node,
const std::string& device) {
return allocMem(node.get_element_type(), node.get_shape(), device);
}

void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) {
64 changes: 54 additions & 10 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
@@ -22,6 +22,56 @@ namespace npuw {
class CompiledModel;
class AsyncInferRequest;

using LinkFrom = std::pair<std::size_t /* Subrequest index */
,
std::size_t /* Subrequest output index */
>; // FIXME: This is a third, if not fourth, definition of such structure

using TensorPtr = ov::SoPtr<ov::ITensor>;

class MemAccessSim {
public:
explicit MemAccessSim(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);

using ReadList = std::list<LinkFrom>;
const ReadList& read_list(std::size_t idx) const;

std::size_t remaining_reads(const LinkFrom& from);
void register_read(const LinkFrom& from);

private:
std::map<LinkFrom, std::size_t> m_remaining_reads;
std::vector<ReadList> m_read_list;
};

class FuncMemMgr {
MemAccessSim m_sim;
std::shared_ptr<ov::npuw::CompiledModel> m_model;

void assign(const LinkFrom& from);

// Function ID -> Output port number
using FO = std::pair<std::size_t, std::size_t>;
struct Assignment {
TensorPtr ptr;
LinkFrom from;
};
std::map<FO, std::vector<Assignment>> m_memory; // Dynamic assignment table
std::map<LinkFrom, TensorPtr> m_table; // Static allocation/assignment table

public:
explicit FuncMemMgr(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);

using AllocFcn = std::function<TensorPtr(const ov::element::Type&, const ov::Shape&, const std::string&)>;
void set_alloc(AllocFcn&& fcn);
void assign_memory();

TensorPtr get_tensor(const LinkFrom& from);

private:
AllocFcn m_alloc;
};
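
For reference, the intended call sequence (construct, hand over an allocator, run the one-shot assignment pass, then query tensors by their producing link) is wired up in JustInferRequest's constructor in the .cpp diff above; this condensed restatement uses a lambda in place of the std::bind expression:

// Inside JustInferRequest (sketch); equivalent to the constructor wiring above.
FuncMemMgr mgr(compiled_model);
mgr.set_alloc([this](const ov::element::Type& t, const ov::Shape& s, const std::string& d) {
    return allocMem(t, s, d);
});
mgr.assign_memory();
TensorPtr result = mgr.get_tensor(LinkFrom{/*subrequest=*/2, /*output port=*/0});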

class JustInferRequest final : public IBaseInferRequest {
public:
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
@@ -64,15 +114,11 @@ class JustInferRequest final : public IBaseInferRequest {
void connect_subrequests();
void recreate_subrequests(std::size_t idx);

ov::SoPtr<ov::ITensor> allocTensor(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
ov::SoPtr<ov::ITensor> allocTensor(const ov::Output<const ov::Node>& node, const std::string& device);
TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device);

using LinkFrom = std::pair<std::size_t /* Subrequest index */
,
std::size_t /* Subrequest output index */
>; // FIXME: This is a third, if not fourth, definition of such structure
using TensorPtr = ov::SoPtr<ov::ITensor>;
std::map<LinkFrom, TensorPtr> m_funcall_result;
FuncMemMgr m_func_mem_mgr; // Owns memory
std::map<LinkFrom, TensorPtr> m_funcall_result; // Provides a convenient link

bool is_pipelined(std::size_t idx) const;
bool m_use_function_pipelining = false;
@@ -103,8 +149,6 @@ class JustInferRequest final : public IBaseInferRequest {
std::vector<GlobalIO> m_subrequests_gio;

std::mutex m_alloc_mutex;
std::shared_ptr<ov::IRemoteContext> m_remote_ctx = nullptr;

std::unordered_set<void*> m_input_allocated;
};

