Skip to content

Commit

Permalink
[NPU] Adding support for the remote tensor feature - [Part II - leftovers] (#25572)
Browse files Browse the repository at this point in the history

### Details:
- *Allocate a Host L0 buffer when the create host tensor method is used*
- *Do not chain the mutable descriptor, call the update mutable command
list after each descriptor instead*
- *When using an older ze_loader, throw an error only if the unsupported
functions are actually going to be called*
 - *Call updateMutableCommandList after set_tensor method is used*
 - *Add extra test cases for batching flow using MCL*


### Tickets:
 - *EISW-131915*
  • Loading branch information
pereanub authored Jul 19, 2024
1 parent 56a8c7e commit ffc135c
Show file tree
Hide file tree
Showing 18 changed files with 331 additions and 99 deletions.
5 changes: 5 additions & 0 deletions src/plugins/intel_npu/src/al/include/npu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ class IDevice : public std::enable_shared_from_this<IDevice> {
ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF,
void* mem = nullptr);

virtual ov::SoPtr<ov::ITensor> createHostTensor(std::shared_ptr<ov::IRemoteContext> context,
const ov::element::Type& element_type,
const ov::Shape& shape,
const Config& config);

protected:
virtual ~IDevice() = default;
};
Expand Down
7 changes: 7 additions & 0 deletions src/plugins/intel_npu/src/al/src/npu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,11 @@ ov::SoPtr<ov::IRemoteTensor> IDevice::createRemoteTensor(std::shared_ptr<ov::IRe
OPENVINO_THROW("Create Remote Tensor is not supported");
}

// Default implementation for devices that do not support host tensors.
// Backends with real support (e.g. ZeroDevice) override this method; the
// base class rejects the request unconditionally.
ov::SoPtr<ov::ITensor> IDevice::createHostTensor(std::shared_ptr<ov::IRemoteContext>,
                                                 const ov::element::Type&,
                                                 const ov::Shape&,
                                                 const Config&) {
    OPENVINO_THROW("Create Host Tensor is not supported");
}

} // namespace intel_npu
5 changes: 5 additions & 0 deletions src/plugins/intel_npu/src/backend/include/zero_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ class ZeroDevice : public IDevice {
ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF,
void* mem = nullptr) override;

ov::SoPtr<ov::ITensor> createHostTensor(std::shared_ptr<ov::IRemoteContext> context,
const ov::element::Type& element_type,
const ov::Shape& shape,
const Config& config) override;

ZeroDevice& operator=(const ZeroDevice&) = delete;
ZeroDevice(const ZeroDevice&) = delete;

Expand Down
39 changes: 39 additions & 0 deletions src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "intel_npu/al/config/config.hpp"
#include "openvino/runtime/itensor.hpp"
#include "zero_init.hpp"
#include "zero_remote_tensor.hpp"

namespace intel_npu {

/**
 * @brief ov::ITensor implementation backed by a Level Zero allocation.
 *
 * Every operation is forwarded to an internal ZeroRemoteTensor created with
 * TensorType::BINDED and MemType::L0_INTERNAL_BUF (see zero_host_tensor.cpp),
 * i.e. a host L0 buffer, so the pointer returned by data() can be used by
 * host code while remaining consumable by the NPU runtime.
 */
class ZeroHostTensor : public ov::ITensor {
public:
    /// @brief Allocates the backing Level Zero buffer through a ZeroRemoteTensor.
    ZeroHostTensor(std::shared_ptr<ov::IRemoteContext> context,
                   std::shared_ptr<ZeroInitStructsHolder> init_structs,
                   const ov::element::Type element_type,
                   const ov::Shape& shape,
                   const Config& config);

    ~ZeroHostTensor() override = default;

    /// @brief Raw host pointer of the allocation; the element type argument is ignored.
    void* data(const ov::element::Type& element_type) const override;
    const ov::element::Type& get_element_type() const override;

    const ov::Shape& get_shape() const override;

    const ov::Strides& get_strides() const override;

    void set_shape(ov::Shape new_shape) override;

    /// @brief Access to the wrapped remote tensor (Level Zero specific properties).
    std::shared_ptr<ZeroRemoteTensor> get_impl() const;

private:
    // Sole data member; all ITensor calls delegate to this remote tensor.
    std::shared_ptr<ZeroRemoteTensor> m_impl;
};

} // namespace intel_npu
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ class ZeroInferRequest final : public SyncInferRequest {
* @brief Check the received remote tensor and copy it to the Level Zero tensor
* @param tensor Reference to a tensor.
* @param name Friendly name of the tensor.
* @param isParameter True if tensor is a parameter.
*/
void set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor> tensor, const std::string& name);
void set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor> tensor, const std::string& name, bool isParameter);

void check_network_precision(const ov::element::Type_t precision) const override;
void create_pipeline();
Expand All @@ -77,8 +78,7 @@ class ZeroInferRequest final : public SyncInferRequest {
// specific operations on the plugin in this case.
size_t _batchSize = DEFAULT_BATCH_SIZE;

bool _createPipeline = true;
bool _updateCommandList = false;
bool _pipelineIsCreated = false;
};

} // namespace intel_npu
3 changes: 1 addition & 2 deletions src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ struct TensorData {
void* mem;
size_t size;
bool levelZeroTensorCreatedLocally = true;
bool changed = false;
};

struct Pipeline {
Expand All @@ -32,7 +31,7 @@ struct Pipeline {
virtual void pull(size_t batch_index) = 0;
virtual void reset(size_t batch_index) const = 0;

virtual void updateCommandList(std::unordered_map<std::string, TensorData>& tensors_data, size_t batch_size) = 0;
virtual void updateCommandList(const TensorData& tensors_data, uint32_t index, size_t batch_size) = 0;

protected:
zeroMemory::MemoryManagementUnit _deviceInputs;
Expand Down
5 changes: 1 addition & 4 deletions src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class CommandList {
void appendGraphInitialize(const ze_graph_handle_t& graph_handle) const;
void appendGraphExecute(const ze_graph_handle_t& graph_handle,
const ze_graph_profiling_query_handle_t& profiling_query_handle) const;
void updateMutableCommandList(const void* pNext = nullptr) const;
void updateMutableCommandList(uint32_t arg_index, const void* arg_value) const;
void appendNpuTimestamp(uint64_t* timestamp_buff) const;
void appendBarrier() const;
void close() const;
Expand All @@ -96,9 +96,6 @@ class CommandList {
inline ze_command_list_handle_t handle() const {
return _handle;
}
uint64_t getCommandListId() const {
return _command_id;
}

private:
ze_command_list_handle_t _handle = nullptr;
Expand Down
8 changes: 8 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "intel_npu/al/itt.hpp"
#include "intel_npu/utils/zero/zero_api.hpp"
#include "zero_executor.hpp"
#include "zero_host_tensor.hpp"
#include "zero_infer_request.hpp"
#include "zero_remote_tensor.hpp"
#include "zero_utils.hpp"
Expand Down Expand Up @@ -193,3 +194,10 @@ ov::SoPtr<ov::IRemoteTensor> ZeroDevice::createRemoteTensor(std::shared_ptr<ov::
return {std::make_shared<
ZeroRemoteTensor>(context, _initStructs, element_type, shape, config, tensor_type, mem_type, mem)};
};

/**
 * @brief Creates a tensor backed by a host-accessible Level Zero buffer.
 * @param context Remote context the allocation belongs to.
 * @param element_type Element type of the tensor.
 * @param shape Shape of the tensor.
 * @param config Plugin configuration forwarded to the allocation.
 * @return Wrapped ZeroHostTensor owning the L0 host buffer.
 */
ov::SoPtr<ov::ITensor> ZeroDevice::createHostTensor(std::shared_ptr<ov::IRemoteContext> context,
                                                    const ov::element::Type& element_type,
                                                    const ov::Shape& shape,
                                                    const Config& config) {
    return {std::make_shared<ZeroHostTensor>(context, _initStructs, element_type, shape, config)};
}  // no trailing ';' — the stray semicolon after the definition trips -Wextra-semi
48 changes: 48 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "zero_host_tensor.hpp"

#include "openvino/runtime/intel_npu/remote_properties.hpp"

namespace intel_npu {

/// @brief Builds the host tensor by allocating its backing store through a
/// ZeroRemoteTensor configured with TensorType::BINDED and
/// MemType::L0_INTERNAL_BUF — per this change's intent, a host L0 buffer.
ZeroHostTensor::ZeroHostTensor(std::shared_ptr<ov::IRemoteContext> context,
                               std::shared_ptr<ZeroInitStructsHolder> init_structs,
                               const ov::element::Type element_type,
                               const ov::Shape& shape,
                               const Config& config)
    : m_impl(std::make_shared<ZeroRemoteTensor>(context,
                                                init_structs,
                                                element_type,
                                                shape,
                                                config,
                                                ov::intel_npu::TensorType::BINDED,
                                                ov::intel_npu::MemType::L0_INTERNAL_BUF)) {}

/// @brief Returns the raw host pointer of the underlying Level Zero buffer.
/// @note The element type argument is ignored; the pointer covers the whole
///       allocation regardless of the requested type.
void* ZeroHostTensor::data(const ov::element::Type&) const {
    // at() instead of an unchecked find()->second: if the remote tensor ever
    // fails to expose its memory handle this throws std::out_of_range rather
    // than dereferencing the end iterator (undefined behavior).
    return m_impl->get_properties().at(ov::intel_npu::mem_handle.name()).as<void*>();
}

/// @brief Element type, forwarded from the underlying remote tensor.
const ov::element::Type& ZeroHostTensor::get_element_type() const {
    return m_impl->get_element_type();
}

/// @brief Shape, forwarded from the underlying remote tensor.
const ov::Shape& ZeroHostTensor::get_shape() const {
    return m_impl->get_shape();
}

/// @brief Strides, forwarded from the underlying remote tensor.
const ov::Strides& ZeroHostTensor::get_strides() const {
    return m_impl->get_strides();
}

/// @brief Reshapes the tensor by delegating to the underlying remote tensor.
void ZeroHostTensor::set_shape(ov::Shape new_shape) {
    m_impl->set_shape(new_shape);
}

/// @brief Exposes the wrapped ZeroRemoteTensor so callers can reach the
/// Level Zero-specific properties of the allocation.
std::shared_ptr<ZeroRemoteTensor> ZeroHostTensor::get_impl() const {
    return m_impl;
}

} // namespace intel_npu
57 changes: 36 additions & 21 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,13 +402,26 @@ void ZeroInferRequest::set_tensor_data(std::shared_ptr<ov::ITensor> tensor, cons
if (setTensorData) {
_tensorsData[name] = TensorData{_copyAllTensors.at(name)->data(),
_copyAllTensors.at(name)->get_byte_size(),
levelZeroTensorCreatedLocally,
!_createPipeline};
_updateCommandList = true;
levelZeroTensorCreatedLocally};

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");

intel_npu::ZeroExecutor::ArgumentDescriptor desc;
if (isParameter) {
desc = _executor->inputs_desc_map().at(name);
} else {
desc = _executor->outputs_desc_map().at(name);
}

_pipeline->updateCommandList(_tensorsData[name], desc.idx, _batchSize);
}
}
}

void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor> tensor, const std::string& name) {
void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor> tensor,
const std::string& name,
bool isParameter) {
auto l0_context = reinterpret_cast<ze_context_handle_t>(
extract_object(tensor->get_context()->get_property(), ov::intel_npu::l0_context));
if (_initStructs->getContext() != l0_context) {
Expand All @@ -421,8 +434,20 @@ void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor>
}

_copyAllTensors[name] = tensor;
_tensorsData[name] = TensorData{data, tensor->get_byte_size(), false, !_createPipeline};
_updateCommandList = true;
_tensorsData[name] = TensorData{data, tensor->get_byte_size(), false};

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");

intel_npu::ZeroExecutor::ArgumentDescriptor desc;
if (isParameter) {
desc = _executor->inputs_desc_map().at(name);
} else {
desc = _executor->outputs_desc_map().at(name);
}

_pipeline->updateCommandList(_tensorsData[name], desc.idx, _batchSize);
}
}

void ZeroInferRequest::set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) {
Expand All @@ -444,7 +469,9 @@ void ZeroInferRequest::set_tensor(const ov::Output<const ov::Node>& port, const
ov::op::util::is_parameter(port.get_node()));
} else {
_logger.debug("ZeroInferRequest::set_tensor - set new remote tensor");
set_remote_tensor_data(remoteTensor, port.get_node()->get_friendly_name());
set_remote_tensor_data(remoteTensor,
port.get_node()->get_friendly_name(),
ov::op::util::is_parameter(port.get_node()));
}
}
}
Expand Down Expand Up @@ -489,23 +516,11 @@ void ZeroInferRequest::infer_async() {
OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "infer_async");

_executor->mutexLock();

if (_createPipeline) {
if (!_pipelineIsCreated) {
create_pipeline();

_createPipeline = false;
_updateCommandList = false;
_pipelineIsCreated = true;
}

if (_initStructs->getMutableCommandListVersion()) {
if (_updateCommandList) {
_logger.debug("ZeroInferRequest::infer_async - update command list");
_pipeline->updateCommandList(_tensorsData, _batchSize);

_updateCommandList = false;
}
}

_executor->mutexUnlock();

for (const std::string& name : _inputAndStateInputNames) {
Expand Down
59 changes: 5 additions & 54 deletions src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ struct DiscretePipeline final : public Pipeline {
}
};

void updateCommandList(std::unordered_map<std::string, TensorData>&, size_t) override{};
void updateCommandList(const TensorData&, uint32_t, size_t) override {}

private:
const Config _config;
Expand Down Expand Up @@ -274,60 +274,11 @@ struct IntegratedPipeline final : public Pipeline {
_logger.debug("IntegratedPipeline - rest() completed");
};

void updateCommandList(std::unordered_map<std::string, TensorData>& tensors_data, size_t batch_size) override {
std::vector<ze_mutable_graph_argument_exp_desc_t> mutable_argument_desc;
int32_t changed_tensors = 0;

for (const auto& desc : tensors_data) {
if (desc.second.changed == true) {
changed_tensors++;
}
}

mutable_argument_desc.reserve(changed_tensors);

auto set_mutable_desc =
[&](int32_t mutable_desc_index, uint64_t command_list_id, uint32_t arg_index, const void* arg_value) {
mutable_argument_desc.emplace_back(ze_mutable_graph_argument_exp_desc_t{
ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC,
mutable_desc_index ? &mutable_argument_desc.at(mutable_desc_index - 1) : nullptr,
command_list_id,
arg_index,
arg_value});
};

/// @brief Rebinds one graph argument in every per-batch command list.
/// @param tensors_data Base pointer and byte size of the new tensor buffer.
/// @param index Graph argument index to update (input or output slot).
/// @param batch_size Number of command lists / batch slices to patch.
void updateCommandList(const TensorData& tensors_data, uint32_t index, size_t batch_size) override {
    for (size_t i = 0; i < batch_size; i++) {
        // Command list i is pointed at the i-th slice of the buffer; this
        // assumes tensors_data.size covers the full batch and divides evenly
        // by batch_size — TODO(review): confirm with the callers.
        _command_lists.at(i)->updateMutableCommandList(
            index,
            static_cast<unsigned char*>(tensors_data.mem) + (i * tensors_data.size) / batch_size);
        // Re-close the list after mutation so it is executable again.
        _command_lists.at(i)->close();
    }
};
Expand Down
15 changes: 10 additions & 5 deletions src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,16 @@ CommandList::~CommandList() {
_log.error("zeCommandListDestroy failed %#X", uint64_t(result));
}
}
void CommandList::updateMutableCommandList(const void* pNext) const {
ze_mutable_commands_exp_desc_t mutable_commands_exp_desc_t = {
static_cast<ze_structure_type_t>(ZE_MUTABLE_COMMAND_EXP_FLAG_GRAPH_ARGUMENT),
pNext,
0};
void CommandList::updateMutableCommandList(uint32_t arg_index, const void* arg_value) const {
ze_mutable_graph_argument_exp_desc_t desc = {ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC,
nullptr,
_command_id,
arg_index,
arg_value};

ze_mutable_commands_exp_desc_t mutable_commands_exp_desc_t = {ZE_STRUCTURE_TYPE_MUTABLE_COMMANDS_EXP_DESC,
&desc,
0};

zeroUtils::throwOnFail("zeCommandListUpdateMutableCommandsExp",
zeCommandListUpdateMutableCommandsExp(_handle, &mutable_commands_exp_desc_t));
Expand Down
Loading

0 comments on commit ffc135c

Please sign in to comment.