Skip to content

Commit

Permalink
[NPU] Adding support for the remote tensor feature - [Part II - leftovers] (#25572)
Browse files Browse the repository at this point in the history

### Details:
- *Allocate a Host L0 buffer when the create host tensor method is used*
- *Do not chain the mutable descriptor, call the update mutable command
list after each descriptor instead*
- *When using an older ze_loader, throw an error only if the unsupported
functions are actually going to be called*
 - *Call updateMutableCommandList after set_tensor method is used*
 - *Add extra test cases for batching flow using MCL*


### Tickets:
 - *EISW-131915*
  • Loading branch information
pereanub authored Jul 19, 2024
1 parent 56a8c7e commit ffc135c
Show file tree
Hide file tree
Showing 18 changed files with 331 additions and 99 deletions.
5 changes: 5 additions & 0 deletions src/plugins/intel_npu/src/al/include/npu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ class IDevice : public std::enable_shared_from_this<IDevice> {
ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF,
void* mem = nullptr);

virtual ov::SoPtr<ov::ITensor> createHostTensor(std::shared_ptr<ov::IRemoteContext> context,
const ov::element::Type& element_type,
const ov::Shape& shape,
const Config& config);

protected:
virtual ~IDevice() = default;
};
Expand Down
7 changes: 7 additions & 0 deletions src/plugins/intel_npu/src/al/src/npu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,11 @@ ov::SoPtr<ov::IRemoteTensor> IDevice::createRemoteTensor(std::shared_ptr<ov::IRe
OPENVINO_THROW("Create Remote Tensor is not supported");
}

// Default implementation for devices that do not support host tensors.
// Backends with real support (e.g. ZeroDevice) override this method; the
// base class rejects the request unconditionally.
ov::SoPtr<ov::ITensor> IDevice::createHostTensor(std::shared_ptr<ov::IRemoteContext>,
                                                 const ov::element::Type&,
                                                 const ov::Shape&,
                                                 const Config&) {
    OPENVINO_THROW("Create Host Tensor is not supported");
}

} // namespace intel_npu
5 changes: 5 additions & 0 deletions src/plugins/intel_npu/src/backend/include/zero_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ class ZeroDevice : public IDevice {
ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF,
void* mem = nullptr) override;

ov::SoPtr<ov::ITensor> createHostTensor(std::shared_ptr<ov::IRemoteContext> context,
const ov::element::Type& element_type,
const ov::Shape& shape,
const Config& config) override;

ZeroDevice& operator=(const ZeroDevice&) = delete;
ZeroDevice(const ZeroDevice&) = delete;

Expand Down
39 changes: 39 additions & 0 deletions src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "intel_npu/al/config/config.hpp"
#include "openvino/runtime/itensor.hpp"
#include "zero_init.hpp"
#include "zero_remote_tensor.hpp"

namespace intel_npu {

/**
 * @brief ov::ITensor implementation backed by a Level Zero allocation.
 *
 * Every operation is forwarded to an internal ZeroRemoteTensor created with
 * TensorType::BINDED and MemType::L0_INTERNAL_BUF (see zero_host_tensor.cpp),
 * i.e. a host L0 buffer, so the pointer returned by data() can be used by
 * host code while remaining consumable by the NPU runtime.
 */
class ZeroHostTensor : public ov::ITensor {
public:
    /// @brief Allocates the backing Level Zero buffer through a ZeroRemoteTensor.
    ZeroHostTensor(std::shared_ptr<ov::IRemoteContext> context,
                   std::shared_ptr<ZeroInitStructsHolder> init_structs,
                   const ov::element::Type element_type,
                   const ov::Shape& shape,
                   const Config& config);

    ~ZeroHostTensor() override = default;

    /// @brief Raw host pointer of the allocation; the element type argument is ignored.
    void* data(const ov::element::Type& element_type) const override;
    const ov::element::Type& get_element_type() const override;

    const ov::Shape& get_shape() const override;

    const ov::Strides& get_strides() const override;

    void set_shape(ov::Shape new_shape) override;

    /// @brief Access to the wrapped remote tensor (Level Zero specific properties).
    std::shared_ptr<ZeroRemoteTensor> get_impl() const;

private:
    // Sole data member; all ITensor calls delegate to this remote tensor.
    std::shared_ptr<ZeroRemoteTensor> m_impl;
};

} // namespace intel_npu
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ class ZeroInferRequest final : public SyncInferRequest {
* @brief Check the received remote tensor and copy it to the Level Zero tensor
* @param tensor Reference to a tensor.
* @param name Friendly name of the tensor.
* @param isParameter True if tensor is a parameter.
*/
void set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor> tensor, const std::string& name);
void set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor> tensor, const std::string& name, bool isParameter);

void check_network_precision(const ov::element::Type_t precision) const override;
void create_pipeline();
Expand All @@ -77,8 +78,7 @@ class ZeroInferRequest final : public SyncInferRequest {
// specific operations on the plugin in this case.
size_t _batchSize = DEFAULT_BATCH_SIZE;

bool _createPipeline = true;
bool _updateCommandList = false;
bool _pipelineIsCreated = false;
};

} // namespace intel_npu
3 changes: 1 addition & 2 deletions src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ struct TensorData {
void* mem;
size_t size;
bool levelZeroTensorCreatedLocally = true;
bool changed = false;
};

struct Pipeline {
Expand All @@ -32,7 +31,7 @@ struct Pipeline {
virtual void pull(size_t batch_index) = 0;
virtual void reset(size_t batch_index) const = 0;

virtual void updateCommandList(std::unordered_map<std::string, TensorData>& tensors_data, size_t batch_size) = 0;
virtual void updateCommandList(const TensorData& tensors_data, uint32_t index, size_t batch_size) = 0;

protected:
zeroMemory::MemoryManagementUnit _deviceInputs;
Expand Down
5 changes: 1 addition & 4 deletions src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class CommandList {
void appendGraphInitialize(const ze_graph_handle_t& graph_handle) const;
void appendGraphExecute(const ze_graph_handle_t& graph_handle,
const ze_graph_profiling_query_handle_t& profiling_query_handle) const;
void updateMutableCommandList(const void* pNext = nullptr) const;
void updateMutableCommandList(uint32_t arg_index, const void* arg_value) const;
void appendNpuTimestamp(uint64_t* timestamp_buff) const;
void appendBarrier() const;
void close() const;
Expand All @@ -96,9 +96,6 @@ class CommandList {
inline ze_command_list_handle_t handle() const {
return _handle;
}
uint64_t getCommandListId() const {
return _command_id;
}

private:
ze_command_list_handle_t _handle = nullptr;
Expand Down
8 changes: 8 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "intel_npu/al/itt.hpp"
#include "intel_npu/utils/zero/zero_api.hpp"
#include "zero_executor.hpp"
#include "zero_host_tensor.hpp"
#include "zero_infer_request.hpp"
#include "zero_remote_tensor.hpp"
#include "zero_utils.hpp"
Expand Down Expand Up @@ -193,3 +194,10 @@ ov::SoPtr<ov::IRemoteTensor> ZeroDevice::createRemoteTensor(std::shared_ptr<ov::
return {std::make_shared<
ZeroRemoteTensor>(context, _initStructs, element_type, shape, config, tensor_type, mem_type, mem)};
};

/**
 * @brief Creates a tensor backed by a host-accessible Level Zero buffer.
 * @param context Remote context the allocation belongs to.
 * @param element_type Element type of the tensor.
 * @param shape Shape of the tensor.
 * @param config Plugin configuration forwarded to the allocation.
 * @return Wrapped ZeroHostTensor owning the L0 host buffer.
 */
ov::SoPtr<ov::ITensor> ZeroDevice::createHostTensor(std::shared_ptr<ov::IRemoteContext> context,
                                                    const ov::element::Type& element_type,
                                                    const ov::Shape& shape,
                                                    const Config& config) {
    return {std::make_shared<ZeroHostTensor>(context, _initStructs, element_type, shape, config)};
}  // no trailing ';' — the stray semicolon after the definition trips -Wextra-semi
48 changes: 48 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "zero_host_tensor.hpp"

#include "openvino/runtime/intel_npu/remote_properties.hpp"

namespace intel_npu {

/// @brief Builds the host tensor by allocating its backing store through a
/// ZeroRemoteTensor configured with TensorType::BINDED and
/// MemType::L0_INTERNAL_BUF — per this change's intent, a host L0 buffer.
ZeroHostTensor::ZeroHostTensor(std::shared_ptr<ov::IRemoteContext> context,
                               std::shared_ptr<ZeroInitStructsHolder> init_structs,
                               const ov::element::Type element_type,
                               const ov::Shape& shape,
                               const Config& config)
    : m_impl(std::make_shared<ZeroRemoteTensor>(context,
                                                init_structs,
                                                element_type,
                                                shape,
                                                config,
                                                ov::intel_npu::TensorType::BINDED,
                                                ov::intel_npu::MemType::L0_INTERNAL_BUF)) {}

/// @brief Returns the raw host pointer of the underlying Level Zero buffer.
/// @note The element type argument is ignored; the pointer covers the whole
///       allocation regardless of the requested type.
void* ZeroHostTensor::data(const ov::element::Type&) const {
    // at() instead of an unchecked find()->second: if the remote tensor ever
    // fails to expose its memory handle this throws std::out_of_range rather
    // than dereferencing the end iterator (undefined behavior).
    return m_impl->get_properties().at(ov::intel_npu::mem_handle.name()).as<void*>();
}

/// @brief Element type, forwarded from the underlying remote tensor.
const ov::element::Type& ZeroHostTensor::get_element_type() const {
    return m_impl->get_element_type();
}

/// @brief Shape, forwarded from the underlying remote tensor.
const ov::Shape& ZeroHostTensor::get_shape() const {
    return m_impl->get_shape();
}

/// @brief Strides, forwarded from the underlying remote tensor.
const ov::Strides& ZeroHostTensor::get_strides() const {
    return m_impl->get_strides();
}

/// @brief Reshapes the tensor by delegating to the underlying remote tensor.
void ZeroHostTensor::set_shape(ov::Shape new_shape) {
    m_impl->set_shape(new_shape);
}

/// @brief Exposes the wrapped ZeroRemoteTensor so callers can reach the
/// Level Zero-specific properties of the allocation.
std::shared_ptr<ZeroRemoteTensor> ZeroHostTensor::get_impl() const {
    return m_impl;
}

} // namespace intel_npu
57 changes: 36 additions & 21 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,13 +402,26 @@ void ZeroInferRequest::set_tensor_data(std::shared_ptr<ov::ITensor> tensor, cons
if (setTensorData) {
_tensorsData[name] = TensorData{_copyAllTensors.at(name)->data(),
_copyAllTensors.at(name)->get_byte_size(),
levelZeroTensorCreatedLocally,
!_createPipeline};
_updateCommandList = true;
levelZeroTensorCreatedLocally};

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");

intel_npu::ZeroExecutor::ArgumentDescriptor desc;
if (isParameter) {
desc = _executor->inputs_desc_map().at(name);
} else {
desc = _executor->outputs_desc_map().at(name);
}

_pipeline->updateCommandList(_tensorsData[name], desc.idx, _batchSize);
}
}
}

void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor> tensor, const std::string& name) {
void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor> tensor,
const std::string& name,
bool isParameter) {
auto l0_context = reinterpret_cast<ze_context_handle_t>(
extract_object(tensor->get_context()->get_property(), ov::intel_npu::l0_context));
if (_initStructs->getContext() != l0_context) {
Expand All @@ -421,8 +434,20 @@ void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr<ZeroRemoteTensor>
}

_copyAllTensors[name] = tensor;
_tensorsData[name] = TensorData{data, tensor->get_byte_size(), false, !_createPipeline};
_updateCommandList = true;
_tensorsData[name] = TensorData{data, tensor->get_byte_size(), false};

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");

intel_npu::ZeroExecutor::ArgumentDescriptor desc;
if (isParameter) {
desc = _executor->inputs_desc_map().at(name);
} else {
desc = _executor->outputs_desc_map().at(name);
}

_pipeline->updateCommandList(_tensorsData[name], desc.idx, _batchSize);
}
}

void ZeroInferRequest::set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) {
Expand All @@ -444,7 +469,9 @@ void ZeroInferRequest::set_tensor(const ov::Output<const ov::Node>& port, const
ov::op::util::is_parameter(port.get_node()));
} else {
_logger.debug("ZeroInferRequest::set_tensor - set new remote tensor");
set_remote_tensor_data(remoteTensor, port.get_node()->get_friendly_name());
set_remote_tensor_data(remoteTensor,
port.get_node()->get_friendly_name(),
ov::op::util::is_parameter(port.get_node()));
}
}
}
Expand Down Expand Up @@ -489,23 +516,11 @@ void ZeroInferRequest::infer_async() {
OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "infer_async");

_executor->mutexLock();

if (_createPipeline) {
if (!_pipelineIsCreated) {
create_pipeline();

_createPipeline = false;
_updateCommandList = false;
_pipelineIsCreated = true;
}

if (_initStructs->getMutableCommandListVersion()) {
if (_updateCommandList) {
_logger.debug("ZeroInferRequest::infer_async - update command list");
_pipeline->updateCommandList(_tensorsData, _batchSize);

_updateCommandList = false;
}
}

_executor->mutexUnlock();

for (const std::string& name : _inputAndStateInputNames) {
Expand Down
59 changes: 5 additions & 54 deletions src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ struct DiscretePipeline final : public Pipeline {
}
};

void updateCommandList(std::unordered_map<std::string, TensorData>&, size_t) override{};
void updateCommandList(const TensorData&, uint32_t, size_t) override {}

private:
const Config _config;
Expand Down Expand Up @@ -274,60 +274,11 @@ struct IntegratedPipeline final : public Pipeline {
_logger.debug("IntegratedPipeline - rest() completed");
};

void updateCommandList(std::unordered_map<std::string, TensorData>& tensors_data, size_t batch_size) override {
std::vector<ze_mutable_graph_argument_exp_desc_t> mutable_argument_desc;
int32_t changed_tensors = 0;

for (const auto& desc : tensors_data) {
if (desc.second.changed == true) {
changed_tensors++;
}
}

mutable_argument_desc.reserve(changed_tensors);

auto set_mutable_desc =
[&](int32_t mutable_desc_index, uint64_t command_list_id, uint32_t arg_index, const void* arg_value) {
mutable_argument_desc.emplace_back(ze_mutable_graph_argument_exp_desc_t{
ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC,
mutable_desc_index ? &mutable_argument_desc.at(mutable_desc_index - 1) : nullptr,
command_list_id,
arg_index,
arg_value});
};

/// @brief Rebinds one graph argument in every per-batch command list.
/// @param tensors_data Base pointer and byte size of the new tensor buffer.
/// @param index Graph argument index to update (input or output slot).
/// @param batch_size Number of command lists / batch slices to patch.
void updateCommandList(const TensorData& tensors_data, uint32_t index, size_t batch_size) override {
    for (size_t i = 0; i < batch_size; i++) {
        // Command list i is pointed at the i-th slice of the buffer; this
        // assumes tensors_data.size covers the full batch and divides evenly
        // by batch_size — TODO(review): confirm with the callers.
        _command_lists.at(i)->updateMutableCommandList(
            index,
            static_cast<unsigned char*>(tensors_data.mem) + (i * tensors_data.size) / batch_size);
        // Re-close the list after mutation so it is executable again.
        _command_lists.at(i)->close();
    }
};
Expand Down
15 changes: 10 additions & 5 deletions src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,16 @@ CommandList::~CommandList() {
_log.error("zeCommandListDestroy failed %#X", uint64_t(result));
}
}
void CommandList::updateMutableCommandList(const void* pNext) const {
ze_mutable_commands_exp_desc_t mutable_commands_exp_desc_t = {
static_cast<ze_structure_type_t>(ZE_MUTABLE_COMMAND_EXP_FLAG_GRAPH_ARGUMENT),
pNext,
0};
void CommandList::updateMutableCommandList(uint32_t arg_index, const void* arg_value) const {
ze_mutable_graph_argument_exp_desc_t desc = {ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC,
nullptr,
_command_id,
arg_index,
arg_value};

ze_mutable_commands_exp_desc_t mutable_commands_exp_desc_t = {ZE_STRUCTURE_TYPE_MUTABLE_COMMANDS_EXP_DESC,
&desc,
0};

zeroUtils::throwOnFail("zeCommandListUpdateMutableCommandsExp",
zeCommandListUpdateMutableCommandsExp(_handle, &mutable_commands_exp_desc_t));
Expand Down
Loading

0 comments on commit ffc135c

Please sign in to comment.