[NPU] Create compiler adapter class (#27006)
### Details:
- *Create a new CompilerAdapter interface that hides the different
implementations of CIP (compiler-in-plugin) and CID (compiler-in-driver)*
- *iCompiler remains an interface only for CIP. This keeps CIP
(developed in another repository) decoupled from L0* - we still use
NetworkMetadata in the plugin flow; whether it is still needed or can be
removed is to be decided later
- *The Graph object is created by the compiler_adapter*
- *The backend no longer creates or initializes the graph*
- *Move common objects shared by the backend and compiler_adapter to
utils/zero/*
- *Destroy the blob on the import path after the weights are loaded into
NPU memory*
- *Create a new property, NPU_DEFER_WEIGHTS_LOAD, to postpone weights
loading until the creation of the first inference request; by default,
loading is performed right after the model is compiled* (a usage sketch
follows this list)
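
A minimal usage sketch of the new property from application code. This assumes
the `ov::intel_npu::defer_weights_load` property introduced here is reachable
through a public header (it is registered as a non-public option, so the exact
header path and visibility may differ):

```cpp
#include <openvino/openvino.hpp>
// Assumed header exposing ov::intel_npu::defer_weights_load; the real path is
// not shown in this excerpt.
#include "intel_npu/npu_private_properties.hpp"

int main() {
    ov::Core core;
    std::shared_ptr<ov::Model> model = core.read_model("model.xml");

    // Postpone loading the weights into NPU memory; by default they are
    // loaded right after the model is compiled.
    ov::CompiledModel compiled =
        core.compile_model(model, "NPU", ov::intel_npu::defer_weights_load(true));

    // The deferred weights are loaded when the first inference request is created.
    ov::InferRequest request = compiled.create_infer_request();
    return 0;
}
```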

A short description of the new format:
![Screenshot 2024-10-30 151129](https://github.com/user-attachments/assets/89f86c36-f3e8-4906-8394-7cd0ae5617a2)


### Tickets:
 - *CVS-153081*

---------

Signed-off-by: Bogdan Pereanu <bogdan.pereanu@intel.com>
pereanub authored Nov 1, 2024
1 parent af389b4 commit caa1e6a
Showing 73 changed files with 2,620 additions and 2,544 deletions.
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/README.md
@@ -78,7 +78,7 @@ There is currently no support for multiple devices, which means only one level-z

### Inference pipeline

The result of the model compilation is represented through a NetworkDescription. This model description is passed by the plugin to the driver to create a level zero graph instance and obtain a graph handle that can later be used to execute multiple inferences in parallel for the same model. Since the same model instance is shared across all subsequent inference objects, this initialization step is performed by default right after the model is compiled and it can be postponed until the creation of the first inference request through the use of an environment variable: "IE_NPU_CREATE_EXECUTOR" (IE_NPU_CREATE_EXECUTOR=0 to postpone the initialization).
The result of the model compilation is represented through an IGraph object, which contains a valid level zero graph handle that can later be used to execute multiple inferences in parallel for the same model. By default, weights are loaded into the NPU memory right after the model is compiled, but this step can be postponed until the creation of the first inference request through the use of an internal NPU property: "NPU_DEFER_WEIGHTS_LOAD".

Users can create one or more inference requests for a compiled model using OpenVINO API:

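The README's accompanying code snippet is collapsed in this diff view; below is
a minimal sketch (an assumption, not the README's exact example) of creating
several inference requests for one NPU compiled model:

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    ov::CompiledModel compiled_model = core.compile_model("model.xml", "NPU");

    // Several inference requests can be created from the same compiled model
    // and executed in parallel; they share the same underlying graph instance.
    ov::InferRequest request_1 = compiled_model.create_infer_request();
    ov::InferRequest request_2 = compiled_model.create_infer_request();

    request_1.start_async();
    request_2.start_async();
    request_1.wait();
    request_2.wait();
    return 0;
}
```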
25 changes: 3 additions & 22 deletions src/plugins/intel_npu/cmake/features.cmake
@@ -4,29 +4,10 @@

ov_option(ENABLE_MLIR_COMPILER "Enable compilation of npu_mlir_compiler libraries" ON)

ov_option(ENABLE_NPU_RUNTIME_COMMON "Enable compilation of npu runtime common libraries" ON)
ov_option(ENABLE_NPU_PLUGIN_ENGINE "Enable compilation of NPU plugin engine" ON)

# if ENABLE_ZEROAPI_BACKEND=ON, it adds the ze_loader dependency for driver compiler
ov_dependent_option(ENABLE_NPU_PLUGIN_ENGINE "Enable compilation of NPU plugin engine" ON "ENABLE_NPU_RUNTIME_COMMON" OFF)

ov_dependent_option(ENABLE_ZEROAPI_BACKEND "Enable zero-api as a plugin backend" ON "ENABLE_NPU_RUNTIME_COMMON;ENABLE_NPU_PLUGIN_ENGINE" OFF)

ov_dependent_option(ENABLE_DRIVER_COMPILER_ADAPTER "Enable NPU Compiler inside driver" ON "ENABLE_ZEROAPI_BACKEND" OFF)

if((NOT ENABLE_NPU_PLUGIN_ENGINE OR NOT ENABLE_NPU_RUNTIME_COMMON) AND ENABLE_TESTS)
message(FATAL_ERROR "Tests depends on npu plugin engine and npu runtime common libraries!")
endif()

if((NOT ENABLE_NPU_PLUGIN_ENGINE OR NOT ENABLE_NPU_RUNTIME_COMMON) AND ENABLE_ZEROAPI_BACKEND)
message(FATAL_ERROR "Zero backend depends on npu plugin engine and npu common libraries!")
endif()

if(NOT ENABLE_ZEROAPI_BACKEND AND ENABLE_DRIVER_COMPILER_ADAPTER)
message(FATAL_ERROR "Compiler adapter depends on zero backend to use same context!")
endif()

if(NOT BUILD_SHARED_LIBS AND NOT ENABLE_MLIR_COMPILER AND NOT ENABLE_DRIVER_COMPILER_ADAPTER)
message(FATAL_ERROR "No compiler found for static build!")
if(NOT ENABLE_NPU_PLUGIN_ENGINE AND ENABLE_TESTS)
message(FATAL_ERROR "Tests depends on npu plugin engine!")
endif()

ov_dependent_option(ENABLE_IMD_BACKEND "Enable InferenceManagerDemo based NPU AL backend" OFF "NOT WIN32;NOT CMAKE_CROSSCOMPILING" OFF)
13 changes: 2 additions & 11 deletions src/plugins/intel_npu/src/CMakeLists.txt
@@ -9,18 +9,9 @@ add_subdirectory(utils)

add_subdirectory(al)

if (ENABLE_NPU_RUNTIME_COMMON)
if (ENABLE_NPU_PLUGIN_ENGINE)
add_subdirectory(common)
endif()

if(ENABLE_DRIVER_COMPILER_ADAPTER AND ENABLE_ZEROAPI_BACKEND)
add_subdirectory(compiler)
endif()

if(ENABLE_ZEROAPI_BACKEND)
add_subdirectory(compiler_adapter)
add_subdirectory(backend)
endif()

if (ENABLE_NPU_PLUGIN_ENGINE)
add_subdirectory(plugin)
endif()
32 changes: 32 additions & 0 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp
@@ -131,6 +131,38 @@ struct CREATE_EXECUTOR final : OptionBase<CREATE_EXECUTOR, int64_t> {
}
};

//
// DEFER_WEIGHTS_LOAD
//

struct DEFER_WEIGHTS_LOAD final : OptionBase<DEFER_WEIGHTS_LOAD, bool> {
static std::string_view key() {
return ov::intel_npu::defer_weights_load.name();
}

static bool defaultValue() {
return false;
}

static constexpr std::string_view getTypeName() {
return "bool";
}

#ifdef NPU_PLUGIN_DEVELOPER_BUILD
static std::string_view envVar() {
return "OV_NPU_DEFER_WEIGHTS_LOAD";
}
#endif

static bool isPublic() {
return false;
}

static OptionMode mode() {
return OptionMode::RunTime;
}
};

//
// NUM_STREAMS
//
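A sketch of how plugin code might query the option added above, assuming the
intel_npu `Config::get<Option>()` accessor used for the other `OptionBase`-derived
options (the actual call sites are in files not shown here):

```cpp
// Sketch only: DEFER_WEIGHTS_LOAD and Config are taken from
// intel_npu/config/runtime.hpp as added in this PR; the graph call is hypothetical.
#include "intel_npu/config/runtime.hpp"

namespace intel_npu {

void initializeGraphIfNeeded(const Config& config /*, IGraph& graph */) {
    // false (default): load weights into NPU memory right after compilation.
    // true: postpone loading until the first inference request is created.
    if (!config.get<DEFER_WEIGHTS_LOAD>()) {
        // graph.initialize(config);  // hypothetical initialization call
    }
}

}  // namespace intel_npu
```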
162 changes: 1 addition & 161 deletions src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp
@@ -6,128 +6,12 @@

#pragma once

#include <cstddef>
#include <memory>
#include <set>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>

#include "intel_npu/config/config.hpp"
#include "openvino/core/partial_shape.hpp"
#include "openvino/core/type/element_type.hpp"
#include "openvino/runtime/common.hpp"
#include "intel_npu/network_metadata.hpp"
#include "openvino/runtime/profiling_info.hpp"

namespace intel_npu {

/**
* @brief A helper structure used for storing metadata corresponding to one input/output entry.
*/
struct IODescriptor {
/**
* @brief The name of the input/output assigned by the compiler.
* @details This value may differ from other name attributes:
* - The compiler could have created additional inputs/outputs (e.g. for representing states). These are not
* found in the original IR model.
* - The compiler may append indices to names in the case where duplicate names are found.
* @note The prefixes introduced by the compiler in order to differentiate the special cases (e.g. states and shape
* tensors) were removed prior to initializing this field.
*/
std::string nameFromCompiler;

ov::element::Type precision;

ov::PartialShape shapeFromCompiler;

/**
* @brief If set to "true", the current object describes a buffer which may be used for altering a state tensor.
* @details This flag is set if the compiler prefixed the name using a "read value" prefix. The state input and
* state output descriptors are also tied using the "relatedDescriptorIndex" attribute.
*/
bool isStateInput = false;

/**
* @brief If set to "true", the current object describes a buffer which reflects the value of a state tensor.
* @details This flag is set if the compiler prefixed the name using an "assign" prefix. The state input and
* state output descriptors are also tied using the "relatedDescriptorIndex" attribute.
*/
bool isStateOutput = false;

/**
* @brief If set to "true", the buffer of the tensor described here contains as value the shape of the referenced
* tensor.
* @details This flag is set if the compiler prefixed the name using a "shape" prefix.
*
* The referenced tensor bears the same name ("nameFromCompiler"), but its "isShapeTensor" value is set to
* "false". The two descriptors are also tied using the "relatedDescriptorIndex" attribute.
*/
bool isShapeTensor = false;

/**
* @brief Points towards a related descriptor.
* @details The related descriptors are defined by (state input, state output) or (dynamic tensor, shape tensor)
* pairs.
*/
std::optional<size_t> relatedDescriptorIndex;

/**
* @brief The friendly name of the node extracted from the IR model.
* @details In some cases, this field is required for constructing a dummy model which uses the same input/output
* metadata as the original IR model.
*
* This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the
* compiler).
*/
std::string nodeFriendlyName;

/**
* @brief The names of the output tensors extracted from the IR model.
* @details In some cases, this field is required for constructing a dummy model which uses the same input/output
* metadata as the original IR model.
*
* This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the
* compiler).
*/
std::unordered_set<std::string> outputTensorNames;

/**
* @brief The shape extracted from the IR model.
* @details The values may differ from the ones found in "shapeFromCompiler" if batching is to be handled by the
* plugin.
*
* This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added
* by the compiler).
*/
std::optional<ov::PartialShape> shapeFromIRModel = std::nullopt;
};

struct NetworkMetadata final {
std::string name;

std::vector<IODescriptor> inputs;
std::vector<IODescriptor> outputs;
std::vector<IODescriptor> profilingOutputs;

size_t numStreams = 1;

// Used primarily in the CID path to pass the level zero graph handle from compiler to the backend executor
void* graphHandle = nullptr;

/**
* @brief Binds the (state input, state output) and (dynamic tensor, shape tensor) pairs using the
* "relatedDescriptorIndex" attribute.
* @details For state inputs, the "relatedDescriptorIndex" value is set to the index of the output which bears the
* same name. The reverse is also applied.
*
* For shape tensors, the lookup is performed in the same container (inputs or outputs). The value is once again set
* to the index of the entry which bears the same name.
*/
void bindRelatedDescriptors();

}; // namespace intel_npu

/**
* @struct NetworkDescription
* @brief The object returned by the compiler
@@ -138,7 +22,6 @@ struct NetworkDescription final {
NetworkDescription(std::vector<uint8_t>&& compiledNetwork, NetworkMetadata&& metadata)
: compiledNetwork(std::move(compiledNetwork)),
metadata(std::move(metadata)) {}
NetworkDescription(NetworkMetadata&& metadata) : metadata(std::move(metadata)) {}
// Force move semantics to prevent blob copies
NetworkDescription(const NetworkDescription&) = delete;
NetworkDescription(NetworkDescription&&) = default;
Expand All @@ -151,45 +34,13 @@ struct NetworkDescription final {
NetworkMetadata metadata;
};

/**
* @struct CompiledNetwork
* @brief Custom container for compiled network, used for export
* @var CompiledNetwork::data
* Pointer to the address of compiled network
* @var CompiledNetwork::size
* Size of the compiled network
* @var CompiledNetwork::ownedStorage
* Plugin owned compiled network storage that is required in case of a driver that
* doesn't support graph extension 1.7, as in this case plugin must create a copy of the compiled network.
* @note It's unsafe to store either data or size outside of the compiled network object as its destructor
* would release the owning container
*/

struct CompiledNetwork {
const uint8_t* data;
size_t size;
CompiledNetwork(const uint8_t* data, size_t size, std::vector<uint8_t> storage)
: data(data),
size(size),
ownedStorage(std::move(storage)) {}

private:
std::vector<uint8_t> ownedStorage;
};

/**
* @interface ICompiler
* @brief An interface to be implemented by a concrete compiler to provide
* methods for preparing a network for execution on a NPU device
*/
class ICompiler : public std::enable_shared_from_this<ICompiler> {
public:
/**
* @brief Returns the maximum OpenVino opset version supported by the compiler
* @return opset version e.g. 11 for opset11
*/
virtual uint32_t getSupportedOpsetVersion() const = 0;

/**
* @brief Transforms a network from the OpenVINO model representation to a format executable
* by a NPU device
@@ -216,8 +67,6 @@ class ICompiler : public std::enable_shared_from_this<ICompiler> {
* @param config a reference to NPUConfig containing plugin config options
* Note: compilation options will be ignored,
* since the network is already compiled
* @param netName a reference to the string describing network name
* to be used for creating network description
* @return a shared pointer on an object implementing NetworkDescription interface
*/
virtual NetworkMetadata parse(const std::vector<uint8_t>& network, const Config& config) const = 0;
@@ -226,15 +75,6 @@ class ICompiler : public std::enable_shared_from_this<ICompiler> {
const std::vector<uint8_t>& network,
const Config& config) const = 0;

// Driver compiler can use this to release graphHandle, if we do not have executor
virtual void release([[maybe_unused]] std::shared_ptr<const NetworkDescription> networkDescription){};

virtual CompiledNetwork getCompiledNetwork(const NetworkDescription& networkDescription) {
return CompiledNetwork(networkDescription.compiledNetwork.data(),
networkDescription.compiledNetwork.size(),
networkDescription.compiledNetwork);
}

protected:
virtual ~ICompiler() = default;
};
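The new CompilerAdapter class itself lives in files that are not shown in this
excerpt. Based only on the description above (a single interface hiding the CIP
and CID implementations and handing a graph object back to the plugin), a rough,
hypothetical sketch of its shape could look like this; all names and signatures
are illustrative:

```cpp
// Hypothetical sketch; the real compiler_adapter headers are not part of this
// excerpt, so the names and signatures below are assumptions.
#include <memory>
#include <vector>

#include "openvino/core/model.hpp"

namespace intel_npu {

class IGraph;   // assumed: wraps a valid level-zero graph handle
class Config;   // assumed: plugin configuration, as in intel_npu/config/config.hpp

class ICompilerAdapter {
public:
    virtual ~ICompilerAdapter() = default;

    // Compile a model through CIP or CID and return a graph object that the
    // backend can execute; the backend no longer creates the graph itself.
    virtual std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model,
                                            const Config& config) const = 0;

    // Import a precompiled blob; the blob can be destroyed once the weights
    // have been loaded into NPU memory.
    virtual std::shared_ptr<IGraph> parse(std::vector<uint8_t> blob, const Config& config) const = 0;
};

}  // namespace intel_npu
```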