diff --git a/src/inference/dev_api/openvino/runtime/memory_solver.hpp b/src/inference/dev_api/openvino/runtime/memory_solver.hpp index b2e11d203ef50f..a43e02f7a0b43a 100644 --- a/src/inference/dev_api/openvino/runtime/memory_solver.hpp +++ b/src/inference/dev_api/openvino/runtime/memory_solver.hpp @@ -16,6 +16,7 @@ #include #include "openvino/core/except.hpp" +// #include "global_execution_index.hpp" namespace ov { @@ -52,6 +53,7 @@ class MemorySolver { struct Box { /** Execution order index of first use. The data will be produced here. */ int start; + // intel_cpu::GlobalExecutionIndex start; /** * The execution order index of last use. After that data will be released. @@ -59,6 +61,7 @@ class MemorySolver { * end of execution. */ int finish; + // intel_cpu::GlobalExecutionIndex finish; /** Size of data. In abstract unit of measure (byte, simd, cache line, ...) */ int64_t size; diff --git a/src/plugins/intel_cpu/src/allocation_context.hpp b/src/plugins/intel_cpu/src/allocation_context.hpp new file mode 100644 index 00000000000000..8affe814807004 --- /dev/null +++ b/src/plugins/intel_cpu/src/allocation_context.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +namespace ov { +namespace intel_cpu { + +class Node; +class Edge; + +using GlobalExecutionIndex = std::unordered_map, std::pair>; + +struct AllocationContext { + std::vector> edges; + GlobalExecutionIndex execIndex; + std::vector syncPoints; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index bbee5d937be5d5..9f9fed5421b163 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -4,6 +4,7 @@ #include "compiled_model.h" #include "async_infer_request.h" +#include "graph.h" #include "infer_request.h" #include "itt.h" #include 
"low_precision/low_precision.hpp" @@ -19,6 +20,7 @@ #include "openvino/runtime/threading/cpu_streams_info.hpp" #include "openvino/runtime/threading/cpu_message.hpp" #include "utils/serialize.hpp" +#include "memory_control.hpp" #include "cpu/x64/cpu_isa_traits.hpp" #include @@ -52,7 +54,8 @@ CompiledModel::CompiledModel(const std::shared_ptr& model, m_cfg{cfg}, m_name{model->get_name()}, m_loaded_from_cache(loaded_from_cache), - m_sub_memory_manager(sub_memory_manager) { + m_sub_memory_manager(sub_memory_manager), + m_networkMemoryControl(std::make_shared()) { m_mutex = std::make_shared(); const auto& core = m_plugin->get_core(); if (!core) @@ -152,20 +155,26 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const { std::exception_ptr exception; auto makeGraph = [&] { try { + MemoryControl* memoryControl = m_networkMemoryControl->createMemoryControlUnit(); GraphContext::Ptr ctx; { std::lock_guard lock{*m_mutex.get()}; auto isQuantizedFlag = (m_cfg.lpTransformsMode == Config::On) && ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model); - ctx = std::make_shared(m_cfg, m_socketWeights[socketId], isQuantizedFlag, + memoryControl, + m_networkMemoryControl, streamsExecutor, m_sub_memory_manager); } + const std::shared_ptr model = m_model; - graphLock._graph.CreateGraph(model, ctx); + // @todo propagate input / output memory descriptors + graphLock._graph.Init(model, ctx); + // @todo pass input / output memory + graphLock._graph.Activate({}, {}, true); } catch (...) 
{ exception = std::current_exception(); } @@ -346,7 +355,7 @@ void CompiledModel::release_memory() { for (auto&& graph : m_graphs) { GraphGuard::Lock graph_lock{graph}; auto ctx = graph_lock._graph.getGraphContext(); - ctx->getNetworkMemoryControl()->releaseMemory(); + m_networkMemoryControl->releaseMemory(); } } diff --git a/src/plugins/intel_cpu/src/compiled_model.h b/src/plugins/intel_cpu/src/compiled_model.h index faedf1ae5a744c..cab50971f31a78 100644 --- a/src/plugins/intel_cpu/src/compiled_model.h +++ b/src/plugins/intel_cpu/src/compiled_model.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -19,6 +20,8 @@ namespace ov { namespace intel_cpu { +class NetworkMemoryControl; + class CompiledModel : public ov::ICompiledModel { public: typedef std::shared_ptr Ptr; @@ -51,6 +54,10 @@ class CompiledModel : public ov::ICompiledModel { void release_memory() override; + std::shared_ptr get_network_memory_control() const { + return m_networkMemoryControl; + } + private: std::shared_ptr create_sync_infer_request() const override; friend class SyncInferRequest; @@ -91,6 +98,7 @@ class CompiledModel : public ov::ICompiledModel { std::vector> m_sub_compiled_models; std::shared_ptr m_sub_memory_manager = nullptr; + std::shared_ptr m_networkMemoryControl = nullptr; bool m_has_sub_compiled_models = false; }; diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp index c314718bb82416..c5d75bc3d16b62 100644 --- a/src/plugins/intel_cpu/src/edge.cpp +++ b/src/plugins/intel_cpu/src/edge.cpp @@ -235,7 +235,7 @@ Edge::ReorderStatus Edge::needReorder() { } void Edge::reuse(MemoryPtr ptr) { - OPENVINO_ASSERT(ptr != nullptr, "Attempt to reuse initialized memory in " + name()); + OPENVINO_ASSERT(ptr != nullptr, "Attempt to reuse uninitialized memory in " + name()); memoryPtr = ptr; changeStatus(Status::Allocated); @@ -292,7 +292,11 @@ std::string Edge::name() const { std::stringstream result; - result << parentPtr->getName() << " port " << 
parent_port << " <-> " << childPtr->getName() << " port " << child_port; + result << parentPtr->getName()<< " port " << parent_port + << " <-> " + << childPtr->getName() << " port " << child_port + << " status: " + << static_cast(getStatus()); return result.str(); } @@ -441,7 +445,7 @@ void Edge::validate() { getChild(); if (status != Status::Allocated || !memoryPtr) { - OPENVINO_THROW("Error memory is not allocated!"); + OPENVINO_THROW("Error memory is not allocated for edge: ", name()); } status = Status::Validated; } diff --git a/src/plugins/intel_cpu/src/edge.h b/src/plugins/intel_cpu/src/edge.h index 29cb8113943cd3..e77a5cecf89aeb 100644 --- a/src/plugins/intel_cpu/src/edge.h +++ b/src/plugins/intel_cpu/src/edge.h @@ -82,6 +82,7 @@ class Edge { } std::string name() const; + const MemoryDesc& getDesc() const; private: std::weak_ptr parent; @@ -99,7 +100,6 @@ class Edge { PortDescBaseCPtr getInputPortDesc() const; PortDescBaseCPtr getOutputPortDesc() const; - const MemoryDesc& getDesc() const; bool enforceReorder(); void collectConsumers(std::vector>& result) const; diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index f3f3a379fc2af7..a552cf616724d4 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -6,6 +6,9 @@ #include #include +#include +#include +#include #include #include #include @@ -16,11 +19,14 @@ #include #include +#include "allocation_context.hpp" #include "edge.h" +#include "graph_context.h" #include "graph_dumper.h" #include "graph_optimizer.h" #include "infer_request.h" #include "itt.h" +#include "memory_control.hpp" #include "memory_desc/cpu_memory_desc_utils.h" #include "memory_desc/dnnl_blocked_memory_desc.h" #include "node.h" @@ -350,16 +356,23 @@ static void UseExternalOutputMemory(const std::map& output } void Graph::Activate(const std::vector& externalInputMemory, - const std::vector& externalOutputMemory) { - OPENVINO_ASSERT(status == Status::Initialized, 
"Invalid graph status"); + const std::vector& externalOutputMemory, + bool globalAllocation) { + // OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); - const bool hasDynNodes = ProcessDynNodes(); - const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; + // const bool hasDynNodes = ProcessDynNodes(); + // const auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; UseExternalInputMemory(inputNodesMap, externalInputMemory); UseExternalOutputMemory(outputNodesMap, externalOutputMemory); - Allocate(syncNodesInds); + // std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + // status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) + // : Status::ReadyStatic; + + // CPU_DEBUG_CAP_ENABLE(serialize(*this)); + Allocate(globalAllocation); CreatePrimitivesAndExecConstants(); @@ -369,22 +382,22 @@ void Graph::Activate(const std::vector& externalInputMemory, } #endif - std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); - - if (hasDynNodes) { - status = Status::ReadyDynamic; - // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec - // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context - // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also - // this rule works for short graphs (usually subgraphs) when the amount of nodes is to low to process them in - // parallel. 
- const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); - if (exec2sync < 10 || parallel_get_max_threads() < 2) { - status = Status::ReadyDynamicSeq; - } - } else { - status = Status::ReadyStatic; - } + // std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + // if (hasDynNodes) { + // status = Status::ReadyDynamic; + // // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec + // // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context + // // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also + // // this rule works for short graphs (usually subgraphs) when the amount of nodes is to low to process them in + // // parallel. + // const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); + // if (exec2sync < 10 || parallel_get_max_threads() < 2) { + // status = Status::ReadyDynamicSeq; + // } + // } else { + // status = Status::ReadyStatic; + // } CPU_DEBUG_CAP_ENABLE(serialize(*this)); } @@ -713,213 +726,141 @@ void Graph::ResolveComplexInplaceConflicts() { } } -static inline bool isConstOutput(EdgePtr edge) { - return edge->getParent()->isConstant() && !edge->getChild()->isConstant(); -} - -void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { - edgeClusters edge_clusters = MemoryControl::findEdgeClusters(graphEdges); - - size_t remaining_edge_clusters_count = edge_clusters.size(); - - // Resolve special cases: - for (size_t i = 0; i < remaining_edge_clusters_count;) { - auto &cluster = edge_clusters[i]; - bool erase = false; - for (auto &edge : cluster) { - // Remove already allocated edges from the mem reuse algo - if (edge->getStatus() == Edge::Status::Allocated) { - erase = true; - break; - } - - // Special allocation for string tensors - if 
(edge->getDesc().getPrecision() == element::string && edge->getStatus() == Edge::Status::NeedAllocation) { - StringMemory::StringMemoryBlockPtr memBlcok; - if (edge->getParent()->isConstant()) { - if (edge->getParent()->getType() == Type::Input) { - auto constNode = static_cast(edge->getParent().get()); - edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); - } else { - edge->externalAllocate(m_context->getWeightsCache()); - } - auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); - OPENVINO_ASSERT(stringMemory, "[CPU] Edge between nodes '", - edge->getParent()->getName(), "' and '", edge->getChild()->getName(), "' must have StringMemory."); - memBlcok = stringMemory->getStringMemoryBlockPtr(); - } else { - auto memory = std::make_shared(getEngine(), edge->getDesc()); - edge->reuse(memory); - memBlcok = memory->getStringMemoryBlockPtr(); - } - for (auto& edge_c : cluster) { - if (edge_c == edge) { - continue; - } - OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string, "All edges in the cluster must be string."); - if (edge_c->getStatus() == Edge::Status::NotAllocated) { - auto memory = std::make_shared(getEngine(), edge_c->getDesc(), memBlcok); - edge_c->reuse(memory); - } else { - OPENVINO_THROW("[CPU] String tensors allocation in the cluster. Edge between nodes '", edge_c->getParent()->getName(), "' and '", - edge_c->getChild()->getName(), "' has an unexpected status: ", static_cast(edge_c->getStatus())); - } - } - erase = true; - continue; - } - - // Special allocation for constants - if (edge->getStatus() != Edge::Status::NeedAllocation || !edge->getParent()->isConstant()) { - continue; - } +/** + * Partition the \p clusters of Edges, by moving and allocating at the same time + * the clusters which cannot be handled as part of generic memory solver algorithm. 
+ * Such clusters meet one of the following criteria: + * - base edge of a cluster is already Allocated + * - base edge of a cluster is a "ov::element::string" type of edge + * - base edge of a cluster is a Constant edge + * + * @return a remaining number of clusters to process (left partition) + */ +static size_t AllocateStringsAndConstants(EdgeClusters& clusters, + const GraphContext::CPtr context) { + auto allocateStringMemory = [context](const EdgePtr& edge) { + if (edge->getParent()->isConstant()) { if (edge->getParent()->getType() == Type::Input) { - auto constNode = std::static_pointer_cast(edge->getParent()); + auto constNode = static_cast(edge->getParent().get()); edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); } else { - edge->externalAllocate(m_context->getWeightsCache()); - } - erase = true; - } - - if (erase) { - std::swap(edge_clusters[i], edge_clusters[remaining_edge_clusters_count - 1]); - --remaining_edge_clusters_count; - } else { - ++i; - } - } - - // Markup the memory regions - std::vector memoryRegions; - memoryRegions.reserve(remaining_edge_clusters_count); - - for (size_t i = 0; i < remaining_edge_clusters_count; ++i) { - MemoryRegion reg = {std::numeric_limits::max(), - 0, - 0, - static_cast(i), - MemoryRegion::RegionType::VARIABLE, - MemoryRegion::AllocType::UNKNOWN}; - - int64_t boxSize = 0; - bool isConst = false, isOutput = false, isInput = false; - for (auto &edge : edge_clusters[i]) { - int e_start = edge->getParent()->getExecIndex(); - int e_finish = edge->getChild()->getExecIndex(); - - auto&& desc = edge->getDesc(); - - if (boxSize != -1 && desc.isDefined()) { - int64_t e_size = desc.getCurrentMemSize(); // size in bytes (from the beginning of data to the last element) - boxSize = std::max(e_size, boxSize); - } else { - boxSize = -1; - } - - reg.start = std::min(e_start, reg.start); - reg.finish = std::max(e_finish, reg.finish); - - auto allocType = - desc.getPrecision() == element::string ? 
MemoryRegion::AllocType::STRING : MemoryRegion::AllocType::POD; - - if (reg.alloc_type != allocType && MemoryRegion::AllocType::UNKNOWN != reg.alloc_type) { - OPENVINO_THROW("Different allocation types in the same memory region"); - } - reg.alloc_type = allocType; - - isConst |= isConstOutput(edge); - isOutput |= edge->getChild()->getType() == Type::Output; - isInput |= edge->getParent()->getType() == Type::Input; - } - - reg.size = boxSize; - - if (isConst) { - reg.type = MemoryRegion::RegionType::CONSTANT; - } else if (isInput) { - if (isOutput) { - reg.type = MemoryRegion::RegionType::IO; - } else { - reg.type = MemoryRegion::RegionType::INPUT; + edge->externalAllocate(context->getWeightsCache()); } - } else if (isOutput) { - reg.type = MemoryRegion::RegionType::OUTPUT; + auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); + OPENVINO_ASSERT(stringMemory, "[CPU] Edge between nodes '", + edge->getParent()->getName(), "' and '", edge->getChild()->getName(), "' must have StringMemory."); + return stringMemory->getStringMemoryBlockPtr(); } - memoryRegions.push_back(reg); - } + auto memory = std::make_shared(context->getEngine(), edge->getDesc()); + edge->reuse(memory); + return memory->getStringMemoryBlockPtr(); + }; - // special processing of the dynamic output edges - auto it = std::remove_if(memoryRegions.begin(), memoryRegions.end(), [&](const MemoryRegion& region) { - if (region.size >= 0 || !one_of(region.type, MemoryRegion::RegionType::OUTPUT, MemoryRegion::RegionType::IO)) { - return false; - } - bool result = false; - for (auto& edge : edge_clusters[region.id]) { - auto child = edge->getChild(); - if (child->getType() == Type::Output && edge->getStatus() == Edge::Status::NeedAllocation) { - auto proxyMemBlock = std::make_shared(); - DEBUG_LOG("ProxyMemoryBlock ", proxyMemBlock, " ", this); - edge->allocate(proxyMemBlock); - - // Store the output memory blocks. - // So that, the infer requests can be able to access them. 
- int count = 0; - for (auto& output : outputNodesMap) { - if (output.second == child) { - outputNodesMemBlocksMap[output.first] = proxyMemBlock; - count++; - } - } - // sometimes there are unused output ports. - OPENVINO_ASSERT(count <= 1, "CPU plugin cannot find output node. count ", count); - result = true; - } + auto allocateConstantEdge = [context](const EdgePtr& edge) { + if (edge->getParent()->getType() == Type::Input) { + auto constNode = std::static_pointer_cast(edge->getParent()); + edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); + } else { + edge->externalAllocate(context->getWeightsCache()); } - return result; - }); - - memoryRegions.erase(it, memoryRegions.end()); + }; - //Set up the memory control subsystem. - this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(syncNodesInds)); - auto memoryBlocks = m_pMemoryControl->insert(memoryRegions); + auto endOfNotAllocatedPartition = + std::partition(clusters.begin(), clusters.end(), + [&allocateStringMemory, &allocateConstantEdge, &context](const EdgeCluster& cluster) { + if (cluster.empty()) return false; + + auto baseEdgeIt = std::find_if(cluster.begin(), cluster.end(), [](const EdgePtr& edge) { + return one_of(edge->getStatus(), Edge::Status::Allocated, Edge::Status::NeedAllocation); + }); + + OPENVINO_ASSERT(baseEdgeIt != cluster.end(), "Unexpected cluster state"); + + // const auto& baseEdge = cluster.front(); + const auto& baseEdge = *baseEdgeIt; + // Skip already allocated cluster + if (baseEdge->getStatus() == Edge::Status::Allocated) { + return false; + } + + // Skip if the baseEdge does not require allocation + if (baseEdge->getStatus() != Edge::Status::NeedAllocation) { + return true; + } + + // Allocate a string cluster + if (baseEdge->getDesc().getPrecision() == element::string) { + OPENVINO_ASSERT(std::all_of(cluster.begin(), cluster.end(), + [](const EdgePtr& edge) { + return edge->getDesc().getPrecision() == element::string; + }), 
"All edges in the cluster must be string."); + auto memBlock = allocateStringMemory(baseEdge); + for (auto &edge : cluster) { + if (edge->getStatus() == Edge::Status::NotAllocated) { + edge->reuse(std::make_shared(context->getEngine(), edge->getDesc(), memBlock)); + } + } + return false; + } + + // Allocate a constant cluster + if (baseEdge->getParent()->isConstant()) { + // @todo can we add some meaningful assert here? + for (auto &edge : cluster) { + if (edge->getParent()->isConstant() && edge->getStatus() == Edge::Status::NeedAllocation) { + allocateConstantEdge(edge); + } + } + return false; + } + + return true; + }); + + return std::distance(clusters.begin(), endOfNotAllocatedPartition); +} - // attach all the not yet allocated edges to the memory contol +static void attachEdgeToMemoryControl(const EdgeClusters& edgeClusters, + const MemoryControl::MemoryBlockMap& memoryBlocks) { + // attach all the not yet allocated edges to the memory control for (auto&& item : memoryBlocks) { int count = 0; - for (auto&& edge : edge_clusters[item.first]) { + // std::cout << "Processing cluster: " << item.first << "\n"; + for (auto&& edge : edgeClusters[item.first]) { + // std::cout << "Processing edge: " << edge->name() << "\n"; if (edge->getStatus() == Edge::Status::NeedAllocation) { + // std::cout << "Allocating edge: " << edge->name() << "\n"; + edge->allocate(item.second); // TODO: WA for some test (like strided_slice_test) which use tensors with // shapes {0}. And it is implicitly converted into {1} tensor. // Zeroing of input data allow pass tests. - if (edge->getParent()->type == Type::Input && edge->hasDefinedMaxSize()) + if (edge->getParent()->getType() == Type::Input && edge->hasDefinedMaxSize()) edge->getMemoryPtr()->nullify(); count++; } } - OPENVINO_ASSERT(count == 1); + OPENVINO_ASSERT(count == 1, "Expected exactly one allocation. 
Actual number of allocations: ", count); } +} - m_pMemoryControl->allocateMemory(); - - // Resolve all other edges with status NotAllocated and in-place - for (auto& cluster : edge_clusters) { +static void resolveInPlaceEdges(const EdgeClusters& clusters) { + for (auto& cluster : clusters) { for (auto& edge : cluster) { if (edge->getStatus() != Edge::Status::NotAllocated) { continue; } + std::vector edges_to_process; edges_to_process.push_back(edge); - for (auto next_edge = edge->getSharedEdge(std::nothrow); - next_edge; - next_edge = next_edge->getSharedEdge(std::nothrow)) { + for (auto next_edge = edge->getSharedEdge(std::nothrow); next_edge; + next_edge = next_edge->getSharedEdge(std::nothrow)) { edges_to_process.push_back(next_edge); } + std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) { if (edge->getStatus() == Edge::Status::NotAllocated) { if (edge->inPlace(Edge::LOOK_DOWN)) { @@ -938,11 +879,37 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { } } -void Graph::Allocate(const std::vector& syncNodesInds) { - OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::Allocate"); +std::vector Graph::CreateExecutionGraph() { + const bool hasDynNodes = ProcessDynNodes(); + auto syncNodesInds = hasDynNodes ? IdentifySyncPoints(graphNodes) : std::vector{}; + + std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = + ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) + : Status::ReadyStatic; + + if (hasDynNodes) { + status = Status::ReadyDynamic; + // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec + // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context + // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. 
Also + // this rule works for short graphs (usually subgraphs) when the amount of nodes is too low to process them in + // parallel. + const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); + if (exec2sync < 10 || parallel_get_max_threads() < 2) { + status = Status::ReadyDynamicSeq; + } + } else { + status = Status::ReadyStatic; + } + + return syncNodesInds; +} - //resolve inplace dead end nodes - for (const auto& edge : graphEdges) { +static void ResolveInOutInPlaceEdgesLegacy(const std::vector& edges) { + for (const auto& edge : edges) { + // std::cout << edge->name() << "\n"; if (edge->getStatus() == Edge::Status::Uninitialized) { if (edge->getParent()->getParentEdges().empty() && one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) && @@ -955,20 +922,127 @@ void Graph::Allocate(const std::vector& syncNodesInds) { } } } +} + +static void ResolveInOutInPlaceEdges(const std::vector& edges) { + for (const auto& edge : edges) { + if (edge->getStatus() == Edge::Status::Uninitialized) { + if (edge->getParent()->getParentEdges().empty() && + one_of(edge->getParent()->getType(), Type::MemoryInput) && + edge->inPlace(Edge::LOOK_UP)) { + edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); + } else if (edge->getChild()->getChildEdges().empty() && + one_of(edge->getChild()->getType(), Type::MemoryOutput) && + edge->inPlace(Edge::LOOK_DOWN)) { + edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); + } + } + } +} + +// Registers the nodes (execution indexes), sync points and edges of this graph in \p context. +// Returns the updated execution index offset. +int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) { + auto syncNodesInds = CreateExecutionGraph(); + + ResolveInOutInPlaceEdges(graphEdges); + + for (size_t i = 0, j = 0; i < graphNodes.size(); i++) { + const auto& node = graphNodes[i]; + const auto inputExecIndex = i + offset; + offset = node->registerToAllocationContext(offset, context) - 1; + const auto outputExecIndex = i + offset; + context.execIndex[node] = {inputExecIndex, outputExecIndex}; + + if (j < syncNodesInds.size() 
&& syncNodesInds[j] == i) { + context.syncPoints.push_back(inputExecIndex); + // move on to the next sync point (syncNodesInds is assumed to be sorted in ascending order) + j++; + } + } + + context.edges.insert(context.edges.end(), graphEdges.begin(), graphEdges.end()); + + return offset; +} + +AllocationContext Graph::CreateAllocationContext(bool global) { + AllocationContext allocationContext; + + if (global) { + RegisterToAllocationContext(-1, allocationContext); + } else { // local allocation context. Used for the nodes with inner graph which are not updated yet + ResolveInOutInPlaceEdgesLegacy(graphEdges); + + auto syncNodesInds = CreateExecutionGraph(); + + for (size_t i = 0; i < graphNodes.size(); i++) { + const auto& node = graphNodes[i]; + const int inputExecIndex = i; + const int outputExecIndex = i; + allocationContext.execIndex[node] = {inputExecIndex, outputExecIndex}; + } + + allocationContext.edges = graphEdges; + allocationContext.syncPoints = syncNodesInds; + } - // resolve edges. Define which will be a view on others - // NeedAllocation - real blob - // NotAllocated - view on other blob, peer or in-place - for (auto& edge : graphEdges) edge->init(); + return allocationContext; +} + +void Graph::Allocate(bool globalAllocation) { + if (std::getenv("FORCE_LOCAL")) + globalAllocation = false; + // Set up the memory control subsystem. + auto memoryControl = globalAllocation ? 
m_context->getMemoryControl() : m_context->getNetworkMemoryControl()->createMemoryControlUnit(); + + if (memoryControl->allocated()) { + // std::cout << "Memory is already allocated for a subgraph: " << _name << "\n"; + return; + } + + // @todo collect syncNodesInds with respect to global context as well + auto allocationContext = CreateAllocationContext(globalAllocation); + const auto& edges = allocationContext.edges; + + // std::cout << "### Global edges:" << "\n"; + // for (const auto& edge : edges) { + // const auto& parent = edge->getParent(); + // const auto& child = edge->getChild(); + // std::cout << "[" << allocationContext.execIndex[parent].second << " - " << allocationContext.execIndex[child].first << "]" + // << edge->name() + // << "\n"; + // } + + // ResolveInOutInPlaceEdges(edges); + + for (auto& edge : edges) edge->init(); + + auto edgeClusters = MemoryControl::formEdgeClusters(edges); - // Allocate memory space for all edges marked with NeedAllocation - AllocateWithReuse(syncNodesInds); + const size_t remainingEdgeClustersCount = AllocateStringsAndConstants(edgeClusters, m_context); - // Check all getters. Should work. 
- for (auto& edge : graphEdges) edge->validate(); + // std::cout << "Edge clusters size: " << edgeClusters.size() << " remaining: " << remainingEdgeClustersCount << "\n"; + + auto memoryRegions = MemoryControl::formMemoryRegions(edgeClusters, + remainingEdgeClustersCount, + allocationContext.execIndex); + + m_outputNodesMemBlocks = MemoryControl::filterOutDynamicOutputEdges(memoryRegions, + edgeClusters, + outputNodesMap); + + memoryControl->insert(memoryRegions, allocationContext.syncPoints); + auto memoryBlocks = memoryControl->solve(); + + attachEdgeToMemoryControl(edgeClusters, memoryBlocks); + memoryControl->allocateMemory(); + resolveInPlaceEdges(edgeClusters); + + for (auto& edge : edges) edge->validate(); } -bool Graph::ProcessDynNodes() { +bool Graph::ProcessDynNodes() const { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::ProcessDynNodes"); const bool containsDynamicNodes = std::any_of(graphNodes.begin(), graphNodes.end(), [](const NodePtr& node) { @@ -1395,14 +1465,6 @@ void Graph::Infer(SyncInferRequest* request) { DEBUG_LOG("Infer graph: ", GetName(), ". 
Status: ", static_cast(status)); const int numaId = GetNumaNodeId(m_context); - if (!m_pMemoryControl) { - OPENVINO_THROW("Memory control unit is not initilized in graph: ", GetName()); - } - - if (!m_pMemoryControl->allocated()) { - m_pMemoryControl->allocateMemory(); - } - switch (status) { case Status::ReadyDynamic: InferDynamic(request, numaId, UpdateNodes(m_executableGraphNodes)); diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index d50ccc152c9186..2e31c9f9243b0c 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -4,17 +4,17 @@ #pragma once +#include "allocation_context.hpp" #include "config.h" #include "cpu_memory.h" #include "nodes/input.h" -#include "openvino/core/node_vector.hpp" #include "openvino/runtime/profiling_info.hpp" #include "node.h" #include "edge.h" #include "graph_context.h" -#include "memory_control.hpp" #include "openvino/runtime/profiling_info.hpp" +#include #include #include #include @@ -31,6 +31,19 @@ namespace node { class MemoryStateNode; } // namespace node +struct MemoryRegion { + int start; // Execution order index of first use. + int finish; // Execution order index of last use. 
-1 means inf + int64_t size; // size in bytes + int64_t id; // ID unique for each region + + enum class RegionType : uint8_t { VARIABLE, CONSTANT, INPUT, OUTPUT, IO } type; + enum class AllocType : uint8_t { POD, STRING, UNKNOWN } alloc_type; +}; + +using MemoryRegions = std::vector; +using OutputMemoryBlocks = std::unordered_map; + class Graph { public: typedef std::shared_ptr Ptr; @@ -75,6 +88,9 @@ class Graph { void PushInputData(const std::size_t& index, const ov::SoPtr& input); void PullOutputData(std::unordered_map>& output); + // @todo pass as part of one of the graph configuration stages + // void SetGlobalExecutionIndex() { + // } // Returns Output nodes memory descriptors VecMemoryDescs getOutputMemoryDescriptors() const; @@ -213,12 +229,26 @@ class Graph { /** * Activate execution graph using \p externalInputMemory and \p externalOutputMemory + * 'globalAllocation' is a temporary flag indicating that the current graph is participating in + * global memory reuse (together with all inner / outer graphs). 
+ * The flag should be dropped after all the nodes with inner graphs participate in + * global memory reuse by default */ void Activate(const std::vector& externalInputMemory = {}, - const std::vector& externalOutputMemory = {}); + const std::vector& externalOutputMemory = {}, + bool globalAllocation = false); + + MemoryRegions RegisterExternalMemory(const std::vector& externalInputMemory = {}, + const std::vector& externalOutputMemory = {}); + + void Allocate(bool globalAllocation = false); + + AllocationContext CreateAllocationContext(bool global); + + int RegisterToAllocationContext(int offset, AllocationContext& context); const std::unordered_map& getOutputNodesMemBlocksMap() const { - return outputNodesMemBlocksMap; + return m_outputNodesMemBlocks; } protected: @@ -256,10 +286,10 @@ class Graph { void InitOptimalPrimitiveDescriptors(); void ResolveEdgeConflicts(); void ResolveComplexInplaceConflicts(); - bool ProcessDynNodes(); - void Allocate(const std::vector& syncNodesInds); - void AllocateWithReuse(const std::vector& syncNodesInds); + bool ProcessDynNodes() const; + void AllocateWithReuse(const std::vector& syncNodesInds, GlobalExecutionIndex globalExecIndex); void CreatePrimitivesAndExecConstants() const; + std::vector CreateExecutionGraph(); /** * Execute a given \p node within \p request using \p numaId @@ -300,7 +330,7 @@ class Graph { std::map inputNodesMap; std::map outputNodesMap; - std::unordered_map outputNodesMemBlocksMap; + OutputMemoryBlocks m_outputNodesMemBlocks; // these node pointers (from graphNodes) are to avoid regular checking for // constantness of nodes in Infer methods and calls of @@ -310,8 +340,6 @@ class Graph { GraphContext::CPtr m_context; dnnl::stream m_stream; - - MemoryControl* m_pMemoryControl = nullptr; }; using GraphPtr = std::shared_ptr; diff --git a/src/plugins/intel_cpu/src/graph_context.cpp b/src/plugins/intel_cpu/src/graph_context.cpp index e200766fa4791c..e4eb13ed58f53f 100644 --- 
a/src/plugins/intel_cpu/src/graph_context.cpp +++ b/src/plugins/intel_cpu/src/graph_context.cpp @@ -1,7 +1,6 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "dnnl_types.h" #include "graph_context.h" #include "nodes/memory.hpp" #include "memory_control.hpp" @@ -12,6 +11,8 @@ namespace intel_cpu { GraphContext::GraphContext(const Config& config, WeightsSharing::Ptr w_cache, bool isGraphQuantized, + MemoryControl* memoryControl, + std::shared_ptr networkMemoryControl, ov::threading::IStreamsExecutor::Ptr streamExecutor, std::shared_ptr sub_memory_manager) : config(config), @@ -20,7 +21,8 @@ GraphContext::GraphContext(const Config& config, streamExecutor(streamExecutor), subMemoryManager(sub_memory_manager), memoryStatesRegister(std::make_shared()), - networkMemoryControl(std::make_shared()) { + memoryControl(memoryControl), + networkMemoryControl(networkMemoryControl) { rtParamsCache = std::make_shared(config.rtCacheCapacity); // primitive/executors can be shared across sub-stream // but scratch pad cannot be shared. 
diff --git a/src/plugins/intel_cpu/src/graph_context.h b/src/plugins/intel_cpu/src/graph_context.h index db2b126213978c..15f947d17788a9 100644 --- a/src/plugins/intel_cpu/src/graph_context.h +++ b/src/plugins/intel_cpu/src/graph_context.h @@ -18,6 +18,7 @@ namespace node { class MemoryStatesRegister; } // namespace node +class MemoryControl; class NetworkMemoryControl; class GraphContext { @@ -28,6 +29,8 @@ class GraphContext { GraphContext(const Config& config, WeightsSharing::Ptr w_cache, bool isGraphQuantized, + MemoryControl* memoryControl, + std::shared_ptr networkMemoryControl, // obsolete in favor of local memoryControl ov::threading::IStreamsExecutor::Ptr streamExecutor = nullptr, std::shared_ptr sub_memory_manager = nullptr); @@ -78,6 +81,10 @@ class GraphContext { return memoryStatesRegister; } + MemoryControl* getMemoryControl() const { + return memoryControl; + } + const std::shared_ptr& getNetworkMemoryControl() const { return networkMemoryControl; } @@ -103,6 +110,10 @@ class GraphContext { int numNumaNodes = 1; std::shared_ptr memoryStatesRegister; + MemoryControl* memoryControl; + // to be removed in favor of local memoryControl + // currently required for the nodes with inner graphs which + // do not participate in global memory reuse std::shared_ptr networkMemoryControl; }; diff --git a/src/plugins/intel_cpu/src/graph_dumper.cpp b/src/plugins/intel_cpu/src/graph_dumper.cpp index fab6e99dcf2550..6bcc46153cbf79 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.cpp +++ b/src/plugins/intel_cpu/src/graph_dumper.cpp @@ -243,7 +243,6 @@ void serializeToXML(const Graph &graph, const std::string& path) { void serializeToCout(const Graph &graph) { for (const auto& node : graph.GetNodes()) { - std::cout << "name: " << node->getName() << " [ "; auto nodeDesc = node->getSelectedPrimitiveDescriptor(); if (nodeDesc) { auto& inConfs = nodeDesc->getConfig().inConfs; diff --git a/src/plugins/intel_cpu/src/infer_request.cpp 
b/src/plugins/intel_cpu/src/infer_request.cpp index f0b817dcda859c..2110d837ab7bc9 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -19,6 +19,7 @@ #include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" #include "openvino/runtime/threading/cpu_message.hpp" +#include "memory_control.hpp" using OvString = ov::element_type_traits::value_type; @@ -135,6 +136,15 @@ void SyncInferRequest::infer() { push_input_data(); + MemoryControl* network_memory_control = m_graph->getGraphContext()->getMemoryControl(); + if (!network_memory_control) { + OPENVINO_THROW("Memory control unit is not initilized for graph: ", m_graph->GetName()); + } + + if (!network_memory_control->allocated()) { + network_memory_control->allocateMemory(); + } + m_graph->Infer(this); throw_if_canceled(); diff --git a/src/plugins/intel_cpu/src/memory_control.cpp b/src/plugins/intel_cpu/src/memory_control.cpp index 0f202c296891c1..3edc3f423b9032 100644 --- a/src/plugins/intel_cpu/src/memory_control.cpp +++ b/src/plugins/intel_cpu/src/memory_control.cpp @@ -4,10 +4,15 @@ #include "memory_control.hpp" +#include +#include #include +#include "edge.h" #include "node.h" #include "openvino/runtime/memory_solver.hpp" +#include "proxy_mem_blk.h" +#include "utils/general_utils.h" namespace ov { namespace intel_cpu { @@ -84,7 +89,7 @@ class MemoryBlockWithRelease : public IMemoryBlockObserver { class IMemoryManager { public: virtual ~IMemoryManager() = default; - virtual void insert(const MemoryRegion& reg) = 0; + virtual void insert(const MemoryRegion& reg, const std::vector& syncInds) = 0; virtual const MemoryControl::MemoryBlockMap& lastSolution() = 0; virtual void allocate() = 0; virtual void release() = 0; @@ -99,7 +104,8 @@ std::shared_ptr makeDnnlMemoryBlock(Args&&... 
args) { class MemoryManagerIO : public IMemoryManager { public: - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { + (void) syncInds; m_blocks.insert({reg.id, makeDnnlMemoryBlock()}); } @@ -120,7 +126,8 @@ class MemoryManagerIO : public IMemoryManager { class MemoryManagerStatic : public IMemoryManager { public: - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { + (void) syncInds; m_boxes.emplace_back(MemorySolver::Box{reg.start, reg.finish, reg.size, reg.id}); } @@ -167,19 +174,18 @@ class MemoryManagerStatic : public IMemoryManager { class MemoryManageNonOverlapingSets : public IMemoryManager { public: - MemoryManageNonOverlapingSets(std::vector syncInds) : m_syncInds(std::move(syncInds)) {} - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { MemorySolver::Box box = {reg.start, reg.finish, reg.size, reg.id}; if (-1 != reg.finish) { //We have to extend the lifespan of tensors that are crossing a sync point border in order to save //the intermediate computation results from possible loss due to the tensor resize auto itr_upper = - std::upper_bound(m_syncInds.begin(), m_syncInds.end(), box.finish, [](int y, int x) { + std::upper_bound(syncInds.begin(), syncInds.end(), box.finish, [](int y, int x) { return y <= x; }); - auto itr_lower = std::lower_bound(m_syncInds.begin(), m_syncInds.end(), box.start); + auto itr_lower = std::lower_bound(syncInds.begin(), syncInds.end(), box.start); if (itr_lower != itr_upper) { // across sections - if (itr_upper == m_syncInds.end()) { + if (itr_upper == syncInds.end()) { box.finish = -1; } else { box.finish = *itr_upper; @@ -242,7 +248,6 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { std::unordered_map> m_internalBlocks; std::vector m_boxes; - std::vector m_syncInds; }; } // namespace 
@@ -256,12 +261,12 @@ class MemoryControl::RegionHandler { : m_cond(std::move(cond)), m_memManager(std::move(memManager)) {} - bool insert(const MemoryRegion& reg) { + bool insert(const MemoryRegion& reg, const std::vector& syncInds) { if (!m_cond(reg)) { return false; } - m_memManager->insert(reg); + m_memManager->insert(reg, syncInds); return true; } @@ -292,9 +297,8 @@ MemoryControl::RegionHandlerPtr buildHandler(F&& f, Args&&... args) { } // namespace -MemoryControl::MemoryControl(std::vector syncInds) { +MemoryControl::MemoryControl() { // init handlers - // handler for dynamic tensors m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { if (reg.size < 0 || MemoryRegion::RegionType::VARIABLE != reg.type || @@ -311,7 +315,7 @@ MemoryControl::MemoryControl(std::vector syncInds) { return false; } return true; - }, std::move(syncInds))); + })); //handler for I/O tensors, so far simply individual blocks m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { @@ -322,22 +326,24 @@ MemoryControl::MemoryControl(std::vector syncInds) { })); } -void MemoryControl::insert(const MemoryRegion& region) { +void MemoryControl::insert(const MemoryRegion& region, const std::vector& syncInds) { for (auto&& handler : m_handlers) { - if (handler->insert(region)) { + if (handler->insert(region, syncInds)) { return; } } OPENVINO_THROW("No suitable hanlder was found for the given memory region"); } -MemoryControl::MemoryBlockMap MemoryControl::insert(const std::vector& regions) { +void MemoryControl::insert(const std::vector& regions, + const std::vector& syncInds) { for (auto&& region : regions) { - insert(region); + insert(region, syncInds); } +} +MemoryControl::MemoryBlockMap MemoryControl::solve() { MemoryControl::MemoryBlockMap blocksMap; - blocksMap.reserve(regions.size()); for (auto&& handler : m_handlers) { auto&& solution = handler->lastSolution(); @@ -364,52 +370,186 @@ void MemoryControl::releaseMemory() { m_allocated = false; } -edgeClusters 
MemoryControl::findEdgeClusters(const std::vector& graphEdges) { - typedef std::unordered_map edge_cluster_idx_map_t; - - edgeClusters edge_clusters; - edge_cluster_idx_map_t edge_cluster_indices; +// /** +// * Forms clusters of edges. +// * An edge cluster is a collection of edges, so: +// * - base edge is an edge with a Memory which other edges point to by means of inplace logic +// * - first edge of a cluster is a base edge with a status either NeedAllocation or Allocated +// * - rest of the edges in a cluster are NotAllocated ones, since they point to their base edge +// */ +// EdgeClusters MemoryControl::flattenEdgeClusters(const EdgeClusters& clusters) { +// } + +/** + * Forms clusters of edges. + * An edge cluster is a collection of edges, so: + * - base edge is an edge with a Memory which other edges point to by means of inplace logic + * - first edge of a cluster is a base edge with a status either NeedAllocation or Allocated + * - rest of the edges in a cluster are NotAllocated ones, since they point to their base edge + */ +EdgeClusters MemoryControl::formEdgeClusters(const std::vector& graphEdges) { + typedef std::unordered_map EdgeClusterIdxMap; + EdgeClusters edgeClusters; + EdgeClusterIdxMap edgeClusterIndices; for (auto& edge : graphEdges) { - auto edge_it = edge_cluster_indices.find(edge); - if (edge_it != edge_cluster_indices.end()) - continue; // edge is visited + if (edgeClusterIndices.count(edge)) + continue; // edge is visited - size_t cluster_idx = edge_clusters.size(); - EdgePtr last_shared_edge = nullptr; + size_t clusterIdx = edgeClusters.size(); + EdgePtr lastSharedEdge = nullptr; // find cluster index for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge; shared_edge = shared_edge->getSharedEdge(std::nothrow)) { - auto shared_edge_it = edge_cluster_indices.find(shared_edge); - if (shared_edge_it != edge_cluster_indices.end()) { - cluster_idx = shared_edge_it->second; - last_shared_edge = shared_edge; + auto 
shared_edge_it = edgeClusterIndices.find(shared_edge); + if (shared_edge_it != edgeClusterIndices.end()) { + clusterIdx = shared_edge_it->second; + lastSharedEdge = shared_edge; break; } } - // add shared edges to cluster - edge_cluster_indices.emplace(edge, cluster_idx); + if (clusterIdx == edgeClusters.size()) + edgeClusters.emplace_back(EdgeCluster{edge}); - if (cluster_idx == edge_clusters.size()) - edge_clusters.emplace_back(edgeCluster{edge}); - else - edge_clusters[cluster_idx].emplace(edge); + // use recursive approach to ensure that the base edge is placed as a first entry of a cluster + std::function addToCluster; + addToCluster = [&addToCluster, &edgeClusterIndices, &clusterIdx, &edgeClusters, &lastSharedEdge](EdgePtr edge) { + if (edge == lastSharedEdge) + return; - for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge != last_shared_edge; - shared_edge = shared_edge->getSharedEdge(std::nothrow)) { - edge_cluster_indices.emplace(shared_edge, cluster_idx); - edge_clusters[cluster_idx].emplace(shared_edge); + addToCluster(edge->getSharedEdge(std::nothrow)); + + edgeClusterIndices.emplace(edge, clusterIdx); + edgeClusters[clusterIdx].push_back(edge); + }; + + addToCluster(edge); + } + + return edgeClusters; +} + +static inline bool isConstOutput(EdgePtr edge) { + return edge->getParent()->isConstant() && !edge->getChild()->isConstant(); +} + +MemoryRegions MemoryControl::formMemoryRegions(const EdgeClusters& clusters, + size_t remaining, + const GlobalExecutionIndex& globalExecIndex) { + // Markup the memory regions + MemoryRegions memoryRegions; + memoryRegions.reserve(remaining); + + for (size_t i = 0; i < remaining; ++i) { + MemoryRegion reg = {std::numeric_limits::max(), + 0, + 0, + static_cast(i), + MemoryRegion::RegionType::VARIABLE, + MemoryRegion::AllocType::UNKNOWN}; + + int64_t boxSize = 0; + bool isConst = false, isOutput = false, isInput = false; + // std::cout << "Form memory region for cluster: " << i << "\n"; + for (auto 
&edge : clusters[i]) { + const auto& parent = edge->getParent(); + const auto& child = edge->getChild(); + + // std::cout << "[" << globalExecIndex.at(parent).second << " - " << globalExecIndex.at(child).first << "]" + // << edge->name() << " status: " << static_cast(edge->getStatus()) + // << "\n"; + + int e_start = globalExecIndex.at(parent).second; + int e_finish = globalExecIndex.at(child).first; + // int e_finish = edge->getChild()->getExecIndex(); + + auto&& desc = edge->getDesc(); + + if (boxSize != -1 && desc.isDefined()) { + int64_t e_size = desc.getCurrentMemSize(); // size in bytes (from the beginning of data to the last element) + boxSize = std::max(e_size, boxSize); + } else { + boxSize = -1; + } + + reg.start = std::min(e_start, reg.start); + reg.finish = std::max(e_finish, reg.finish); + + auto allocType = + desc.getPrecision() == element::string ? MemoryRegion::AllocType::STRING : MemoryRegion::AllocType::POD; + + if (reg.alloc_type != allocType && MemoryRegion::AllocType::UNKNOWN != reg.alloc_type) { + OPENVINO_THROW("Different allocation types in the same memory region"); + } + reg.alloc_type = allocType; + + isConst |= isConstOutput(edge); + isOutput |= child->getType() == Type::Output; + isInput |= parent->getType() == Type::Input; + } + + reg.size = boxSize; + + if (isConst) { + reg.type = MemoryRegion::RegionType::CONSTANT; + } else if (isInput) { + if (isOutput) { + reg.type = MemoryRegion::RegionType::IO; + } else { + reg.type = MemoryRegion::RegionType::INPUT; + } + } else if (isOutput) { + reg.type = MemoryRegion::RegionType::OUTPUT; } + + memoryRegions.push_back(reg); } - return edge_clusters; + return memoryRegions; +} + +OutputMemoryBlocks MemoryControl::filterOutDynamicOutputEdges(MemoryRegions& memoryRegions, + const EdgeClusters& clusters, + const std::map& outputNodes) { + OutputMemoryBlocks outputMemBlocks; + memoryRegions.erase(std::remove_if(memoryRegions.begin(), memoryRegions.end(), [&](const MemoryRegion& region) { + if 
(region.size >= 0 || !one_of(region.type, MemoryRegion::RegionType::OUTPUT, MemoryRegion::RegionType::IO)) { + return false; + } + bool result = false; + for (auto& edge : clusters[region.id]) { + auto child = edge->getChild(); + if (child->getType() == Type::Output && edge->getStatus() == Edge::Status::NeedAllocation) { + auto proxyMemBlock = std::make_shared(); + DEBUG_LOG("ProxyMemoryBlock ", proxyMemBlock); + // std::cout << "Allocating output edge: " << edge->name() << "\n"; + edge->allocate(proxyMemBlock); + + // Store the output memory blocks. + // So that, the infer requests can be able to access them. + int count = 0; + for (auto& output : outputNodes) { + if (output.second == child) { + outputMemBlocks[output.first] = proxyMemBlock; + count++; + } + } + // sometimes there are unused output ports. + OPENVINO_ASSERT(count <= 1, "CPU plugin cannot find output node. count ", count); + result = true; + } + } + return result; + }), memoryRegions.end()); + + return outputMemBlocks; } -MemoryControl& NetworkMemoryControl::createMemoryControlUnit(std::vector syncInds) { - m_controlUnits.emplace_back(std::unique_ptr(new MemoryControl(syncInds))); - return *(m_controlUnits.back()); +MemoryControl* NetworkMemoryControl::createMemoryControlUnit() { + m_controlUnits.emplace_back(std::unique_ptr(new MemoryControl())); + return m_controlUnits.back().get(); } void NetworkMemoryControl::allocateMemory() { @@ -425,4 +565,4 @@ void NetworkMemoryControl::releaseMemory() { } } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_control.hpp b/src/plugins/intel_cpu/src/memory_control.hpp index ce4dc90890f3fa..fc38cf8df2ccb0 100644 --- a/src/plugins/intel_cpu/src/memory_control.hpp +++ b/src/plugins/intel_cpu/src/memory_control.hpp @@ -5,22 +5,15 @@ #pragma once #include "edge.h" +#include "graph.h" +#include "node.h" +#include "proxy_mem_blk.h" namespace ov { namespace intel_cpu { -using 
edgeCluster = std::unordered_set; -using edgeClusters = std::vector; - -struct MemoryRegion { - int start; // Execution order index of first use. - int finish; // Execution order index of last use. -1 means inf - int64_t size; // size in bytes - int64_t id; // ID unique for each region - - enum class RegionType : uint8_t { VARIABLE, CONSTANT, INPUT, OUTPUT, IO } type; - enum class AllocType : uint8_t { POD, STRING, UNKNOWN } alloc_type; -}; +using EdgeCluster = std::vector; +using EdgeClusters = std::vector; class MemoryControl { public: @@ -30,9 +23,16 @@ class MemoryControl { using MemoryBlockMap = std::unordered_map; public: - static edgeClusters findEdgeClusters(const std::vector& graphEdges); + static EdgeClusters formEdgeClusters(const std::vector& graphEdges); + static MemoryRegions formMemoryRegions(const EdgeClusters& clusters, size_t remaining, const GlobalExecutionIndex& globalExecIndex); + static OutputMemoryBlocks filterOutDynamicOutputEdges(MemoryRegions& memoryRegions, + const EdgeClusters& clusters, + const std::map& outputNodes); + + void insert(const MemoryRegions& regions, + const std::vector& syncInds); - MemoryBlockMap insert(const std::vector& regions); + MemoryBlockMap solve(); bool allocated() const { return m_allocated; @@ -42,13 +42,12 @@ class MemoryControl { void releaseMemory(); private: - explicit MemoryControl(std::vector syncInds); - void insert(const MemoryRegion& region); + explicit MemoryControl(); + void insert(const MemoryRegion& region, const std::vector& syncInds); friend class NetworkMemoryControl; private: - std::vector m_syncInds; std::vector m_handlers; bool m_allocated = false; }; @@ -56,7 +55,8 @@ class MemoryControl { class NetworkMemoryControl { public: NetworkMemoryControl() = default; - MemoryControl& createMemoryControlUnit(std::vector syncInds); + // @todo return std::reference_wrapper instead? 
+ MemoryControl* createMemoryControlUnit(); void allocateMemory(); void releaseMemory(); @@ -69,4 +69,4 @@ class NetworkMemoryControl { }; } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 7c23d55fc4147a..2c592a26bac7c9 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -1152,6 +1152,10 @@ bool Node::isConstant() { return getConstantType() == ConstantType::Const; } +bool Node::isConstantInput() { + return isConstant() && getType() == Type::Input; +} + void Node::updateConstantType() { if (constant == ConstantType::StrictNoConst) return; diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 948bd6999ce27a..4b437d94b52f83 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -6,6 +6,7 @@ #include #include +#include "allocation_context.hpp" #include "cpu_memory.h" #include "cpu_shape.h" #include "cpu_types.h" @@ -43,6 +44,7 @@ using NodePtr = std::shared_ptr; using NodeConstPtr = std::shared_ptr; using NodeWeakPtr = std::weak_ptr; + class PortConfigurator { public: PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, ov::element::Type prc, const Shape& shape, @@ -278,6 +280,7 @@ class Node { ConstantType getConstantType() const; void updateConstantType(); bool isConstant(); + bool isConstantInput(); // return type int supports return -1 in overloading when channel axis doesn't exist virtual int getFusingAxis() const { @@ -482,6 +485,25 @@ class Node { int getExecIndex() const { return execIndex; } + // for nodes with subgraphs equals to number of internal nodes (continius) - 1 + // equals to 0 for other nodes + virtual int getExecIndexOffset() const { + return 1; + } + + virtual void updateGlobalFlattenExecIndex(std::unordered_map& globalExecIndexStorage) { + return; + } + + virtual int registerToAllocationContext(int offset, 
AllocationContext& context) { + (void) context; + return offset + 1; + } + + // virtual void registerInAllocationContext(AllocationContext context) { + // (void) context; + // return; + // } const std::string & getTypeStr() const { return typeStr; diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index a1ceabd6942db1..d2d8ee66ad6323 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -4,6 +4,7 @@ #include "composite.h" +#include "compiled_model.h" #include "nodes/input.h" #include "cpu_memory.h" #include "transformations/cpu_opset/common/op/submodel.hpp" @@ -75,23 +76,46 @@ void Composite::selectOptimalPrimitiveDescriptor() { // @todo add ascii diagramm for memory mapping / reuse void Composite::createPrimitive() { - OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), - "Number of node inputs must be equal the number of inner graph's inputs"); + // OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + // "Number of node inputs must be equal the number of inner graph's inputs"); - std::vector inputMemory; - for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - inputMemory.emplace_back(getSrcMemoryAtPort(i)); - } + // std::vector inputMemory; + // for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + // inputMemory.emplace_back(getSrcMemoryAtPort(i)); + // } - OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), - "Number of node outputs must be equal the number of inner graph's outputs"); + // OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + // "Number of node outputs must be equal the number of inner graph's outputs"); - std::vector outputMemory; - for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { - outputMemory.emplace_back(getDstMemoryAtPort(i)); + // std::vector outputMemory; + // for (size_t i = 0; i < 
getOriginalOutputsNumber(); i++) { + // outputMemory.emplace_back(getDstMemoryAtPort(i)); + // } + + // m_graph.Activate(inputMemory, outputMemory); + m_graph.Activate({}, {}, true); +} + +int Composite::registerToAllocationContext(int offset, AllocationContext& context) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto parentEdge = getParentEdgeAt(i); + auto inputEdges = m_graph.GetInputNodesMap().at(i)->getChildEdgesAtPort(0); + for (const auto& inputEdge : inputEdges) { + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", inputEdge->name()); + inputEdge->sharedMemFrom(parentEdge); + } + } + + for (size_t i = 0; i < getChildEdges().size(); i++) { + auto childEdge = getChildEdgeAt(i); + auto outputEdge = m_graph.GetOutputNodesMap().at(i)->getParentEdgeAt(0); + OPENVINO_ASSERT(outputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", outputEdge->name()); + outputEdge->sharedMemFrom(childEdge); } - m_graph.Activate(inputMemory, outputMemory); + return m_graph.RegisterToAllocationContext(offset, context); } void Composite::execute(dnnl::stream) { diff --git a/src/plugins/intel_cpu/src/nodes/composite.h b/src/plugins/intel_cpu/src/nodes/composite.h index 9f18a2ba68b769..a80157f624003b 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.h +++ b/src/plugins/intel_cpu/src/nodes/composite.h @@ -4,7 +4,9 @@ #pragma once +#include #include +#include #include "graph.h" #include "node.h" @@ -41,6 +43,8 @@ class Composite : public Node { void execute(dnnl::stream) override; void executeDynamicImpl(dnnl::stream strm) override; + int registerToAllocationContext(int offset, AllocationContext& context) override; + const Graph& graph() const { return m_graph; } diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 1f650bd8c5de17..5a2b56fac51e07 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ 
b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -543,6 +543,36 @@ void Input::initSupportedPdFromMemDesc() { supportedPrimitiveDescriptors.emplace_back(std::move(config), impl_desc_type::unknown); } +void Input::resolveInPlaceEdges(Edge::LOOK look) { + if (look & Edge::LOOK_UP) { + auto edges = getChildEdgesAtPort(0); + for (const auto& edge : edges) { + EdgePtr sharedEdge = edge; + + while (sharedEdge->getSharedEdge(std::nothrow)) { + sharedEdge = sharedEdge->getSharedEdge(std::nothrow); + } + + // std::cout << edge->name() << " shared edge is: " << sharedEdge->name() << "\n"; + edge->allocate(sharedEdge->getMemoryPtr()->getMemoryBlock()); + } + } + + if (look & Edge::LOOK_DOWN) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto edge = getParentEdgeAt(i); + EdgePtr sharedEdge = edge; + + while (sharedEdge->getSharedEdge(std::nothrow)) { + sharedEdge = sharedEdge->getSharedEdge(std::nothrow); + } + + // std::cout << edge->name() << " shared edge is: " << sharedEdge->name() << "\n"; + edge->allocate(sharedEdge->getMemoryPtr()->getMemoryBlock()); + } + } +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 4d7febb17ad4b7..a943419aed06d1 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -56,6 +56,7 @@ class Input : public Node { void selectOptimalPrimitiveDescriptor() override; void createPrimitive() override; bool created() const override; + void resolveInPlaceEdges(Edge::LOOK look) override; void withMeanImage(); MemoryCPtr getMemoryPtr() const; diff --git a/src/plugins/intel_cpu/src/nodes/lora.cpp b/src/plugins/intel_cpu/src/nodes/lora.cpp index 2c69bc347b6139..3c0ae5087bae3d 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.cpp +++ b/src/plugins/intel_cpu/src/nodes/lora.cpp @@ -80,21 +80,46 @@ void LoRA::selectOptimalPrimitiveDescriptor() { selectPrimitiveDescriptorByIndex(0); } +int 
LoRA::registerToAllocationContext(int offset, AllocationContext& context) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto parentEdge = getParentEdgeAt(i); + auto inputEdges = m_graph.GetInputNodesMap().at(i)->getChildEdgesAtPort(0); + for (const auto& inputEdge : inputEdges) { + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized Edge instead of: ", static_cast(inputEdge->getStatus())); + inputEdge->sharedMemFrom(parentEdge); + } + } + + for (size_t i = 0; i < getChildEdges().size(); i++) { + auto childEdge = getChildEdgeAt(i); + auto outputEdge = m_graph.GetOutputNodesMap().at(i)->getParentEdgeAt(0); + // std::cout << outputEdge->name() << " sharing memory from edge: " << childEdge->name() << "\n"; + outputEdge->sharedMemFrom(childEdge); + // for (const auto& inputEdge : inputEdges) { + // OPENVINO_ASSERT(inputEdge->getStatus() != Edge::Status::Uninitialized, "Expected Uninitialized Edge"); + // inputEdge->sharedMemFrom(parentEdge); + // } + } + return m_graph.RegisterToAllocationContext(offset, context); +} + // @todo add ascii diagram for memory mapping / reuse void LoRA::createPrimitive() { - CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), - "Number of node inputs must be equal the number of inner graph's inputs"); + // CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + // "Number of node inputs must be equal the number of inner graph's inputs"); - std::vector inputMemory; - for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - inputMemory.emplace_back(getSrcMemoryAtPort(i)); - } + // std::vector inputMemory; + // for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + // inputMemory.emplace_back(getSrcMemoryAtPort(i)); + // } - CPU_NODE_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), - "Number of node outputs must be equal the number of inner graph's outputs"); + // 
CPU_NODE_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + // "Number of node outputs must be equal the number of inner graph's outputs"); - std::vector outputMemory{getDstMemoryAtPort(0)}; - m_graph.Activate(inputMemory, outputMemory); + // std::vector outputMemory{getDstMemoryAtPort(0)}; + // m_graph.Activate(inputMemory, outputMemory); + m_graph.Activate({}, {}, true); } void LoRA::execute(dnnl::stream) { diff --git a/src/plugins/intel_cpu/src/nodes/lora.h b/src/plugins/intel_cpu/src/nodes/lora.h index 89a1bc15c2bf17..acba8d949d2e82 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.h +++ b/src/plugins/intel_cpu/src/nodes/lora.h @@ -27,6 +27,7 @@ class LoRA : public Node { void getSupportedDescriptors() override{}; void selectOptimalPrimitiveDescriptor() override; + int registerToAllocationContext(int offset, AllocationContext& context) override; void createPrimitive() override; void execute(dnnl::stream) override; void executeDynamicImpl(dnnl::stream strm) override; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 5c88772eeedabc..e330cad845837c 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -521,7 +521,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr& Config::ModelType modelType = getModelType(model); conf.readProperties(config, modelType); - auto context = std::make_shared(conf, fake_w_cache, false); + auto context = std::make_shared(conf, fake_w_cache, false, nullptr, nullptr); auto supported = ov::get_supported_nodes( model, diff --git a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake index 057869a864d87b..9d7fa9f9d9a365 100644 --- a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake +++ b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake @@ -96,7 +96,8 @@ endif() endfunction() 
if(ENABLE_CPU_SPECIFIC_TARGET_PER_TEST) - create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src ov_cpu_func_subgraph) + # create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src ov_cpu_func_subgraph) + create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/common ov_cpu_func_subgraph) create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/single_layer_tests ov_cpu_func_slt) endif() diff --git a/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp b/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp index a41cb4c4300d42..96733ec115319a 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp @@ -6,6 +6,7 @@ #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/concat.h" #include "nodes/rnn.h" @@ -42,7 +43,11 @@ class InplaceResolveIOCPUTestBase : public ::testing::Test { std::shared_ptr create_graph(const std::vector& input_shapes, const size_t num_consumers = 1) { Config conf; conf.rtCacheCapacity = 100; - const auto context = std::make_shared(conf, nullptr, false); + const auto context = std::make_shared(conf, + nullptr, + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); std::shared_ptr graph = std::shared_ptr(new Graph()); @@ -88,6 +93,7 @@ class InplaceResolveIOCPUTestBase : public ::testing::Test { std::vector nodes; std::vector edges; std::unordered_set nodesSet; + std::shared_ptr networkMemoryControl = std::make_shared(); }; class RNNConcatCPUTest : public InplaceResolveIOCPUTestBase { diff --git a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp index 5b9468ffc35e6f..02a5940965fb6e 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp +++ 
b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp @@ -6,6 +6,7 @@ #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/memory.hpp" #include "nodes/softmax.h" #include "nodes/shapeof.h" @@ -82,7 +83,8 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); @@ -281,7 +283,12 @@ TEST(MemStateGraphTest, smoke_ShapeOf_no_Inplace_Conflicts) { Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, + nullptr, + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); diff --git a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp index 003aca979398fb..71bf2dc340855e 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp @@ -9,6 +9,7 @@ #include "common_test_utils/node_builders/constant.hpp" #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/reorder.h" #include "nodes/reshape.h" @@ -76,7 +77,7 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface(conf, nullptr, false); + m_context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), 
networkMemoryControl); const auto replication_result = CreateModelAndReplicate(shape, params.firstNodeLayout, params.firstNodeInplaceDirection, @@ -173,6 +174,7 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface m_context; std::unique_ptr m_graph; + std::shared_ptr networkMemoryControl = std::make_shared(); }; // class MergeTransposeReorderCPUTest /* @@ -335,7 +337,8 @@ TEST(MergeTransposeReorder, smoke_InplaceConflict) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); std::unique_ptr graph = std::unique_ptr(new Graph()); diff --git a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp index b44194a3d5806c..8e510f31f8066c 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp @@ -5,6 +5,7 @@ #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/concat.h" #include "openvino/op/concat.hpp" @@ -43,7 +44,12 @@ TEST(ResolveEdgeConflictsCPUTest, smoke_Run_ResolveEdgeConflicts) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, + nullptr, + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); const dnnl::engine cpuEngine = context->getEngine(); std::unique_ptr graph = std::unique_ptr(new Graph()); @@ -104,7 +110,8 @@ TEST(ResolveEdgeConflictsCPUTest2, smoke_Run_ResolveEdgeConflicts2) { */ Config conf; conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, 
nullptr, false); + std::shared_ptr networkMemoryControl = std::make_shared(); + auto context = std::make_shared(conf, nullptr, false, networkMemoryControl->createMemoryControlUnit(), networkMemoryControl); std::unique_ptr graph = std::unique_ptr(new Graph()); diff --git a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp index ea2994759e7036..63a44f5bea7075 100644 --- a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp @@ -14,6 +14,7 @@ #include #include "common_test_utils/common_utils.hpp" +#include "memory_control.hpp" #include "nodes/input.h" using namespace ov::intel_cpu; @@ -108,7 +109,9 @@ class ReorderCPUTestGraph { conf.rtCacheCapacity = 100; auto context = std::make_shared(conf, std::make_shared(), - false); + false, + networkMemoryControl->createMemoryControlUnit(), + networkMemoryControl); const dnnl::engine cpuEngine = context->getEngine(); inputNode = std::make_shared(inputDesc.clone(), @@ -152,6 +155,7 @@ class ReorderCPUTestGraph { std::shared_ptr parentEdge; std::shared_ptr childEdge; ov::element::Type prec; + std::shared_ptr networkMemoryControl = std::make_shared(); }; }// namespace ReorderCPUTest