-
Notifications
You must be signed in to change notification settings - Fork 144
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[NVIDIA] Add Multi-graph Feature to NVIDIA Plugin (#710)
* [NVIDIA] Change CudaGraphTopologyRunner/SubGraph to aggregation from inheritance
* [NVIDIA] Add new constructor to SubGraph
* [NVIDIA] Update CudaGraphContext to use vectors
* [NVIDIA] Extract inputs/outputs to TensorMappingContext
* [NVIDIA] Update SubGraph to use direct exec_sequence_ and std::shared_ptr<MemoryManager>
* [NVIDIA] Add CudaGraphInfo and update CudaGraphContext to use them
* [NVIDIA] Fix single-graph tests
* [NVIDIA] Add CudaMultiGraphTest test
* [NVIDIA] Add SubGraph::IsCudaGraphCompatible() cache
* [NVIDIA] Add execute_sequence/capture_sequence member functions to Profiler
* [NVIDIA] Enable TensorIterator to use Profiler::execute_sequence()
* [NVIDIA] Enable SubGraph to use Profiler::execute/capture_sequence()
* [NVIDIA] Extract ITopologyRunner into a separate header
* [NVIDIA] Update tests to include cuda_eager_topology_runner.hpp
* [NVIDIA] Add IExecutionDelegator
* [NVIDIA] Add cuda_perf_counts.hpp
* [NVIDIA] Add SimpleExecutionDelegator class and use it when profiling is not needed
* [NVIDIA] Update tests to use SimpleExecutionDelegator
* [NVIDIA] Add updateExecSequence() to TensorIteratorOp
* [NVIDIA] Update TensorIteratorOp::IsCudaGraphCompatible() to use SubGraph implementation
* [NVIDIA] Add rebase fixes
* [NVIDIA] Add comment fixes
* [NVIDIA] Rename functions to correspond to OV coding style
* [NVIDIA] Add number_of_cuda_graphs property
* [NVIDIA] Fix and update SimpleExecutionDelegator
* [NVIDIA] Fix build error on some configurations
* [NVIDIA] Temporarily disable CUDA graph compatibility for TensorIterator
- Loading branch information
1 parent
ebaf9dd
commit 394a8cf
Showing 48 changed files with 1,291 additions and 411 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
// Copyright (C) 2018-2023 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "cuda_graph_context.hpp" | ||
|
||
namespace ov { | ||
namespace nvidia_gpu { | ||
|
||
// Discards every captured graph and rewinds the current index to the first
// slot, returning the context to its freshly-constructed state.
void CudaGraphContext::reset() {
    currentGraphIndex_ = 0;
    graphs_.clear();
}
|
||
// Appends a new, empty CudaGraphInfo and points the current index at it so
// that subsequent add_parameter()/add_result()/add_graph() calls target the
// fresh entry.
void CudaGraphContext::start_next_graph_addition() {
    const auto freshIndex = graphs_.size();
    graphs_.emplace_back();
    currentGraphIndex_ = freshIndex;
}
|
||
void CudaGraphContext::add_parameter(const std::string& tensorName, | ||
const CUDA::Stream& stream, | ||
CUDA::DevicePointer<void*> dst, | ||
const void* src, | ||
std::size_t size) { | ||
OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); | ||
graphs_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size); | ||
} | ||
|
||
void CudaGraphContext::add_result(const std::string& tensorName, | ||
const CUDA::Stream& stream, | ||
void* dst, | ||
CUDA::DevicePointer<const void*> src, | ||
std::size_t size) { | ||
OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size incosistency"); | ||
graphs_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size); | ||
} | ||
|
||
// Stores the fully captured CUDA graph into the current CudaGraphInfo slot
// (which also instantiates its executable form — see CudaGraphInfo::set_graph).
// Fix: corrected the "incosistency" typo in the assertion message.
void CudaGraphContext::add_graph(const CUDA::Graph& graph) {
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].set_graph(graph);
}
|
||
bool CudaGraphContext::is_initialized() const { | ||
const auto size = graphs_.size(); | ||
return size != 0 && graphs_[size - 1].is_initialized(); | ||
} | ||
|
||
// Walks every captured graph and re-binds its memcpy nodes to the tensor
// addresses currently held in `context`. Note that currentGraphIndex_ is left
// equal to graphs_.size() afterwards, matching the original loop's exit state.
void CudaGraphContext::update_capture(const TensorMappingContext& context) {
    currentGraphIndex_ = 0;
    while (currentGraphIndex_ < graphs_.size()) {
        graphs_[currentGraphIndex_].update_capture(context);
        ++currentGraphIndex_;
    }
}
|
||
// Launches the graph stored at `index` on `stream`. currentGraphIndex_ is
// updated first (it is mutable, hence assignable in this const member).
// Fix: corrected the "incosistency" typo in the assertion message.
void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const {
    currentGraphIndex_ = index;
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].launch(stream);
}
|
||
std::size_t CudaGraphContext::get_params_count() const { | ||
std::size_t res = 0; | ||
for (const auto& graph : graphs_) { | ||
res += graph.get_params_count(); | ||
} | ||
return res; | ||
} | ||
|
||
std::size_t CudaGraphContext::get_results_count() const { | ||
std::size_t res = 0; | ||
for (const auto& graph : graphs_) { | ||
res += graph.get_results_count(); | ||
} | ||
return res; | ||
} | ||
|
||
std::size_t CudaGraphContext::get_graphs_count() const { return graphs_.size(); } | ||
|
||
void CudaGraphContext::CudaGraphInfo::add_parameter(const std::string& tensorName, | ||
const CUDA::Stream& stream, | ||
CUDA::DevicePointer<void*> dst, | ||
const void* src, | ||
std::size_t size) { | ||
CUDA::CaptureInfo captureInfo{stream}; | ||
parameterNodes_.emplace(tensorName, captureInfo.addUploadNode(dst, src, size)); | ||
} | ||
|
||
void CudaGraphContext::CudaGraphInfo::add_result(const std::string& tensorName, | ||
const CUDA::Stream& stream, | ||
void* dst, | ||
CUDA::DevicePointer<const void*> src, | ||
std::size_t size) { | ||
CUDA::CaptureInfo captureInfo{stream}; | ||
resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size)); | ||
} | ||
|
||
// Stores a copy of the captured graph and builds its executable instance;
// once both optionals are engaged, is_initialized() reports true.
void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) {
    graphExec_.emplace(graph);
    graph_.emplace(graph);
}
|
||
bool CudaGraphContext::CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); } | ||
|
||
void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext& context) { | ||
for (auto&& [tensorName, node] : parameterNodes_) { | ||
node.update_src(graphExec_.value(), (context.get_input_tensor(tensorName)->data())); | ||
} | ||
for (auto&& [tensorName, node] : resultNodes_) { | ||
node.update_dst(graphExec_.value(), context.get_output_tensor(tensorName)->data()); | ||
} | ||
} | ||
|
||
void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); } | ||
|
||
std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); } | ||
|
||
std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return resultNodes_.size(); } | ||
|
||
// Two infos are equal when the captured graph/exec pair and all recorded
// upload/download node maps compare equal (== only; comparisons are pure).
bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
    const bool sameGraphs = (lhs.graph_ == rhs.graph_) && (lhs.graphExec_ == rhs.graphExec_);
    const bool sameNodes = (lhs.parameterNodes_ == rhs.parameterNodes_) && (lhs.resultNodes_ == rhs.resultNodes_);
    return sameGraphs && sameNodes;
}
|
||
// Inequality is defined as the negation of operator== to keep the pair in sync.
bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
    return !(lhs == rhs);
}
|
||
bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return lhs.graphs_ == rhs.graphs_; } | ||
|
||
bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return !(lhs == rhs); } | ||
|
||
} // namespace nvidia_gpu | ||
} // namespace ov |
Oops, something went wrong.