Skip to content

Commit

Permalink
[NVIDIA] Add Multi-graph Feature to NVIDIA Plugin (#710)
Browse files Browse the repository at this point in the history
* [NVIDIA] Change CudaGraphTopologyRunner/SubGraph to aggregation from inheritance

* [NVIDIA] Add new constructor to SubGraph

* [NVIDIA] Update CudaGraphContext to use vectors

* [NVIDIA] Extract inputs/outputs to TensorMappingContext

* [NVIDIA] Update SubGraph to use direct exec_sequence_ and std::shared_ptr<MemoryManager>

* [NVIDIA] Add CudaGraphInfo and update CudaGraphContext to use them

* [NVIDIA] Fix single-graph tests

* [NVIDIA] Add CudaMultiGraphTest test

* [NVIDIA] Add SubGraph::IsCudaGraphCompatible() cache

* [NVIDIA] Add execute_sequence/capture_sequence member functions to Profiler

* [NVIDIA] Enable TensorIterator to use Profiler::execute_sequence()

* [NVIDIA] Enable SubGraph to use Profiler::execute/capture_sequence()

* [NVIDIA] Extract ITopologyRunner into the separate header

* [NVIDIA] Update tests to include cuda_eager_topology_runner.hpp

* [NVIDIA] Add IExecutionDelegator

* [NVIDIA] Add cuda_perf_counts.hpp

* [NVIDIA] Add SimpleExecutionDelegator class and use it when profiling is not needed

* [NVIDIA] Update tests to use SimpleExecutionDelegator

* [NVIDIA] Add updateExecSequence() to TensorIteratorOp

* [NVIDIA] Update TensorIteratorOp::IsCudaGraphCompatible() to use SubGraph implementation

* [NVIDIA] Add rebase fixes

* [NVIDIA] Add comment fixes

* [NVIDIA] Rename functions to correspond to OV coding style

* [NVIDIA] Add number_of_cuda_graphs property

* [NVIDIA] Fix and update SimpleExecutionDelegator

* [NVIDIA] Fix build error on some configurations

* [NVIDIA] Temporarily disable CUDA graph compatibility for TensorIterator
  • Loading branch information
apavliuk55 authored Sep 1, 2023
1 parent ebaf9dd commit 394a8cf
Show file tree
Hide file tree
Showing 48 changed files with 1,291 additions and 411 deletions.
3 changes: 3 additions & 0 deletions modules/nvidia_plugin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ Please refer to OpenVINO documentation for details.

All parameters must be set before calling `ov::Core::compile_model()` in order to take effect.

### Plugin specific properties
* `ov::nvidia_gpu::number_of_cuda_graphs` - Read-only property showing the number of CUDA Graphs used for the current model

## Compile options

During compilation of the openvino_nvidia_gpu_plugin, user could specify the following options:
Expand Down
7 changes: 6 additions & 1 deletion modules/nvidia_plugin/include/nvidia/properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@ static constexpr Property<bool, PropertyMutability::RW> operation_benchmark{"NVI
/**
* @brief Specifies if NVIDIA plugin attempts to use CUDA Graph feature to speed up sequential network inferences
*/
static constexpr ov::Property<bool, ov::PropertyMutability::RW> use_cuda_graph{"NVIDIA_USE_CUDA_GRAPH"};
static constexpr Property<bool, PropertyMutability::RW> use_cuda_graph{"NVIDIA_USE_CUDA_GRAPH"};

/**
* @brief Read-only property showing the number of CUDA Graphs used
*/
static constexpr Property<size_t, PropertyMutability::RO> number_of_cuda_graphs{"NVIDIA_NUMBER_OF_CUDA_GRAPHS"};

} // namespace nvidia_gpu
} // namespace ov
21 changes: 14 additions & 7 deletions modules/nvidia_plugin/src/cuda/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,22 @@ cudaGraph_t Graph::createNativeWithFlags(unsigned int flags) {
return g;
}

// Two Graph handles are equal iff they wrap the same native cudaGraph_t.
// (Parameter names fixed: lhs/rhs were swapped relative to their positions.)
bool operator==(const Graph &lhs, const Graph &rhs) { return lhs.get() == rhs.get(); }

GraphExec::GraphExec(const Graph &g)
#if !defined(NDEBUG) || defined(_DEBUG)
try
try
#endif
:
Handle(cudaGraphInstantiate, cudaGraphExecDestroy, g.get(), static_cast<cudaGraphNode_t*>(nullptr),
: Handle(cudaGraphInstantiate,
cudaGraphExecDestroy,
g.get(),
static_cast<cudaGraphNode_t *>(nullptr),
#if !defined(NDEBUG) || defined(_DEBUG)
errorMsg_, kErrorStringLen)
errorMsg_,
kErrorStringLen)
#else
static_cast<char*>(nullptr), static_cast<size_t>(0ul))
static_cast<char *>(nullptr),
static_cast<size_t>(0ul))
#endif
{
}
Expand All @@ -56,8 +62,9 @@ void GraphExec::launch(const Stream &stream) const {
throwIfError(cudaGraphLaunch(get(), stream.get()));
}

GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture &graphCapture) :
graphCapture_ { graphCapture } {
// Two GraphExec handles are equal iff they wrap the same native cudaGraphExec_t.
// Comparison order normalized (lhs first) to match the parameter list.
bool operator==(const GraphExec &lhs, const GraphExec &rhs) { return lhs.get() == rhs.get(); }

GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture &graphCapture) : graphCapture_{graphCapture} {
throwIfError(cudaStreamBeginCapture(graphCapture_.stream_.get(), cudaStreamCaptureModeThreadLocal));
}

Expand Down
6 changes: 6 additions & 0 deletions modules/nvidia_plugin/src/cuda/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class Graph: public Handle<cudaGraph_t> {
public:
Graph(unsigned int flags);

friend bool operator==(const Graph& lhs, const Graph& rhs);

friend GraphCapture;

private:
Expand All @@ -26,6 +28,7 @@ class Graph: public Handle<cudaGraph_t> {
static cudaGraph_t createNativeWithFlags(unsigned int flags);
};

bool operator==(const Graph& rhs, const Graph& lhs);

class GraphExec: public Handle<cudaGraphExec_t> {
public:
Expand All @@ -35,13 +38,16 @@ class GraphExec: public Handle<cudaGraphExec_t> {

void launch(const Stream& stream) const;

friend bool operator==(const GraphExec& lhs, const GraphExec& rhs);

#if !defined(NDEBUG) || defined(_DEBUG)
private:
static constexpr std::size_t kErrorStringLen = 1024;
char errorMsg_[kErrorStringLen];
#endif
};

bool operator==(const GraphExec& lhs, const GraphExec& rhs);

class GraphCapture {
public:
Expand Down
21 changes: 12 additions & 9 deletions modules/nvidia_plugin/src/cuda_compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
#include <utility>

#include "cuda_compiled_model.hpp"
#include "cuda_eager_topology_runner.hpp"
#include "cuda_graph_topology_runner.hpp"
#include "cuda_itt.hpp"
#include "cuda_operation_registry.hpp"
#include "cuda_perf_counts.hpp"
#include "cuda_plugin.hpp"
#include "memory_manager/cuda_immutable_memory_block_builder.hpp"
#include "memory_manager/cuda_memory_manager.hpp"
Expand Down Expand Up @@ -55,7 +57,8 @@ CompiledModel::CompiledModel(const std::shared_ptr<const ov::Model>& model,
cuda_stream_executor_(std::move(wait_executor)),
loaded_from_cache_(loaded_from_cache),
use_cuda_graph_{get_property(ov::nvidia_gpu::use_cuda_graph.name()).as<bool>() &&
!get_property(ov::enable_profiling.name()).as<bool>()} {
!get_property(ov::enable_profiling.name()).as<bool>()},
number_of_cuda_graphs_{0} {
try {
compile_model(model);
init_executor(); // creates thread-based executor using for async requests
Expand Down Expand Up @@ -129,13 +132,9 @@ void CompiledModel::compile_model(const std::shared_ptr<const ov::Model>& model)
const auto creationContext = CreationContext{device, opBenchOption};

if (use_cuda_graph_) {
try {
topology_runner_ = std::make_unique<CudaGraphTopologyRunner>(creationContext, model_);
// TODO: Add CudaGraphTopologyRunner validation
} catch (const CudaGraphTopologyRunner::CudaGraphIncompatible&) {
topology_runner_ = std::make_unique<EagerTopologyRunner>(creationContext, model_);
use_cuda_graph_ = false;
}
auto cudaGraphTopologyRunner = std::make_unique<CudaGraphTopologyRunner>(creationContext, model_);
number_of_cuda_graphs_ = cudaGraphTopologyRunner->GetCudaGraphsCount();
topology_runner_ = std::move(cudaGraphTopologyRunner);
} else {
topology_runner_ = std::make_unique<EagerTopologyRunner>(creationContext, model_);
}
Expand Down Expand Up @@ -256,7 +255,7 @@ size_t CompiledModel::get_optimal_number_of_streams(size_t const_blob_size,
}

std::shared_ptr<MemoryPool> CompiledModel::create_memory_pool() {
const auto& memory_manager = topology_runner_->GetSubGraph().memoryManager();
const auto& memory_manager = *(topology_runner_->GetSubGraph().memoryManager());
const auto const_blob_size = memory_manager.immutableTensors().memoryModel()->deviceMemoryBlockSize();
const auto immutable_work_buffers_size = memory_manager.immutableWorkbuffers().memoryModel()->deviceMemoryBlockSize();
const auto& memory_model = memory_manager.mutableTensorsMemoryModel();
Expand Down Expand Up @@ -306,6 +305,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
supported_properties.push_back(
ov::PropertyName(ov::optimal_number_of_infer_requests.name(), PropertyMutability::RO));
supported_properties.push_back(ov::PropertyName(ov::loaded_from_cache.name(), PropertyMutability::RO));
supported_properties.push_back(ov::PropertyName(ov::nvidia_gpu::number_of_cuda_graphs.name(),
PropertyMutability::RO));
auto rw_properties = config_.get_rw_properties();
for (auto& rw_property : rw_properties)
supported_properties.emplace_back(ov::PropertyName(rw_property, PropertyMutability::RO));
Expand Down Expand Up @@ -333,6 +334,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
return decltype(ov::execution_devices)::value_type{get_plugin()->get_device_name() + "." + std::to_string(config_.get_device_id())};
} else if (ov::loaded_from_cache == name) {
return decltype(ov::loaded_from_cache)::value_type{loaded_from_cache_};
} else if (ov::nvidia_gpu::number_of_cuda_graphs == name) {
return decltype(ov::nvidia_gpu::number_of_cuda_graphs)::value_type{number_of_cuda_graphs_};
} else {
return config_.get(name);
}
Expand Down
8 changes: 4 additions & 4 deletions modules/nvidia_plugin/src/cuda_compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,17 @@

#pragma once

#include "openvino/runtime/icompiled_model.hpp"
#include "openvino/runtime/threading/itask_executor.hpp"

#include "cuda_async_infer_request.hpp"
#include "cuda_config.hpp"
#include "cuda_eager_topology_runner.hpp"
#include "cuda_infer_request.hpp"
#include "cuda_itopology_runner.hpp"
#include "cuda_op_buffers_extractor.hpp"
#include "memory_manager/cuda_device_mem_block.hpp"
#include "memory_manager/cuda_memory_manager.hpp"
#include "memory_manager/cuda_memory_pool.hpp"
#include "memory_manager/model/cuda_memory_model.hpp"
#include "openvino/runtime/icompiled_model.hpp"
#include "openvino/runtime/threading/itask_executor.hpp"
#include "ops/subgraph.hpp"

namespace ov {
Expand Down Expand Up @@ -78,6 +77,7 @@ class CompiledModel : public ov::ICompiledModel {
std::shared_ptr<MemoryPool> memory_pool_;
const bool loaded_from_cache_;
bool use_cuda_graph_;
size_t number_of_cuda_graphs_;
};

} // namespace nvidia_gpu
Expand Down
10 changes: 2 additions & 8 deletions modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp
Original file line number Diff line number Diff line change
@@ -1,21 +1,15 @@
// Copyright (C) 2018-2021 Intel Corporation
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ops/subgraph.hpp>
#include "cuda_itopology_runner.hpp"

namespace ov {
namespace nvidia_gpu {

struct ITopologyRunner {
virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
virtual const SubGraph& GetSubGraph() const = 0;
virtual ~ITopologyRunner() = default;
};

class EagerTopologyRunner final : public SubGraph, public ITopologyRunner {
public:
EagerTopologyRunner(const CreationContext& context, const std::shared_ptr<const ov::Model>& model);
Expand Down
132 changes: 132 additions & 0 deletions modules/nvidia_plugin/src/cuda_graph_context.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "cuda_graph_context.hpp"

namespace ov {
namespace nvidia_gpu {

// Drops every captured graph and rewinds the graph cursor to the start,
// returning the context to its freshly-constructed state.
void CudaGraphContext::reset() {
    currentGraphIndex_ = 0;
    graphs_.clear();
}

// Opens a slot for the next graph to be captured; after the call the cursor
// points at the freshly added (still empty) CudaGraphInfo entry.
void CudaGraphContext::start_next_graph_addition() {
    graphs_.emplace_back();
    currentGraphIndex_ = graphs_.size() - 1;
}

// Registers a host->device upload node for the input tensor `tensorName`
// in the graph slot currently being captured.
// Fixes the typo in the assertion message ("incosistency" -> "inconsistency").
void CudaGraphContext::add_parameter(const std::string& tensorName,
                                     const CUDA::Stream& stream,
                                     CUDA::DevicePointer<void*> dst,
                                     const void* src,
                                     std::size_t size) {
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size);
}

// Registers a device->host download node for the output tensor `tensorName`
// in the graph slot currently being captured.
// Fixes the typo in the assertion message ("incosistency" -> "inconsistency").
void CudaGraphContext::add_result(const std::string& tensorName,
                                  const CUDA::Stream& stream,
                                  void* dst,
                                  CUDA::DevicePointer<const void*> src,
                                  std::size_t size) {
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size);
}

// Stores the captured graph in the current slot (which also instantiates an
// executable version of it — see CudaGraphInfo::set_graph).
// Fixes the typo in the assertion message ("incosistency" -> "inconsistency").
void CudaGraphContext::add_graph(const CUDA::Graph& graph) {
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].set_graph(graph);
}

// The context counts as initialized once at least one graph slot exists and
// the most recently added slot holds an instantiated graph.
bool CudaGraphContext::is_initialized() const {
    return !graphs_.empty() && graphs_.back().is_initialized();
}

// Re-binds the parameter/result copy nodes of every captured graph to the
// tensor addresses currently mapped in `context`.
// NOTE: intentionally uses the member currentGraphIndex_ as the loop counter,
// so it equals graphs_.size() after the call returns.
void CudaGraphContext::update_capture(const TensorMappingContext& context) {
for (currentGraphIndex_ = 0; currentGraphIndex_ < graphs_.size(); ++currentGraphIndex_) {
graphs_[currentGraphIndex_].update_capture(context);
}
}

// Launches the captured graph with the given index on `stream`, remembering
// `index` in currentGraphIndex_ (a mutable member — hence const here).
// Fixes the typo in the assertion message ("incosistency" -> "inconsistency").
void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const {
    currentGraphIndex_ = index;
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].launch(stream);
}

std::size_t CudaGraphContext::get_params_count() const {
std::size_t res = 0;
for (const auto& graph : graphs_) {
res += graph.get_params_count();
}
return res;
}

std::size_t CudaGraphContext::get_results_count() const {
std::size_t res = 0;
for (const auto& graph : graphs_) {
res += graph.get_results_count();
}
return res;
}

// Number of CUDA graphs captured for the current model (one per slot).
std::size_t CudaGraphContext::get_graphs_count() const { return graphs_.size(); }

// Captures a host->device upload node for an input tensor and stores it under
// the tensor's name so its source pointer can be patched later.
void CudaGraphContext::CudaGraphInfo::add_parameter(const std::string& tensorName,
                                                    const CUDA::Stream& stream,
                                                    CUDA::DevicePointer<void*> dst,
                                                    const void* src,
                                                    std::size_t size) {
    CUDA::CaptureInfo info{stream};
    parameterNodes_.emplace(tensorName, info.addUploadNode(dst, src, size));
}

// Captures a device->host download node for an output tensor and stores it
// under the tensor's name so its destination pointer can be patched later.
void CudaGraphContext::CudaGraphInfo::add_result(const std::string& tensorName,
                                                 const CUDA::Stream& stream,
                                                 void* dst,
                                                 CUDA::DevicePointer<const void*> src,
                                                 std::size_t size) {
    CUDA::CaptureInfo info{stream};
    resultNodes_.emplace(tensorName, info.addDownloadNode(dst, src, size));
}

// Stores the captured graph and instantiates an executable version of it.
void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) {
graph_.emplace(graph);
graphExec_.emplace(graph);
}

// Initialized once both the captured graph and its executable instance exist.
bool CudaGraphContext::CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); }

// Patches the recorded upload/download nodes so they point at the tensor
// buffers currently mapped in `context`. value() throws if set_graph() was
// never called.
void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext& context) {
    for (auto&& [name, uploadNode] : parameterNodes_) {
        uploadNode.update_src(graphExec_.value(), context.get_input_tensor(name)->data());
    }
    for (auto&& [name, downloadNode] : resultNodes_) {
        downloadNode.update_dst(graphExec_.value(), context.get_output_tensor(name)->data());
    }
}

// Launches the instantiated graph on `stream`; std::optional::value() throws
// bad_optional_access if set_graph() has not been called yet.
void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); }

// Number of captured upload (parameter) nodes.
std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); }

// Number of captured download (result) nodes.
std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return resultNodes_.size(); }

// Deep equality for CudaGraphInfo: compares the graph handles and the
// recorded parameter/result node maps.
bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ &&
lhs.resultNodes_ == rhs.resultNodes_;
}

bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
return !(lhs == rhs);
}

// Two contexts are equal when their per-graph infos are equal element-wise.
bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return lhs.graphs_ == rhs.graphs_; }

bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return !(lhs == rhs); }

} // namespace nvidia_gpu
} // namespace ov
Loading

0 comments on commit 394a8cf

Please sign in to comment.