diff --git a/modules/nvidia_plugin/README.md b/modules/nvidia_plugin/README.md
index f390ce8c1..6d1cc9356 100644
--- a/modules/nvidia_plugin/README.md
+++ b/modules/nvidia_plugin/README.md
@@ -175,6 +175,7 @@ Please refer to OpenVINO documentation for details.
 ### Plugin specific parameters
 
 * `ov::nvidia_gpu::operation_benchmark` - specifies if operation level benchmark should be run for increasing performance of network (`false` by default)
+* `ov::nvidia_gpu::use_cuda_graph` - specifies if NVIDIA plugin attempts to use CUDA Graph feature to speed up sequential network inferences (`true` by default)
 
 All parameters must be set before calling `ov::Core::compile_model()` in order to take effect.
 
diff --git a/modules/nvidia_plugin/include/nvidia/properties.hpp b/modules/nvidia_plugin/include/nvidia/properties.hpp
index d01d0729c..685843e82 100644
--- a/modules/nvidia_plugin/include/nvidia/properties.hpp
+++ b/modules/nvidia_plugin/include/nvidia/properties.hpp
@@ -24,5 +24,10 @@ namespace nvidia_gpu {
  */
 static constexpr Property<bool> operation_benchmark{"NVIDIA_OPERATION_BENCHMARK"};
 
+/**
+ * @brief Specifies if NVIDIA plugin attempts to use CUDA Graph feature to speed up sequential network inferences
+ */
+static constexpr ov::Property<bool> use_cuda_graph{"NVIDIA_USE_CUDA_GRAPH"};
+
 }  // namespace nvidia_gpu
 }  // namespace ov
diff --git a/modules/nvidia_plugin/src/cuda_compiled_model.cpp b/modules/nvidia_plugin/src/cuda_compiled_model.cpp
index 612636716..250ec4e5e 100644
--- a/modules/nvidia_plugin/src/cuda_compiled_model.cpp
+++ b/modules/nvidia_plugin/src/cuda_compiled_model.cpp
@@ -53,8 +53,8 @@ CompiledModel::CompiledModel(const std::shared_ptr<const ov::Model>& model,
       config_(std::move(cfg)),
       cuda_stream_executor_(std::move(wait_executor)),
       loaded_from_cache_(loaded_from_cache),
-      use_cuda_graph_{get_property(ov::nvidia_gpu::internal::use_cuda_graph.name()).as<bool>() &&
-                      !get_property(ov::enable_profiling.name()).as<bool>()} {
+      use_cuda_graph_{get_property(ov::nvidia_gpu::use_cuda_graph.name()).as<bool>() &&
+                      !get_property(ov::enable_profiling.name()).as<bool>()} {
     try {
         compile_model(model);
         init_executor();  // creates thread-based executor using for async requests
diff --git a/modules/nvidia_plugin/src/cuda_config.cpp b/modules/nvidia_plugin/src/cuda_config.cpp
index 1cb9cebe4..6fb3d0169 100644
--- a/modules/nvidia_plugin/src/cuda_config.cpp
+++ b/modules/nvidia_plugin/src/cuda_config.cpp
@@ -42,6 +42,7 @@ std::vector<ov::PropertyName> Configuration::get_rw_properties() {
         ov::PropertyName{ov::hint::execution_mode.name(), ov::PropertyMutability::RW},
         ov::PropertyName{ov::enable_profiling.name(), ov::PropertyMutability::RW},
         ov::PropertyName{ov::nvidia_gpu::operation_benchmark.name(), ov::PropertyMutability::RW},
+        ov::PropertyName{ov::nvidia_gpu::use_cuda_graph.name(), ov::PropertyMutability::RW},
     };
     return rw_properties;
 }
@@ -166,7 +167,7 @@ Configuration::Configuration(const ov::AnyMap& config, const Configuration& defa
             streams_executor_config_.set_property(key, value);
         } else if (ov::nvidia_gpu::operation_benchmark == key) {
             operation_benchmark = value.as<bool>();
-        } else if (internal::use_cuda_graph == key) {
+        } else if (ov::nvidia_gpu::use_cuda_graph == key) {
             use_cuda_graph = value.as<bool>();
         } else if (ov::enable_profiling == key) {
             is_profiling_enabled = value.as<bool>();
@@ -200,7 +201,7 @@ ov::Any Configuration::get(const std::string& name) const {
         return is_profiling_enabled;
     } else if (name == ov::nvidia_gpu::operation_benchmark) {
         return operation_benchmark;
-    } else if (name == internal::use_cuda_graph) {
+    } else if (name == ov::nvidia_gpu::use_cuda_graph) {
        return use_cuda_graph;
     } else if (name == ov::num_streams) {
        return (num_streams == 0) ?
diff --git a/modules/nvidia_plugin/src/cuda_config.hpp b/modules/nvidia_plugin/src/cuda_config.hpp
index 4c6e36956..21b00893c 100644
--- a/modules/nvidia_plugin/src/cuda_config.hpp
+++ b/modules/nvidia_plugin/src/cuda_config.hpp
@@ -15,13 +15,6 @@
 namespace ov {
 namespace nvidia_gpu {
 
-namespace internal {
-/**
- * @brief Defines if NVIDIA Plugin should use CUDA graphs for performance acceleration
- */
-static constexpr ov::Property<bool> use_cuda_graph{"NVIDIA_USE_CUDA_GRAPH"};
-
-}  // namespace internal
 
 struct Configuration {
     using Ptr = std::shared_ptr<Configuration>;
@@ -58,7 +51,7 @@ struct Configuration {
     int device_id = 0;
     bool is_profiling_enabled = false;
     bool operation_benchmark = false;
-    bool use_cuda_graph = false;
+    bool use_cuda_graph = true;
     uint32_t hint_num_requests = 0;
     ov::streams::Num num_streams = 0;
     ov::hint::PerformanceMode performance_mode = ov::hint::PerformanceMode::LATENCY;
diff --git a/modules/nvidia_plugin/tests/functional/shared_tests_instances/behavior/ov_executable_network/properties.cpp b/modules/nvidia_plugin/tests/functional/shared_tests_instances/behavior/ov_executable_network/properties.cpp
index 56d75edab..c785d3f3f 100644
--- a/modules/nvidia_plugin/tests/functional/shared_tests_instances/behavior/ov_executable_network/properties.cpp
+++ b/modules/nvidia_plugin/tests/functional/shared_tests_instances/behavior/ov_executable_network/properties.cpp
@@ -72,7 +72,8 @@ const std::vector<ov::AnyMap> default_properties = {
     {ov::hint::execution_mode(ov::hint::ExecutionMode::PERFORMANCE)},
     {ov::enable_profiling(false)},
     {ov::device::id("0")},
-    {ov::nvidia_gpu::operation_benchmark(false)}
+    {ov::nvidia_gpu::operation_benchmark(false)},
+    {ov::nvidia_gpu::use_cuda_graph(true)}
 };
 
 INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests,