diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
index 04d39ffafdcf94..febcabd57efba0 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
@@ -55,6 +55,7 @@ static constexpr Property enable_lp_transformation
 static constexpr Property max_dynamic_batch{"DYN_BATCH_LIMIT"};
 static constexpr Property nv12_two_inputs{"GPU_NV12_TWO_INPUTS"};
 static constexpr Property buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
+static constexpr Property max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};
 
 }  // namespace intel_gpu
 }  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp
index 92c9c4ae9291fb..d6652beb8afb15 100644
--- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp
+++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp
@@ -103,6 +103,14 @@ ProgramBuilder::ProgramBuilder(std::shared_ptr<ov::Model> model, cldnn::engine&
     CustomLayer::LoadFromFile(custom_layers_config, m_custom_layers, custom_layers_config.empty());
 
     auto ops = model->get_ordered_ops();
+    // In the case of dynamic models, because most of the layers are mapped to shape agnostic kernels,
+    // smaller # of kernels are built compared to static models.
+    // So having smaller batch size is even better for dynamic model as we can do more parallel build.
+    if (model->is_dynamic()) {
+        m_config.set_property(ov::intel_gpu::max_kernels_per_batch(4));
+    } else {
+        m_config.set_property(ov::intel_gpu::max_kernels_per_batch(8));
+    }
 
     m_program = build(ops, partial_build, is_inner_program);
 }
diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp
index d81e603f9add5a..8a57759bff9413 100644
--- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp
+++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp
@@ -73,7 +73,8 @@ void ExecutionConfig::set_default() {
         std::make_tuple(ov::intel_gpu::partial_build_program, false),
         std::make_tuple(ov::intel_gpu::allow_new_shape_infer, false),
         std::make_tuple(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape, false),
-        std::make_tuple(ov::intel_gpu::buffers_preallocation_ratio, 1.1f));
+        std::make_tuple(ov::intel_gpu::buffers_preallocation_ratio, 1.1f),
+        std::make_tuple(ov::intel_gpu::max_kernels_per_batch, 8));
 }
 
 void ExecutionConfig::register_property_impl(const std::pair& property, PropertyVisibility visibility, BaseValidator::Ptr validator) {
diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
index e835c4678fa873..a5c9529b327feb 100644
--- a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
+++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
@@ -89,10 +89,9 @@ size_t kernels_cache::get_max_kernels_per_batch() const {
     GPU_DEBUG_IF(debug_config->max_kernels_per_batch >= 1) {
         return static_cast<size_t>(debug_config->max_kernels_per_batch);
     }
-    return 8;
+    return _config.get_property(ov::intel_gpu::max_kernels_per_batch);
 }
 
-
 void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::vector* all_batches) const {
     OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll::GetProgramSource");
     std::map>> program_buckets;
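For context, the three changes combine as in the short sketch below. This is not code from the PR: it assumes the plugin-internal ov::intel_gpu::ExecutionConfig class and the intel_gpu/runtime header paths shown in the diff, and it simply mirrors the set_property/get_property calls used in program_builder.cpp and kernels_cache.cpp.

// Sketch only; assumes internal GPU-plugin headers and namespaces, not the public OpenVINO API.
#include <cstddef>
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/runtime/internal_properties.hpp"

// Mirrors program_builder.cpp: dynamic models map most layers to shape-agnostic
// kernels, so fewer kernels are compiled overall and a smaller batch size lets
// more batches be built in parallel.
void set_kernel_batch_size(ov::intel_gpu::ExecutionConfig& config, bool is_dynamic_model) {
    config.set_property(ov::intel_gpu::max_kernels_per_batch(is_dynamic_model ? 4 : 8));
}

// Mirrors kernels_cache.cpp: the batch size now comes from the config
// (default 8, set in execution_config.cpp) instead of a hard-coded constant,
// unless the debug config overrides it.
std::size_t get_kernel_batch_size(const ov::intel_gpu::ExecutionConfig& config) {
    return config.get_property(ov::intel_gpu::max_kernels_per_batch);
}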