Skip to content

Commit

Permalink
[GPU] smaller kernel batch size for dynamic models (openvinotoolkit#24190)
Browse files Browse the repository at this point in the history

### Details:
- We found a performance issue when multiple `gemm_tiled_opt` kernels
are built as a single program.
- This PR adds a workaround (WA) to lower the kernel batch size to four
in the case of dynamic models.
  • Loading branch information
e-ddykim authored Apr 30, 2024
1 parent 77cab19 commit 6e9949d
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ static constexpr Property<bool, PropertyMutability::RW> enable_lp_transformation
static constexpr Property<size_t, PropertyMutability::RW> max_dynamic_batch{"DYN_BATCH_LIMIT"};
static constexpr Property<bool, PropertyMutability::RW> nv12_two_inputs{"GPU_NV12_TWO_INPUTS"};
static constexpr Property<float, PropertyMutability::RW> buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
static constexpr Property<size_t, PropertyMutability::RW> max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};

} // namespace intel_gpu
} // namespace ov
Expand Down
8 changes: 8 additions & 0 deletions src/plugins/intel_gpu/src/plugin/program_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,14 @@ ProgramBuilder::ProgramBuilder(std::shared_ptr<ov::Model> model, cldnn::engine&
CustomLayer::LoadFromFile(custom_layers_config, m_custom_layers, custom_layers_config.empty());

auto ops = model->get_ordered_ops();
// In the case of dynamic models, because most of the layers are mapped to shape-agnostic kernels,
// a smaller number of kernels is built compared to static models.
// So having a smaller batch size is even better for dynamic models, as we can do more parallel builds.
if (model->is_dynamic()) {
m_config.set_property(ov::intel_gpu::max_kernels_per_batch(4));
} else {
m_config.set_property(ov::intel_gpu::max_kernels_per_batch(8));
}

m_program = build(ops, partial_build, is_inner_program);
}
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ void ExecutionConfig::set_default() {
std::make_tuple(ov::intel_gpu::partial_build_program, false),
std::make_tuple(ov::intel_gpu::allow_new_shape_infer, false),
std::make_tuple(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape, false),
std::make_tuple(ov::intel_gpu::buffers_preallocation_ratio, 1.1f));
std::make_tuple(ov::intel_gpu::buffers_preallocation_ratio, 1.1f),
std::make_tuple(ov::intel_gpu::max_kernels_per_batch, 8));
}

void ExecutionConfig::register_property_impl(const std::pair<std::string, ov::Any>& property, PropertyVisibility visibility, BaseValidator::Ptr validator) {
Expand Down
3 changes: 1 addition & 2 deletions src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,9 @@ size_t kernels_cache::get_max_kernels_per_batch() const {
GPU_DEBUG_IF(debug_config->max_kernels_per_batch >= 1) {
return static_cast<size_t>(debug_config->max_kernels_per_batch);
}
return 8;
return _config.get_property(ov::intel_gpu::max_kernels_per_batch);
}


void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::vector<kernels_cache::batch_program>* all_batches) const {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll::GetProgramSource");
std::map<std::string, std::tuple<int32_t, std::vector<batch_program>>> program_buckets;
Expand Down

0 comments on commit 6e9949d

Please sign in to comment.