Skip to content

Commit

Permalink
[GPU] smaller kernel batch size for dynamic models (openvinotoolkit#24190)
Browse files Browse the repository at this point in the history

### Details:
- We found a performance issue when multiple `gemm_tiled_opt` kernels
are built as a single program.
- This PR adds a workaround (WA) to lower the kernel batch size to four
in the case of dynamic models.
  • Loading branch information
e-ddykim authored Apr 30, 2024
1 parent 77cab19 commit 6e9949d
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ static constexpr Property<bool, PropertyMutability::RW> enable_lp_transformation
static constexpr Property<size_t, PropertyMutability::RW> max_dynamic_batch{"DYN_BATCH_LIMIT"};
static constexpr Property<bool, PropertyMutability::RW> nv12_two_inputs{"GPU_NV12_TWO_INPUTS"};
static constexpr Property<float, PropertyMutability::RW> buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
static constexpr Property<size_t, PropertyMutability::RW> max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};

} // namespace intel_gpu
} // namespace ov
Expand Down
8 changes: 8 additions & 0 deletions src/plugins/intel_gpu/src/plugin/program_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,14 @@ ProgramBuilder::ProgramBuilder(std::shared_ptr<ov::Model> model, cldnn::engine&
CustomLayer::LoadFromFile(custom_layers_config, m_custom_layers, custom_layers_config.empty());

auto ops = model->get_ordered_ops();
// In the case of dynamic models, because most of the layers are mapped to shape-agnostic kernels,
// a smaller number of kernels is built compared to static models.
// So having a smaller batch size is even better for dynamic models, as we can do more parallel builds.
if (model->is_dynamic()) {
m_config.set_property(ov::intel_gpu::max_kernels_per_batch(4));
} else {
m_config.set_property(ov::intel_gpu::max_kernels_per_batch(8));
}

m_program = build(ops, partial_build, is_inner_program);
}
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ void ExecutionConfig::set_default() {
std::make_tuple(ov::intel_gpu::partial_build_program, false),
std::make_tuple(ov::intel_gpu::allow_new_shape_infer, false),
std::make_tuple(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape, false),
std::make_tuple(ov::intel_gpu::buffers_preallocation_ratio, 1.1f));
std::make_tuple(ov::intel_gpu::buffers_preallocation_ratio, 1.1f),
std::make_tuple(ov::intel_gpu::max_kernels_per_batch, 8));
}

void ExecutionConfig::register_property_impl(const std::pair<std::string, ov::Any>& property, PropertyVisibility visibility, BaseValidator::Ptr validator) {
Expand Down
3 changes: 1 addition & 2 deletions src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,9 @@ size_t kernels_cache::get_max_kernels_per_batch() const {
GPU_DEBUG_IF(debug_config->max_kernels_per_batch >= 1) {
return static_cast<size_t>(debug_config->max_kernels_per_batch);
}
return 8;
return _config.get_property(ov::intel_gpu::max_kernels_per_batch);
}


void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::vector<kernels_cache::batch_program>* all_batches) const {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll::GetProgramSource");
std::map<std::string, std::tuple<int32_t, std::vector<batch_program>>> program_buckets;
Expand Down

0 comments on commit 6e9949d

Please sign in to comment.