diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 5377387c8b497e..9c952993532fd0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -534,6 +534,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para size_t tile_k_ofm_packed = tile_k_ofm; size_t quantize_grp_size = get_dynamic_quantize_group_size(params); + bool add_decompress_scale_post_op = false; WeightsType weights_dt = params.weights.GetDType(); if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) { tile_k_ofm_packed /= 2; @@ -542,7 +543,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v; // Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance if (scale_group_size % simd == 0 && !dispatchData.use_slm) - jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); + add_decompress_scale_post_op = true; } if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii")); @@ -619,6 +620,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("DQ_TYPE", "char")); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size)); } else { + if (add_decompress_scale_post_op) + jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0)); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size)); }