Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU] Support large N FC optimization for dynamic quantization case #26848

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,20 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET;
#endif

#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
const int power_of_two_for_simd = 5;
const int power_of_two_for_osv = 6;
const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv);
const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1);
const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd;
// out_f(32) : 0 * osv_weight_stride + 32;
// out_f(64) : 64 * osv_weight_stride + 0;
// out_f(96) : 64 * osv_weight_stride + 32;
// out_f(128) : 128 * osv_weight_stride + 0;
// ...
uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset;
#else
uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2);
#endif

ACCUMULATOR_VEC_TYPE acc[TILE_B] = { };

Expand Down Expand Up @@ -905,7 +918,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(

__local int* char_slm_weight = (__local int*)wei_local_mem;

#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2;
#else
uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE;
#endif
uint wei_local_idx = local_id * SIMD * FILTER_LOAD_ITERS * (FILTER_LOAD_BLOCK_SIZE/2) + sglid * 2;

// DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : scale is ACCUMULATOR_VAL_ONE
Expand All @@ -917,6 +934,17 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
// Loaded weights 'wei_packed' in os_iyx_osv16 format hold contiguous values along TILE_K, so no transpose is needed while unpacking
dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
#elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx);
SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD)));
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked;
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked_tmp;
dq_wei_unpacked_tmp.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
dq_wei_unpacked_tmp.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01;
dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45;
dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23;
dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67;
#else
SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx);
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed));
Expand Down Expand Up @@ -996,11 +1024,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
acc_tmp[1][bi] = imad_SW(acc_tmp[1][bi], input_val, second_weight);
}

#if FILTER_LAYOUT_OS_IYX_OSV16 && TILE_OFM == 2
weights_offset += (TILE_K_OFM_PACKED/2) * SIMD;
#else
weights_offset += TILE_K_OFM_PACKED * SIMD;
#endif
weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD;

#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -781,8 +781,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
auto output_f = get_output_aligned_bf_size(fc_params, false).second;

WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16;
// TODO: Update may also be required to fc_bf_tiled_kernel_dyn_quan kernel to support os_is_yx_osv64_isv2 format as needed
if (!should_dynamic_quantize(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
&& (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2)
&& (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4)
&& is_weight_horizontal(fc_params, output_f)) {
Expand Down
Loading