From e1c167a841cad0ff205b52c67f7330c29edd7075 Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Tue, 24 Sep 2024 19:03:53 +0900 Subject: [PATCH] [GPU] Modify fc_gpu_bf_tiled kernel to enable weight zp (#26367) ### Details: - *item1* - *...* ### Tickets: - CVS-150930 --------- Signed-off-by: Min, Byung-il Signed-off-by: Min, Byungil --- .../fully_connected_gpu_bf_tiled.cl | 32 ++++++----- .../fully_connected_kernel_bf_tiled.cpp | 15 ++--- .../test_cases/fully_connected_gpu_test.cpp | 55 +++++++++++++++---- 3 files changed, 72 insertions(+), 30 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index f71b51dfe24423..90eeb50fd24994 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -886,38 +886,44 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + ((IFM_SIZE / 2) * 16))); DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked; - dq_wei_unpacked.s0123 = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); - dq_wei_unpacked.s4567 = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); + // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. 
So no need to transpose while unpacking + dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); + dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); #else SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx); DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed)); #endif // Calculate zero-point and scale only for DECOMPRESSION_SCALE_POST_OP enabled + // Calculate weight : w = (w - dzp) * ds #if DECOMPRESSION_ZP_TERM #if DECOMPRESSION_ZP_SCALAR DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE); + dq_wei_unpacked -= dzp; #elif DECOMPRESSION_ZP_GROUPS_NUM > 1 - DQ_SLM_FILTER_UNPACKED_VEC dzp; + DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked); + const uint ni_offset = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE; unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + const uint offset_ofm = out_f + fi*SIMD + sglid; unroll_for(uint kii = 0; kii < FILTER_LOAD_BLOCK_SIZE; ++kii) { - const uint offset_ofm = out_f + fi*SIMD + sglid; - const uint offset_ifm = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE + load_iter * FILTER_LOAD_BLOCK_SIZE + kii; + const uint offset_ifm = ni_offset + load_iter * FILTER_LOAD_BLOCK_SIZE + kii; const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH + (offset_ifm / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH; - dzp[W_IDX] = decompression_zp[zp_offset]; + w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - TO_DQ_TYPE(decompression_zp[zp_offset]); } } #else - DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(d_zps[0]); + DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked); + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + unroll_for(uint kii = 0; kii < FILTER_LOAD_BLOCK_SIZE; ++kii) { + w[W_DYN_QUAN_IDX] = 
w[W_DYN_QUAN_IDX] - d_zps[fi % DECOMPRESSION_ZP_LENGTH]; + } + } #endif #else DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(ACCUMULATOR_VAL_ZERO); #endif - // Calculate weight : w = (w - dzp) * ds - dq_wei_unpacked -= dzp; - #if FILTER_LOAD_BLOCK_SIZE == 2 DQ_SLM_FILTER_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; char_slm_weight[wei_local_idx] = as_int(wei_1); @@ -1117,7 +1123,7 @@ KERNEL(fc)( #endif ) { #if USE_SLM - #if DYNAMIC_QUANTIZE + #if DYNAMIC_QUANTIZE && (TILE_OFM == 2) __local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD]; #else __local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD]; @@ -1259,7 +1265,7 @@ KERNEL(fc)( #endif ); } else { - #if USE_SLM && DYNAMIC_QUANTIZE + #if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2) FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)( OPTIONAL_SHAPE_INFO_TENSOR input, @@ -1306,7 +1312,7 @@ KERNEL(fc)( #endif } #else - #if USE_SLM && DYNAMIC_QUANTIZE + #if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2) FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)( OPTIONAL_SHAPE_INFO_TENSOR input, diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index c64b7419725611..ef03a0777c8126 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -111,8 +111,7 @@ static bool should_dynamic_quantize(const fully_connected_params& params) { if ((scale_group_size % simd == 0) && (input_f % dynamic_quantization_group_size == 0) && (params.is_shape_agnostic || (params.inputs[0].Batch().v > 1 && input_b > min_slm_size)) && params.inputs[0].GetDType() == Datatype::F16 && - (params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4) && - 
(params.decompression_zero_point.Feature().v == 1)) { + (params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4)) { GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size " << scale_group_size << ", Input (" << kernel_selector::toString(params.inputs[0].GetDType()) << ", " << kernel_selector::toString(params.outputs[0].GetLayout()) << ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << ", Y: " << params.inputs[0].Y().v << std ::endl; @@ -524,13 +523,15 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para if (scale_group_size % simd == 0 && !dispatchData.use_slm) jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); } - if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) + if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii")); - else if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) + } else if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii")); - else + } else { jit.AddConstant(MakeJitConstant("W_IDX", "kii * TILE_OFM + fi")); + } + jit.AddConstant(MakeJitConstant("W_DYN_QUAN_IDX", "fi * TILE_K + kii")); if (dispatchData.use_slm) { OPENVINO_ASSERT(dispatchData.tile_n == 2, "[GPU] Unsupported TILE_OFM size for SLM kernel configuration"); @@ -576,14 +577,14 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para } // Validated perf gain, Dynamic quantize force enable SCALE_POST_OP for char type multiplication - if (should_dynamic_quantize(params) && dispatchData.tile_m > 1 && dispatchData.tile_n == 2) { + if (should_dynamic_quantize(params)) { jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1)); jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); jit.AddConstant(MakeJitConstant("DQ_TYPE", "char")); 
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size)); } else { jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0)); - jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", -1)); + jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size)); } jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second)); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index 30b15f0c25a08b..3c2475d3d069df 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -2540,7 +2540,8 @@ class fully_connected_gpu_tests: public ::testing::Test { } void test_compressed_int4_scale_dyn_quan_weight_i4(bool is_dynamic, int batch = 1, int ifm = 512, int ofm = 2048, - int quantize_group_size = 32, int scales_group_size = 128) { + int quantize_group_size = 32, int scales_group_size = 128, + bool is_wzp_test = false, bool is_wzp_scalar = false) { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); @@ -2550,12 +2551,15 @@ class fully_connected_gpu_tests: public ::testing::Test { long int batch_num = batch; long int ifm_num = ifm; long int ofm_num = ofm; + long int wzp_num = is_wzp_scalar ? 
1 : ofm_num; auto input_ps = ov::PartialShape{ batch_num, 1, ifm_num }; auto input_mem = engine.allocate_memory({ input_ps, data_types::f16, format::bfyx }); auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::i4, format::bfyx }); auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::fbyx }); + auto dcomp_zp_mem = engine.allocate_memory({ {wzp_num, 1}, data_types::u8, format::bfyx }); + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -2.f, 2.f); set_values(input_mem, input_data); @@ -2566,20 +2570,30 @@ class fully_connected_gpu_tests: public ::testing::Test { auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -2.f, 2.f); set_values(scale_mem, scale_data); + if (is_wzp_test) { + auto zp_data = rg.generate_random_1d(wzp_num, 0, 2); + set_values(dcomp_zp_mem, zp_data); + } + auto in_layout = is_dynamic ? layout{ ov::PartialShape{ -1, -1, -1 }, data_types::f16, format::bfyx } : layout{ input_ps, data_types::f16, format::bfyx }; - auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, 3, 2); - fc_prim.decompression_zero_point_scalar = 0; + auto dcomp_zp_name = is_wzp_test ? "wzp" : ""; + auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, 3, 2); + + if (is_wzp_test) { + fc_prim.compressed_weights = true; + fc_prim.decompression_zero_point = is_wzp_test ? 
"wzp" : ""; + } // Implemented dynamic quantize kernel auto get_ref_results = [&]() { - topology topology( - input_layout("input", in_layout), - data("weights", weights_mem), - data("scale", scale_mem), - fc_prim - ); + topology topo; + topo.add(input_layout("input", in_layout)); + topo.add(data("weights", weights_mem)); + topo.add(data("scale", scale_mem)); + topo.add(data("wzp", dcomp_zp_mem)); + topo.add(fc_prim); auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); @@ -2587,7 +2601,7 @@ class fully_connected_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); config.set_property(ov::hint::dynamic_quantization_group_size(0)); - network network(engine, topology, config); + network network(engine, topo, config); network.set_input_data("input", input_mem); auto outputs = network.execute(); @@ -2604,6 +2618,7 @@ class fully_connected_gpu_tests: public ::testing::Test { input_layout("input", in_layout), data("weights", weights_mem), data("scale", scale_mem), + data("wzp", dcomp_zp_mem), fc_prim ); @@ -3699,6 +3714,26 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_edge_ca this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 359, 1536, 2560, 128, 64); } +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_no_wzp) { + this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 320, 1024, 1024, 32, 32, false); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp) { + this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 320, 1024, 1024, 32, 32, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_scalar) { + this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 320, 1024, 1024, 32, 32, /*is_wzp_test=*/true, /*is_wzp_scalar=*/true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_128) { + 
this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 320, 1024, 1024, 128, 128, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_static) { + this->test_compressed_int4_scale_dyn_quan_weight_i4(false, 320, 1024, 1024, 32, 32, true); +} + TEST_F(fully_connected_gpu_tests, compressed_scale_bias) { this->test_compressed_scale_bias(false); }