From 4bbd2040c8e5069e41f9115f49cbb1f3a1dce9a8 Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Mon, 14 Aug 2023 01:30:09 -0700
Subject: [PATCH 01/17] fix avx2 fp16 load_store_emitters issue

---
 .../plugin/x64/jit_load_store_emitters.cpp    | 118 ++++++++++--------
 1 file changed, 64 insertions(+), 54 deletions(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
index e7e668335e49ee..226c7f1dcd4779 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
@@ -497,53 +497,72 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak
     auto ymm = Xbyak::Ymm(vmm.getIdx());
     auto zmm = Xbyak::Zmm(vmm.getIdx());

+    auto load_words_to_dword_base = [&]() {
+        load_bytes(xmm, reg, offset, load_size);
+        if (is_bf16) {
+            h->uni_vpmovzxwd(vmm, xmm);
+            h->uni_vpslld(vmm, vmm, 16);
+        } else if (is_f16) {
+            h->vcvtph2ps(ymm, xmm);
+        } else {
+            if (is_signed)
+                h->uni_vpmovsxwd(vmm, xmm);
+            else
+                h->uni_vpmovzxwd(vmm, xmm);
+        }
+    };
+
     // For load_size == 32/16/8, do load/extension in one go
     // including xmm/ymm tail block for ymm/zmm, so explicit xmm/ymm/zmm
     switch (load_size) {
-        case 32: {
-            if (is_bf16) {
-                h->uni_vpmovzxwd(zmm, ptr[reg + offset]);
-                h->uni_vpslld(zmm, zmm, 16);
-            } else if (is_f16) {
-                h->vcvtph2ps(zmm, ptr[reg + offset]);
-            } else {
-                if (is_signed)
-                    h->uni_vpmovsxwd(zmm, ptr[reg + offset]);
-                else
-                    h->uni_vpmovzxwd(zmm, ptr[reg + offset]);
-            }
-            break;
+    case 32: {
+        if (mayiuse(cpu::x64::avx512_core)) {
+            if (is_bf16) {
+                h->uni_vpmovzxwd(zmm, ptr[reg + offset]);
+                h->uni_vpslld(zmm, zmm, 16);
+            } else if (is_f16) {
+                h->vcvtph2ps(zmm, ptr[reg + offset]);
+            } else {
+                if (is_signed)
+                    h->uni_vpmovsxwd(zmm, ptr[reg + offset]);
+                else
+                    h->uni_vpmovzxwd(zmm, ptr[reg + offset]);
+            }
+        } else {
+            load_words_to_dword_base();
         }
-        case 16: {
-            if (is_bf16) {
+        break;
+    }
+    case 16: {
+        if (is_bf16) {
             h->uni_vpmovzxwd(ymm, ptr[reg + offset]);
             h->uni_vpslld(ymm, ymm, 16);
-            } else if (is_f16) {
+        } else if (is_f16) {
             h->vcvtph2ps(ymm, ptr[reg + offset]);
-            } else {
+        } else {
             if (is_signed)
                 h->uni_vpmovsxwd(ymm, ptr[reg + offset]);
             else
                 h->uni_vpmovzxwd(ymm, ptr[reg + offset]);
-            }
-            break;
         }
-        case 8: {
-            if (is_bf16) {
+        break;
+    }
+    case 8: {
+        if (is_bf16) {
             h->uni_vpmovzxwd(xmm, ptr[reg + offset]);
             h->uni_vpslld(xmm, xmm, 16);
-            } else if (is_f16) {
+        } else if (is_f16) {
             h->vcvtph2ps(xmm, ptr[reg + offset]);
-            } else {
+        } else {
             if (is_signed)
                 h->uni_vpmovsxwd(xmm, ptr[reg + offset]);
             else
                 h->uni_vpmovzxwd(xmm, ptr[reg + offset]);
-            }
-            break;
         }
-        default: {
-            if (is_zmm && load_size > threshold_for_mask_emu_load) {
+        break;
+    }
+    default: {
+        if (is_zmm && load_size > threshold_for_mask_emu_load) {
             unsigned int mask = 1;
             mask = (mask << (load_size / 2)) - mask;
             h->mov(Reg32(aux_gpr_idxs[0]), mask);
             h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
             if (is_bf16) {
                 h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]);
                 h->uni_vpslld(vmm, vmm, 16);
             } else if (is_f16) {
                 h->vcvtph2ps(vmm | k_mask | T_z, ptr[reg + offset]);
             } else {
                 if (is_signed)
                     h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]);
                 else
                     h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]);
             }
-            } else {
+        } else {
             // xmm or ymm version
-            load_bytes(xmm, reg, offset, load_size);
-            if (is_bf16) {
-                h->uni_vpmovzxwd(vmm, xmm);
-                h->uni_vpslld(vmm, vmm, 16);
-            } else if (is_f16) {
-                h->vcvtph2ps(ymm, xmm);
-            } else {
-                if (is_signed)
-                    h->uni_vpmovsxwd(vmm, xmm);
-                else
-                    h->uni_vpmovzxwd(vmm, xmm);
-            }
-            }
-            break;
+            load_words_to_dword_base();
         }
+        break;
+    }
     }
 }

@@ -1188,20 +1196,22 @@ void
jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, store_bytes(reg, offset, store_num * 2); } } else if (is_f16) { - if (!mayiuse(cpu::x64::avx512_core_fp16)) - OPENVINO_THROW("Store emitter in ", name_, " only support fp16 on platform with avx512_core_fp16."); - // to avoid src vmm pollution - if (src_prc_ == ov::element::f32) { - // since avx512, zmm(fp32) => ymm(fp16) - ymm = Ymm(aux_vec_idxs[0]); - } // in I32 case, zmm&ymm is already in aux reg - - h->vcvtps2ph(ymm, zmm, 0x4); - if (store_num == 16) { - h->vmovdqu16(ptr[reg + offset], ymm); + if (mayiuse(cpu::x64::avx512_core_fp16)) { + // to avoid src vmm pollution + if (src_prc_ == ov::element::f32) { + // since avx512, zmm(fp32) => ymm(fp16) + ymm = Ymm(aux_vec_idxs[0]); + } // in I32 case, zmm&ymm is already in aux reg + + h->vcvtps2ph(ymm, zmm, 0x4); + if (store_num == 16) { + h->vmovdqu16(ptr[reg + offset], ymm); + } else { + data_idx = static_cast(ymm.getIdx()); + store_bytes(reg, offset, store_num * 2); + } } else { - data_idx = static_cast(ymm.getIdx()); - store_bytes(reg, offset, store_num * 2); + store_dword_to_word_base(); } } else { switch (store_num) { From d375bd553b8fdab021ae5138e87395e6cf0ac4ad Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Wed, 16 Aug 2023 01:37:02 -0700 Subject: [PATCH 02/17] add eltwise_node bf16 support, and brgconv_avx2 for common cpu node DefaultImplPriority --- src/plugins/intel_cpu/src/config.cpp | 10 ++++----- .../plugin/x64/jit_load_store_emitters.cpp | 21 ++++++++++++++++--- src/plugins/intel_cpu/src/node.cpp | 3 +++ src/plugins/intel_cpu/src/nodes/eltwise.cpp | 13 +++++++----- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 53b2779936b7b3..a3a061891d6634 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -219,7 +219,7 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { ". 
Expected only true/false"); } if (enable) { - if (mayiuse(avx512_core)) { + if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) { inferencePrecision = ov::element::bf16; } else { OPENVINO_THROW("Platform doesn't support BF16 format"); @@ -234,12 +234,12 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { auto const prec = val.as(); inferencePrecisionSetExplicitly = true; if (prec == ov::element::bf16) { - if (mayiuse(avx512_core)) { + if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) { inferencePrecision = ov::element::bf16; } } else if (prec == ov::element::f16) { #if defined(OPENVINO_ARCH_X86_64) - if (mayiuse(avx512_core_fp16) || mayiuse(avx512_core_amx_fp16)) { + if (mayiuse(avx512_core_fp16) || mayiuse(avx512_core_amx_fp16) || mayiuse(avx2_vnni_2)) { inferencePrecision = ov::element::f16; } #elif defined(OV_CPU_ARM_ENABLE_FP16) @@ -327,7 +327,7 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { if (modelType != ModelType::CNN) inferencePrecision = ov::element::f16; #else - if (mayiuse(avx512_core_bf16)) + if (mayiuse(avx512_core_bf16) || mayiuse(avx2_vnni_2)) inferencePrecision = ov::element::bf16; #endif } else { @@ -398,4 +398,4 @@ void Config::updateProperties() { } } // namespace intel_cpu -} // namespace ov +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp index 226c7f1dcd4779..e63cef2e49b3f9 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp @@ -474,8 +474,10 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak bool is_f16 = (prc == ov::element::f16); bool is_signed = prc.is_signed(); - if (is_f16 && !mayiuse(cpu::x64::avx512_core_fp16)) - OPENVINO_THROW("Load emitter in ", name_, " only support fp16 on platform with avx512_core_fp16."); + if (is_f16 && !mayiuse(cpu::x64::avx512_core_fp16) && !mayiuse(cpu::x64::avx2_vnni_2)) + OPENVINO_THROW("Load emitter in ", + name_, + " only support fp16 on platform with avx512_core_fp16 or avx2_vnni_2."); // Ensure extended double words fit inside Zmm (32/2(num) * 32 <= 512) // For Ymm register, load capacity is halved (16/2(num) * 32 <= 128) @@ -1210,8 +1212,21 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, data_idx = static_cast(ymm.getIdx()); store_bytes(reg, offset, store_num * 2); } + } else if (mayiuse(cpu::x64::avx2_vnni_2)) { + // to avoid src vmm pollution + if (src_prc_ == Precision::FP32) { + xmm = Xmm(aux_vec_idxs[0]); + } + h->vcvtps2ph(xmm, ymm, 0x4); + if (store_num == 16) { + h->uni_vmovdqu(ptr[reg + offset], xmm); + } else { + data_idx = static_cast(xmm.getIdx()); + store_bytes(reg, offset, store_num * 2); + } } else { - store_dword_to_word_base(); + IE_THROW() << "Store emitter in " << name_ + << " only support fp16 on platform with avx512_core_fp16 or avx2_vnni_2."; } } else { switch (store_num) { diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index d9b5bfc490db8e..2293c427bfdcb4 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -1019,6 +1019,9 @@ const std::vector& Node::getDefaultImplPriority() { impl_desc_type::jit_avx512_dw, impl_desc_type::jit_avx512_1x1, impl_desc_type::jit_avx512, + // [WA]default support after fully evaluate + // impl_desc_type::brgconv_avx2_1x1, + // 
impl_desc_type::brgconv_avx2, impl_desc_type::jit_avx2_dw, impl_desc_type::jit_avx2_1x1, impl_desc_type::jit_avx2, diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index e471e29fc87185..92b478fb8518ee 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -283,7 +283,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener this, p->entry_[i], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); } - if (mayiuse(avx512_core)) + if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(this, isa)); const auto &jep = jep_; @@ -839,8 +839,12 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener uni_vmovups(op, vmm_dst); break; case ov::element::bf16: - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); - vmovdqu16(op, ymm_dst); + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(ymm_dst.getIdx())}); + if (isa == x64::avx512_core) + vmovdqu16(op, ymm_dst); + else + uni_vmovdqu(op, ymm_dst); break; case ov::element::f16: vcvtps2ph(op, vmm_dst, 0x4); @@ -2184,8 +2188,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { if (!fusedWith.empty()) { outputPrecision = fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0); } - - if (!mayiuse(avx512_core)) { + if (!mayiuse(avx512_core) && !mayiuse(avx2_vnni_2)) { bool hasBF16 = false; for (auto &inPrc : inputPrecisions) if (inPrc == ov::element::bf16) From cfd846da7ac85cfec0b1647a36f30e38c7fa401b Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Thu, 21 Sep 2023 23:44:37 -0700 Subject: [PATCH 03/17] MVN node support avx2_bf16 --- src/plugins/intel_cpu/src/nodes/mvn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/nodes/mvn.cpp b/src/plugins/intel_cpu/src/nodes/mvn.cpp index 97495042f2c6a4..dbfd36296fb484 100644 --- a/src/plugins/intel_cpu/src/nodes/mvn.cpp +++ b/src/plugins/intel_cpu/src/nodes/mvn.cpp @@ -1829,7 +1829,7 @@ void MVN::initSupportedPrimitiveDescriptors() { ov::element::Type inputPrecision = getOriginalInputPrecisionAtPort(0); ov::element::Type outputPrecision = getOriginalOutputPrecisionAtPort(0); - if (!mayiuse(avx512_core)) { + if (!mayiuse(avx512_core) && !mayiuse(avx2_vnni_2)) { if (outputPrecision == ov::element::bf16) outputPrecision = ov::element::f32; } From 6ab20978aa34e206a03ffbdafb61a1a6db8421a3 Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Mon, 9 Oct 2023 22:19:42 -0700 Subject: [PATCH 04/17] limit brgconv support of avx2 to avx2_vnni_2, and enable s8s8 conv on avx2_vnni_2 --- src/plugins/intel_cpu/src/nodes/conv.cpp | 10 ++++++---- .../src/transformations/transformation_pipeline.cpp | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 9b0f355a9e2f16..fdfee25e531f0c 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -373,7 +373,8 @@ const std::vector& Convolution::getDefaultImplPriority() { } const bool Convolution::isBrgConvAvailable() { - static const bool isBrgConvAvailable = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core); + static const bool isBrgConvAvailable = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) || + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2); return 
isBrgConvAvailable;
 }

@@ -1634,12 +1635,13 @@ void Convolution::initializeInputZeroPoints(const uint8_t* inputZpData, const si
         if (inputZpData[j] != inputZpData[0])
             inputZeroPointType = zpType::PerChannel;
     }
-    // Only enable per-tensor zero point on avx512-amx and avx512-core-vnni.
+    // Only enable per-tensor zero point on avx512-amx, avx512-core-vnni and avx2_vnni_2.
     // If zero point is pertensor, both legacy zp and stock zp
     // would be passed into conv node. The conv node would determine how to create
     // post-ops attribute and prioritize to choose final onednn kernel.
-    if (inputZeroPointType == zpType::PerTensor &&
-        (impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_vnni)))
+    if (inputZeroPointType == zpType::PerTensor && (impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx) ||
+                                                    impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_vnni) ||
+                                                    impl::cpu::x64::mayiuse(impl::cpu::x64::avx2_vnni_2)))
         inputZeroPoints.push_back(static_cast<uint8_t>(inputZpData[0]));
     else
         inputZeroPointType = zpType::PerChannel;
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 47a1efbf9494ed..abef96a1197938 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -518,9 +518,10 @@ void Transformations::Lpt(const bool hasINT16orINT32Levels, const std::vector
     std::vector<ov::element::Type> input0LowPrecisionList;
-    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
+    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) ||
+        dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) {
         input0LowPrecisionList = {ov::element::u8, ov::element::i8};
     } else {
         input0LowPrecisionList = {ov::element::u8};

From e44e5d3e17ab2c3d64d372a1cfdedbc3128eb433 Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Thu, 12 Oct 2023 01:55:42 -0700
Subject: [PATCH 05/17] fix avx2 bf16 memory free issue of Eltwise node

---
 src/plugins/intel_cpu/src/nodes/eltwise.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp
index 92b478fb8518ee..e02b1b422b059a 100644
--- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp
+++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp
@@ -839,12 +839,15 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
                 uni_vmovups(op, vmm_dst);
                 break;
             case ov::element::bf16:
-                uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())},
-                                             {static_cast<size_t>(ymm_dst.getIdx())});
-                if (isa == x64::avx512_core)
+                if (isa == x64::avx512_core) {
+                    uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())},
+                                                 {static_cast<size_t>(ymm_dst.getIdx())});
                     vmovdqu16(op, ymm_dst);
-                else
-                    uni_vmovdqu(op, ymm_dst);
+                } else {
+                    uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())},
+                                                 {static_cast<size_t>(xmm_dst.getIdx())});
+                    uni_vmovdqu(op, xmm_dst);
+                }
                 break;
             case ov::element::f16:
                 vcvtps2ph(op, vmm_dst, 0x4);
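For context on the Eltwise fix above: on AVX2 a YMM register holds eight f32 lanes, and converting them to bf16 produces eight 16-bit values, i.e. exactly one XMM's worth of data, so storing the full YMM wrote 16 extra bytes past the destination block. A minimal sketch of that size arithmetic (illustrative only; the names below are not from the patch):

    #include <cstddef>
    #include <cstdint>

    // One AVX2 YMM register holds 8 f32 lanes (32 bytes / 4 bytes per lane).
    constexpr std::size_t f32_lanes  = 32 / sizeof(float);                 // 8
    // After f32 -> bf16 conversion each lane shrinks to 2 bytes...
    constexpr std::size_t bf16_bytes = f32_lanes * sizeof(std::uint16_t);  // 16
    // ...which is exactly one XMM, hence uni_vmovdqu(op, xmm_dst) above.
    static_assert(bf16_bytes == 16, "8 bf16 results fit in one XMM, not a YMM");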
From 6565690ac76b4d55513f04674d7594a831d89ad2 Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Mon, 16 Oct 2023 02:25:08 -0700
Subject: [PATCH 06/17] re-add avx2_vnni_2 configuration, as the temporary
 commit was dropped

---
 src/inference/dev_api/ie_system_conf.h                 | 7 +++++++
 src/inference/dev_api/openvino/runtime/system_conf.hpp | 7 +++++++
 src/inference/src/system_conf.cpp                      | 5 +++++
 src/plugins/intel_cpu/src/nodes/conv.cpp               | 2 ++
 4 files changed, 21 insertions(+)

diff --git a/src/inference/dev_api/ie_system_conf.h b/src/inference/dev_api/ie_system_conf.h
index c0d2d81704f432..adc887c7284da6 100644
--- a/src/inference/dev_api/ie_system_conf.h
+++ b/src/inference/dev_api/ie_system_conf.h
@@ -109,6 +109,13 @@ using ov::with_cpu_x86_avx2;
  */
 using ov::with_cpu_x86_avx2_vnni;

+/**
+ * @brief Checks whether CPU supports AVX2_VNNI_2 capability
+ * @ingroup ie_dev_api_system_conf
+ * @return `True` if AVX2_VNNI_2 instructions are available, `false` otherwise
+ */
+using ov::with_cpu_x86_avx2_vnni_2;
+
 /**
  * @brief Checks whether CPU supports AVX 512 capability
  * @ingroup ie_dev_api_system_conf
diff --git a/src/inference/dev_api/openvino/runtime/system_conf.hpp b/src/inference/dev_api/openvino/runtime/system_conf.hpp
index c3648a69118e6b..72ecf9a61694ef 100644
--- a/src/inference/dev_api/openvino/runtime/system_conf.hpp
+++ b/src/inference/dev_api/openvino/runtime/system_conf.hpp
@@ -97,6 +97,13 @@ OPENVINO_RUNTIME_API bool with_cpu_x86_avx2();
  */
 OPENVINO_RUNTIME_API bool with_cpu_x86_avx2_vnni();

+/**
+ * @brief Checks whether CPU supports AVX2_VNNI_2 capability
+ * @ingroup ov_dev_api_system_conf
+ * @return `True` if AVX2_VNNI_2 instructions are available, `false` otherwise
+ */
+OPENVINO_RUNTIME_API bool with_cpu_x86_avx2_vnni_2();
+
 /**
  * @brief Checks whether CPU supports AVX 512 capability
  * @ingroup ov_dev_api_system_conf
diff --git a/src/inference/src/system_conf.cpp b/src/inference/src/system_conf.cpp
index 68e6e36df4f051..07278e2dde5fdb 100644
--- a/src/inference/src/system_conf.cpp
+++ b/src/inference/src/system_conf.cpp
@@ -60,6 +60,11 @@ bool with_cpu_x86_avx2_vnni() {
     return get_cpu_info().has(Xbyak::util::Cpu::tAVX2 | Xbyak::util::Cpu::tAVX_VNNI);
 }

+bool with_cpu_x86_avx2_vnni_2() {
+    return with_cpu_x86_avx2_vnni() &&
+           get_cpu_info().has(Xbyak::util::Cpu::tAVX_VNNI_INT8 | Xbyak::util::Cpu::tAVX_NE_CONVERT);
+}
+
 bool with_cpu_x86_avx512f() {
     return get_cpu_info().has(Xbyak::util::Cpu::tAVX512F);
 }
diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp
index fdfee25e531f0c..71506791ae0c3d 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -349,6 +349,8 @@ const std::vector<impl_desc_type>& Convolution::getDefaultImplPriority() {
         impl_desc_type::jit_avx512_dw,
         impl_desc_type::jit_avx512_1x1,
         impl_desc_type::jit_avx512,
+        impl_desc_type::brgconv_avx2_1x1,
+        impl_desc_type::brgconv_avx2,
         impl_desc_type::jit_avx2_dw,
         impl_desc_type::jit_avx2_1x1,
         impl_desc_type::jit_avx2,

From f4ace001c171213b4ab8c1b9c336890cbd25a397 Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Wed, 18 Oct 2023 23:03:09 -0700
Subject: [PATCH 07/17] W.A.: bf16/fp16 gather jit impl and bf16/fp16
 weightsDecompression of fullyconnected not supported on avx2_vnni_2

---
 src/plugins/intel_cpu/src/nodes/fullyconnected.cpp | 5 +++++
 src/plugins/intel_cpu/src/nodes/gather.cpp         | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index ba0df9d418ba84..3a5ee5a96ce7c1 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -206,6 +206,11 @@ void FullyConnected::getSupportedDescriptors() {
         if (one_of(outputDataType , memory::data_type::u8, memory::data_type::s8)) {
             outputDataType = memory::data_type::bf16;
         }
+        // W.A.
WeightsDecompression not supported on avx2_vnni_2 + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2) && + weightsDataType == memory::data_type::u8) { + inputDataType = outputDataType = memory::data_type::f32; + } } else if (inputDataType == memory::data_type::f16) { #if defined(OV_CPU_WITH_ACL) // acl fc does not support precisions conversion diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index 6a9949365ced87..82acd63aab84a3 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -179,6 +179,12 @@ void Gather::createPrimitive() { if (isInPlace()) { return; } + // W.A gather bf16/fp16 jit impl has ACC issue on avx2_vnni_2 + Precision dataPrecision = getOriginalInputPrecisionAtPort(GATHER_DATA); + if (one_of(dataPrecision, Precision::BF16, Precision::FP16) && + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) + return; + #if defined(OPENVINO_ARCH_X86_64) uint64_t idxElPerVec = 1; if (!isDynamicNode()) { From d1494a201236d653cd9626aa967c0dd192724b6d Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Thu, 26 Oct 2023 20:28:01 -0700 Subject: [PATCH 08/17] fix elementwise functional test 'primTypeCheck' fail error, extend 'hasHardwareSupport' function on avx2_vnni_2 --- src/plugins/intel_cpu/src/utils/precision_support.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_cpu/src/utils/precision_support.cpp b/src/plugins/intel_cpu/src/utils/precision_support.cpp index cc942777697c51..ebd71290624cfb 100644 --- a/src/plugins/intel_cpu/src/utils/precision_support.cpp +++ b/src/plugins/intel_cpu/src/utils/precision_support.cpp @@ -14,7 +14,8 @@ bool hasHardwareSupport(const ov::element::Type& precision) { switch (precision) { case ov::element::f16: { #if defined(OPENVINO_ARCH_X86_64) - if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_fp16)) + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_fp16) || + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) return true; return false; #elif defined(OV_CPU_ARM_ENABLE_FP16) @@ -25,7 +26,8 @@ bool hasHardwareSupport(const ov::element::Type& precision) { } case ov::element::bf16: { #if defined(OPENVINO_ARCH_X86_64) - if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) || + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) return true; return false; #else From 31bd8414baeb7b3cd950774a550b1f31d2b1d09d Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Wed, 1 Nov 2023 00:23:55 -0700 Subject: [PATCH 09/17] use avx2_vnni_2 new convert instructions --- .../emitters/plugin/x64/jit_bf16_emitters.hpp | 3 + .../plugin/x64/jit_load_store_emitters.cpp | 115 +++++++++--------- src/plugins/intel_cpu/src/nodes/eltwise.cpp | 14 ++- 3 files changed, 70 insertions(+), 62 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp index ca958355154a56..d8332c218b3822 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp @@ -55,6 +55,9 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { h->vfixupimmps(aux, in, table_val("selector"), 0); h->vpsrad(aux, aux, 16); h->vpmovdw(out, aux); + } else if 
(dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::cpu_isa_t::avx2_vnni_2)) { + Xmm out = Xmm(out_vec_idxs[0]); + h->vcvtneps2bf16(out, in, PreferredEncoding::VexEncoding); } else { // round_to_nearest_even emulation Vmm aux = Vmm(aux_vec_idxs[0]); Xmm out = Xmm(out_vec_idxs[0]); diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp index e63cef2e49b3f9..85a631ca0277cc 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp @@ -499,90 +499,87 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak auto ymm = Xbyak::Ymm(vmm.getIdx()); auto zmm = Xbyak::Zmm(vmm.getIdx()); - auto load_words_to_dword_base = [&]() { - load_bytes(xmm, reg, offset, load_size); - if (is_bf16) { - h->uni_vpmovzxwd(vmm, xmm); - h->uni_vpslld(vmm, vmm, 16); - } else if (is_f16) { - h->vcvtph2ps(ymm, xmm); - } else { - if (is_signed) - h->uni_vpmovsxwd(vmm, xmm); - else - h->uni_vpmovzxwd(vmm, xmm); - } - }; - // For load_size == 32/16/8, do load/extension in one go // including xmm/ymm tail block for ymm/zmm, so explicite xmm/ymm/zmm switch (load_size) { case 32: { - if (mayiuse(cpu::x64::avx512_core)) { - if (is_bf16) { - h->uni_vpmovzxwd(zmm, ptr[reg + offset]); - h->uni_vpslld(zmm, zmm, 16); - } else if (is_f16) { - h->vcvtph2ps(zmm, ptr[reg + offset]); - } else { - if (is_signed) - h->uni_vpmovsxwd(zmm, ptr[reg + offset]); - else - h->uni_vpmovzxwd(zmm, ptr[reg + offset]); - } - break; + // needed here? + if (!is_zmm) + IE_THROW() << "Load emitter in " << name_ + << " has unexpected number of values(32) to load to non-zmm in load_words_to_dword_extension."; + if (is_bf16) { + h->uni_vpmovzxwd(zmm, ptr[reg + offset]); + h->uni_vpslld(zmm, zmm, 16); + } else if (is_f16) { + h->vcvtph2ps(zmm, ptr[reg + offset]); } else { - load_words_to_dword_base(); + if (is_signed) + h->uni_vpmovsxwd(zmm, ptr[reg + offset]); + else + h->uni_vpmovzxwd(zmm, ptr[reg + offset]); } + break; } case 16: { if (is_bf16) { - h->uni_vpmovzxwd(ymm, ptr[reg + offset]); - h->uni_vpslld(ymm, ymm, 16); + h->uni_vpmovzxwd(ymm, ptr[reg + offset]); + h->uni_vpslld(ymm, ymm, 16); + } else if (is_f16) { - h->vcvtph2ps(ymm, ptr[reg + offset]); + h->vcvtph2ps(ymm, ptr[reg + offset]); } else { - if (is_signed) - h->uni_vpmovsxwd(ymm, ptr[reg + offset]); - else - h->uni_vpmovzxwd(ymm, ptr[reg + offset]); + if (is_signed) + h->uni_vpmovsxwd(ymm, ptr[reg + offset]); + else + h->uni_vpmovzxwd(ymm, ptr[reg + offset]); } break; } case 8: { if (is_bf16) { - h->uni_vpmovzxwd(xmm, ptr[reg + offset]); - h->uni_vpslld(xmm, xmm, 16); + h->uni_vpmovzxwd(xmm, ptr[reg + offset]); + h->uni_vpslld(xmm, xmm, 16); } else if (is_f16) { - h->vcvtph2ps(xmm, ptr[reg + offset]); + h->vcvtph2ps(xmm, ptr[reg + offset]); } else { - if (is_signed) - h->uni_vpmovsxwd(xmm, ptr[reg + offset]); - else - h->uni_vpmovzxwd(xmm, ptr[reg + offset]); + if (is_signed) + h->uni_vpmovsxwd(xmm, ptr[reg + offset]); + else + h->uni_vpmovzxwd(xmm, ptr[reg + offset]); } break; } default: { if (is_zmm && load_size > threshold_for_mask_emu_load) { - unsigned int mask = 1; - mask = (mask << (load_size / 2)) - mask; - h->mov(Reg32(aux_gpr_idxs[0]), mask); - h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); - if (is_bf16) { + unsigned int mask = 1; + mask = (mask << (load_size / 2)) - mask; + h->mov(Reg32(aux_gpr_idxs[0]), mask); + h->kmovw(k_mask, 
Reg32(aux_gpr_idxs[0])); + if (is_bf16) { + h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); + h->uni_vpslld(vmm, vmm, 16); + } else if (is_f16) { + h->vcvtph2ps(vmm | k_mask | T_z, ptr[reg + offset]); + } else { + if (is_signed) + h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]); + else h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); - h->uni_vpslld(vmm, vmm, 16); - } else if (is_f16) { - h->vcvtph2ps(vmm | k_mask | T_z, ptr[reg + offset]); - } else { - if (is_signed) - h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]); - else - h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); - } + } } else { - // xmm or ymm version - load_words_to_dword_base(); + // xmm or ymm version + load_bytes(xmm, reg, offset, load_size); + if (is_bf16) { + h->uni_vpmovzxwd(vmm, xmm); + h->uni_vpslld(vmm, vmm, 16); + } else if (is_f16) { + h->vcvtph2ps(ymm, xmm); + } else { + if (is_signed) + h->uni_vpmovsxwd(vmm, xmm); + else + h->uni_vpmovzxwd(vmm, xmm); + } } break; } diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index e02b1b422b059a..ebff119982cc5f 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -771,11 +771,19 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener uni_vmovss(xmm_src, op); break; case ov::element::bf16: - uni_vpinsrw(xmm_src, xmm_src, op, 0); - uni_vpslld(xmm_src, xmm_src, 16); + if (isa == x64::avx2_vnni_2) { + vbcstnebf162ps(xmm_src, op); + } else { + uni_vpinsrw(xmm_src, xmm_src, op, 0); + uni_vpslld(xmm_src, xmm_src, 16); + } break; case ov::element::f16: - vcvtph2ps(xmm_src, op); + if (isa == x64::avx2_vnni_2) { + vbcstnesh2ps(xmm_src, op); + } else { + vcvtph2ps(xmm_src, op); + } break; case ov::element::i16: uni_vpinsrw(xmm_src, xmm_src, op, 0); From 0b753ba2c66ff21254589df41fba27d2a6289871 Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Sat, 4 Nov 2023 20:10:05 -0700 Subject: [PATCH 10/17] fix simple_if testcase failed issue --- .../intel_cpu/src/transformations/transformation_pipeline.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index abef96a1197938..909453b8be3b72 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -283,7 +283,8 @@ void Transformations::PreLpt(const std::vector& defaultPrecis }; // @todo should we always convert to f32 regardless of hardware support, as it is done for f16? 
-    if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core))
+    if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) &&
+        !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2))
         map.insert({ov::element::bf16, ov::element::f32});
 #if defined(OV_CPU_ARM_ENABLE_FP16)
     if (inferencePrecision != ov::element::f16)

From eb9aa08152377ce4c272f6ace755816891ed4c2c Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Sat, 4 Nov 2023 20:16:41 -0700
Subject: [PATCH 11/17] remove gather Node WA, as the gpt-j-6b accuracy issue
 has been fixed

---
 src/plugins/intel_cpu/src/nodes/gather.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp
index 82acd63aab84a3..6a9949365ced87 100644
--- a/src/plugins/intel_cpu/src/nodes/gather.cpp
+++ b/src/plugins/intel_cpu/src/nodes/gather.cpp
@@ -179,12 +179,6 @@ void Gather::createPrimitive() {
     if (isInPlace()) {
         return;
     }
-    // W.A gather bf16/fp16 jit impl has ACC issue on avx2_vnni_2
-    Precision dataPrecision = getOriginalInputPrecisionAtPort(GATHER_DATA);
-    if (one_of(dataPrecision, Precision::BF16, Precision::FP16) &&
-        dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2))
-        return;
-
 #if defined(OPENVINO_ARCH_X86_64)
     uint64_t idxElPerVec = 1;
     if (!isDynamicNode()) {

From d1494a201236d653cd9626aa967c0dd192724b6d Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Tue, 28 Nov 2023 19:03:36 -0800
Subject: [PATCH 12/17] fix rebase conflict with new API

---
 .../src/emitters/plugin/x64/jit_load_store_emitters.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
index 85a631ca0277cc..08506acecf26dc 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
@@ -1211,7 +1211,7 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 &reg,
         }
     } else if (mayiuse(cpu::x64::avx2_vnni_2)) {
         // to avoid src vmm pollution
-        if (src_prc_ == Precision::FP32) {
+        if (src_prc_ == ov::element::f32) {
             xmm = Xmm(aux_vec_idxs[0]);
         }

From f864ce222cadf9a6fd551a682e58b2d735aad968 Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Tue, 28 Nov 2023 21:23:20 -0800
Subject: [PATCH 13/17] change default inferencePrecision from bf16 to f32 on
 avx2_vnni_2

---
 src/plugins/intel_cpu/src/config.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index a3a061891d6634..4882744939163c 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -327,7 +327,7 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
             if (modelType != ModelType::CNN)
                 inferencePrecision = ov::element::f16;
 #else
-            if (mayiuse(avx512_core_bf16) || mayiuse(avx2_vnni_2))
+            if (mayiuse(avx512_core_bf16))
                 inferencePrecision = ov::element::bf16;
 #endif
         } else {
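Patch 14 below drops the emulation table when avx2_vnni_2 provides a native vcvtneps2bf16. For reference, the round-to-nearest-even truncation that jit_uni_vcvtneps2bf16 emulates on older ISAs is equivalent to this scalar sketch (illustrative only; NaN handling is omitted and the helper name is not from the sources):

    #include <cstdint>
    #include <cstring>

    // f32 -> bf16: keep the upper 16 bits of the f32 bit pattern, rounding
    // on bit 15 and breaking ties toward an even result.
    inline std::uint16_t f32_to_bf16_rne(float f) {
        std::uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        const std::uint32_t lsb = (bits >> 16) & 1u;  // parity of the kept LSB
        bits += 0x7FFFu + lsb;                        // round half to even
        return static_cast<std::uint16_t>(bits >> 16);
    }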
From 39c3d87e370d40b820250492bf157d662bc17bc2 Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Tue, 12 Dec 2023 22:05:26 -0800
Subject: [PATCH 14/17] skip prepare_table() in jit_uni_vcvtneps2bf16 for
 avx2_vnni_2

---
 .../intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp
index d8332c218b3822..a803995b74df68 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp
@@ -13,7 +13,8 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter {
 public:
     jit_uni_vcvtneps2bf16(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                           ov::element::Type exec_prc = ov::element::bf16) : jit_emitter(host, host_isa, exec_prc) {
-        if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16))
+        if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16) &&
+            !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2))
             prepare_table();
     }

From d6400d160adabfe8c73dc21f101fcf74a2ab2f5b Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Mon, 25 Dec 2023 00:09:56 -0800
Subject: [PATCH 15/17] loosen the fp16 support limitation of
 jit_load_store_emitters to avx512_core and avx2

---
 .../plugin/x64/jit_load_store_emitters.cpp    | 138 +++++++++---------
 1 file changed, 65 insertions(+), 73 deletions(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
index 08506acecf26dc..55ddbe36547014 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp
@@ -474,10 +474,8 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak
     bool is_f16 = (prc == ov::element::f16);
     bool is_signed = prc.is_signed();

-    if (is_f16 && !mayiuse(cpu::x64::avx512_core_fp16) && !mayiuse(cpu::x64::avx2_vnni_2))
-        OPENVINO_THROW("Load emitter in ",
-                       name_,
-                       " only support fp16 on platform with avx512_core_fp16 or avx2_vnni_2.");
+    if (is_f16 && !mayiuse(cpu::x64::avx512_core) && !mayiuse(cpu::x64::avx2))
+        OPENVINO_THROW("Load emitter in ", name_, " only support fp16 on platform with avx512_core or avx2.");

     // Ensure extended double words fit inside Zmm (32/2(num) * 32 <= 512)
     // For Ymm register, load capacity is halved (16/2(num) * 32 <= 256)
@@ -502,90 +500,82 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak
     // For load_size == 32/16/8, do load/extension in one go
     // including xmm/ymm tail block for ymm/zmm, so explicit xmm/ymm/zmm
     switch (load_size) {
-    case 32: {
-        // needed here?
- if (!is_zmm) - IE_THROW() << "Load emitter in " << name_ - << " has unexpected number of values(32) to load to non-zmm in load_words_to_dword_extension."; - if (is_bf16) { - h->uni_vpmovzxwd(zmm, ptr[reg + offset]); - h->uni_vpslld(zmm, zmm, 16); - } else if (is_f16) { - h->vcvtph2ps(zmm, ptr[reg + offset]); - } else { - if (is_signed) - h->uni_vpmovsxwd(zmm, ptr[reg + offset]); - else + case 32: { + if (is_bf16) { h->uni_vpmovzxwd(zmm, ptr[reg + offset]); + h->uni_vpslld(zmm, zmm, 16); + } else if (is_f16) { + h->vcvtph2ps(zmm, ptr[reg + offset]); + } else { + if (is_signed) + h->uni_vpmovsxwd(zmm, ptr[reg + offset]); + else + h->uni_vpmovzxwd(zmm, ptr[reg + offset]); + } + break; } - break; - } - case 16: { - if (is_bf16) { - h->uni_vpmovzxwd(ymm, ptr[reg + offset]); - h->uni_vpslld(ymm, ymm, 16); - - } else if (is_f16) { - h->vcvtph2ps(ymm, ptr[reg + offset]); - } else { - if (is_signed) - h->uni_vpmovsxwd(ymm, ptr[reg + offset]); - else - h->uni_vpmovzxwd(ymm, ptr[reg + offset]); - } - break; - } - case 8: { - if (is_bf16) { - h->uni_vpmovzxwd(xmm, ptr[reg + offset]); - h->uni_vpslld(xmm, xmm, 16); - } else if (is_f16) { - h->vcvtph2ps(xmm, ptr[reg + offset]); - } else { - if (is_signed) - h->uni_vpmovsxwd(xmm, ptr[reg + offset]); - else - h->uni_vpmovzxwd(xmm, ptr[reg + offset]); - } - break; - } - default: { - if (is_zmm && load_size > threshold_for_mask_emu_load) { - unsigned int mask = 1; - mask = (mask << (load_size / 2)) - mask; - h->mov(Reg32(aux_gpr_idxs[0]), mask); - h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); + case 16: { if (is_bf16) { - h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); - h->uni_vpslld(vmm, vmm, 16); + h->uni_vpmovzxwd(ymm, ptr[reg + offset]); + h->uni_vpslld(ymm, ymm, 16); } else if (is_f16) { - h->vcvtph2ps(vmm | k_mask | T_z, ptr[reg + offset]); + h->vcvtph2ps(ymm, ptr[reg + offset]); } else { if (is_signed) - h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]); + h->uni_vpmovsxwd(ymm, ptr[reg + offset]); else - h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); + h->uni_vpmovzxwd(ymm, ptr[reg + offset]); } - } else { - // xmm or ymm version - load_bytes(xmm, reg, offset, load_size); + break; + } + case 8: { if (is_bf16) { - h->uni_vpmovzxwd(vmm, xmm); - h->uni_vpslld(vmm, vmm, 16); + h->uni_vpmovzxwd(xmm, ptr[reg + offset]); + h->uni_vpslld(xmm, xmm, 16); } else if (is_f16) { - h->vcvtph2ps(ymm, xmm); + h->vcvtph2ps(xmm, ptr[reg + offset]); } else { if (is_signed) - h->uni_vpmovsxwd(vmm, xmm); + h->uni_vpmovsxwd(xmm, ptr[reg + offset]); else + h->uni_vpmovzxwd(xmm, ptr[reg + offset]); + } + break; + } + default: { + if (is_zmm && load_size > threshold_for_mask_emu_load) { + unsigned int mask = 1; + mask = (mask << (load_size / 2)) - mask; + h->mov(Reg32(aux_gpr_idxs[0]), mask); + h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); + if (is_bf16) { + h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); + h->uni_vpslld(vmm, vmm, 16); + } else if (is_f16) { + h->vcvtph2ps(vmm | k_mask | T_z, ptr[reg + offset]); + } else { + if (is_signed) + h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]); + else + h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); + } + } else { + // xmm or ymm version + load_bytes(xmm, reg, offset, load_size); + if (is_bf16) { h->uni_vpmovzxwd(vmm, xmm); + h->uni_vpslld(vmm, vmm, 16); + } else if (is_f16) { + h->vcvtph2ps(ymm, xmm); + } else { + if (is_signed) + h->uni_vpmovsxwd(vmm, xmm); + else + h->uni_vpmovzxwd(vmm, xmm); + } } + break; } - break; - } } } @@ -1195,7 +1188,7 @@ void 
jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, store_bytes(reg, offset, store_num * 2); } } else if (is_f16) { - if (mayiuse(cpu::x64::avx512_core_fp16)) { + if (mayiuse(cpu::x64::avx512_core)) { // to avoid src vmm pollution if (src_prc_ == ov::element::f32) { // since avx512, zmm(fp32) => ymm(fp16) @@ -1209,7 +1202,7 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, data_idx = static_cast(ymm.getIdx()); store_bytes(reg, offset, store_num * 2); } - } else if (mayiuse(cpu::x64::avx2_vnni_2)) { + } else if (mayiuse(cpu::x64::avx2)) { // to avoid src vmm pollution if (src_prc_ == ov::element::f32) { xmm = Xmm(aux_vec_idxs[0]); @@ -1222,8 +1215,7 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, store_bytes(reg, offset, store_num * 2); } } else { - IE_THROW() << "Store emitter in " << name_ - << " only support fp16 on platform with avx512_core_fp16 or avx2_vnni_2."; + IE_THROW() << "Store emitter in " << name_ << " only support fp16 on platform with avx512_core or avx2."; } } else { switch (store_num) { From 6a94fe26b743e4559929ed58029239ed7a4d70eb Mon Sep 17 00:00:00 2001 From: liubo-intel Date: Wed, 3 Jan 2024 01:18:12 -0800 Subject: [PATCH 16/17] Apply suggestions from code review: remove 'ov::with_cpu_x86_avx2_vnni_2' api, fix store_num and OPENVINO_THROW issue, keep brgconv related types only for new platform priorities list, update WeightsDecompressionImpl of fullyconnected Node, replace mayiuse() by hasHardwareSupport() for Precision check --- src/inference/dev_api/ie_system_conf.h | 7 ------- src/inference/dev_api/openvino/runtime/system_conf.hpp | 7 ------- src/inference/src/system_conf.cpp | 5 ----- .../emitters/plugin/x64/jit_load_store_emitters.cpp | 8 ++++---- src/plugins/intel_cpu/src/node.cpp | 3 --- src/plugins/intel_cpu/src/nodes/conv.cpp | 10 +++++++++- src/plugins/intel_cpu/src/nodes/fullyconnected.cpp | 5 ++--- src/plugins/intel_cpu/src/nodes/interpolate.cpp | 3 +-- src/plugins/intel_cpu/src/nodes/mvn.cpp | 6 ++---- 9 files changed, 18 insertions(+), 36 deletions(-) diff --git a/src/inference/dev_api/ie_system_conf.h b/src/inference/dev_api/ie_system_conf.h index adc887c7284da6..c0d2d81704f432 100644 --- a/src/inference/dev_api/ie_system_conf.h +++ b/src/inference/dev_api/ie_system_conf.h @@ -109,13 +109,6 @@ using ov::with_cpu_x86_avx2; */ using ov::with_cpu_x86_avx2_vnni; -/** - * @brief Checks whether CPU supports AVX2_VNNI_2 capability - * @ingroup ie_dev_api_system_conf - * @return `True` is AVX2_VNNI_2 instructions are available, `false` otherwise - */ -using ov::with_cpu_x86_avx2_vnni_2; - /** * @brief Checks whether CPU supports AVX 512 capability * @ingroup ie_dev_api_system_conf diff --git a/src/inference/dev_api/openvino/runtime/system_conf.hpp b/src/inference/dev_api/openvino/runtime/system_conf.hpp index 72ecf9a61694ef..c3648a69118e6b 100644 --- a/src/inference/dev_api/openvino/runtime/system_conf.hpp +++ b/src/inference/dev_api/openvino/runtime/system_conf.hpp @@ -97,13 +97,6 @@ OPENVINO_RUNTIME_API bool with_cpu_x86_avx2(); */ OPENVINO_RUNTIME_API bool with_cpu_x86_avx2_vnni(); -/** - * @brief Checks whether CPU supports AVX2_VNNI_2 capability - * @ingroup ov_dev_api_system_conf - * @return `True` is AVX2_VNNI_2 instructions are available, `false` otherwise - */ -OPENVINO_RUNTIME_API bool with_cpu_x86_avx2_vnni_2(); - /** * @brief Checks whether CPU supports AVX 512 capability * @ingroup ov_dev_api_system_conf diff --git a/src/inference/src/system_conf.cpp 
b/src/inference/src/system_conf.cpp index 07278e2dde5fdb..68e6e36df4f051 100644 --- a/src/inference/src/system_conf.cpp +++ b/src/inference/src/system_conf.cpp @@ -60,11 +60,6 @@ bool with_cpu_x86_avx2_vnni() { return get_cpu_info().has(Xbyak::util::Cpu::tAVX2 | Xbyak::util::Cpu::tAVX_VNNI); } -bool with_cpu_x86_avx2_vnni_2() { - return with_cpu_x86_avx2_vnni() && - get_cpu_info().has(Xbyak::util::Cpu::tAVX_VNNI_INT8 | Xbyak::util::Cpu::tAVX_NE_CONVERT); -} - bool with_cpu_x86_avx512f() { return get_cpu_info().has(Xbyak::util::Cpu::tAVX512F); } diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp index 55ddbe36547014..d6e6d8f46fbf14 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp @@ -474,8 +474,8 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak bool is_f16 = (prc == ov::element::f16); bool is_signed = prc.is_signed(); - if (is_f16 && !mayiuse(cpu::x64::avx512_core) && !mayiuse(cpu::x64::avx2)) - OPENVINO_THROW("Load emitter in ", name_, " only support fp16 on platform with avx512_core or avx2."); + if (is_f16 && !mayiuse(cpu::x64::avx2)) + OPENVINO_THROW("Load emitter in ", name_, " only support fp16 on platform with avx2 or above."); // Ensure extended double words fit inside Zmm (32/2(num) * 32 <= 512) // For Ymm register, load capacity is halved (16/2(num) * 32 <= 128) @@ -1208,14 +1208,14 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, xmm = Xmm(aux_vec_idxs[0]); } h->vcvtps2ph(xmm, ymm, 0x4); - if (store_num == 16) { + if (store_num == 8) { h->uni_vmovdqu(ptr[reg + offset], xmm); } else { data_idx = static_cast(xmm.getIdx()); store_bytes(reg, offset, store_num * 2); } } else { - IE_THROW() << "Store emitter in " << name_ << " only support fp16 on platform with avx512_core or avx2."; + OPENVINO_THROW("Store emitter in ", name_, " only support fp16 on platform with avx512_core or avx2."); } } else { switch (store_num) { diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 2293c427bfdcb4..d9b5bfc490db8e 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -1019,9 +1019,6 @@ const std::vector& Node::getDefaultImplPriority() { impl_desc_type::jit_avx512_dw, impl_desc_type::jit_avx512_1x1, impl_desc_type::jit_avx512, - // [WA]default support after fully evaluate - // impl_desc_type::brgconv_avx2_1x1, - // impl_desc_type::brgconv_avx2, impl_desc_type::jit_avx2_dw, impl_desc_type::jit_avx2_1x1, impl_desc_type::jit_avx2, diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 71506791ae0c3d..6fe0b9175c27d8 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -330,7 +330,7 @@ ov::element::Type Convolution::fusedEltwisePrecision(const NodePtr& fusingNode) } const std::vector& Convolution::getDefaultImplPriority() { - static const std::vector priorities = { + static std::vector priorities = { impl_desc_type::unknown, impl_desc_type::dw_acl, impl_desc_type::winograd_acl, @@ -371,6 +371,14 @@ const std::vector& Convolution::getDefaultImplPriority() { impl_desc_type::ref, }; + priorities.erase(std::remove_if(priorities.begin(), + priorities.end(), + [](impl_desc_type type) { + return !isBrgConvAvailable() && (type == 
impl_desc_type::brgconv_avx2_1x1 ||
+                                                            type == impl_desc_type::brgconv_avx2);
+                                        }),
+                         priorities.end());
+
     return priorities;
 }

diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index 3a5ee5a96ce7c1..0d6aa56b9d5907 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -206,6 +206,11 @@ void FullyConnected::getSupportedDescriptors() {
         if (one_of(outputDataType , memory::data_type::u8, memory::data_type::s8)) {
             outputDataType = memory::data_type::bf16;
         }
-        // W.A. WeightsDecompression not supported on avx2_vnni_2
-        if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2) &&
-            weightsDataType == memory::data_type::u8) {
+        // TODO: Ticket CVS-122347 - support WeightsDecompression with bf16 inputDataType on avx2_vnni_2
+        if (useWeightsDecompressionImpl && !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16)) {
             inputDataType = outputDataType = memory::data_type::f32;
         }
     } else if (inputDataType == memory::data_type::f16) {
 #if defined(OV_CPU_WITH_ACL)
         // acl fc does not support precisions conversion
diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp
index 935c2a4d04367e..a10784492c28f9 100644
--- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp
+++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp
@@ -2024,9 +2024,8 @@ void Interpolate::initSupportedPrimitiveDescriptors() {
         inputPrecision = ov::element::f32;
     }

-    if ((inputPrecision == ov::element::bf16) && !mayiuse(avx512_core)) {
+    if (!hasHardwareSupport(inputPrecision))
         inputPrecision = ov::element::f32;
-    }

     // support input with rank<=3 only with float precision and planar layout.
     // Jit for avx2(gather is available) and ref for no-avx2 machine.
diff --git a/src/plugins/intel_cpu/src/nodes/mvn.cpp b/src/plugins/intel_cpu/src/nodes/mvn.cpp
index dbfd36296fb484..6c0e34963618fc 100644
--- a/src/plugins/intel_cpu/src/nodes/mvn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/mvn.cpp
@@ -1829,10 +1829,8 @@ void MVN::initSupportedPrimitiveDescriptors() {
     ov::element::Type inputPrecision = getOriginalInputPrecisionAtPort(0);
     ov::element::Type outputPrecision = getOriginalOutputPrecisionAtPort(0);

-    if (!mayiuse(avx512_core) && !mayiuse(avx2_vnni_2)) {
-        if (outputPrecision == ov::element::bf16)
-            outputPrecision = ov::element::f32;
-    }
+    if (!hasHardwareSupport(outputPrecision))
+        outputPrecision = ov::element::f32;

     if (!fusedWith.empty()) {
         outputPrecision = fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0);

From 104b6e27738b0227e6aea27ef4b22fa8cf7bf9f6 Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Wed, 3 Jan 2024 23:38:46 -0800
Subject: [PATCH 17/17] hasHardwareSupport replacements, Conv
 DefaultImplPriority erase condition for all brgconv

---
 src/plugins/intel_cpu/src/config.cpp                 | 7 ++++---
 src/plugins/intel_cpu/src/nodes/conv.cpp             | 3 +--
 src/plugins/intel_cpu/src/nodes/eltwise.cpp          | 2 +-
 .../src/transformations/transformation_pipeline.cpp  | 3 +--
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index 4882744939163c..ba5c54d6bfe219 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -12,6 +12,7 @@
 #include "openvino/runtime/internal_properties.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "utils/debug_capabilities.h"
+#include "utils/precision_support.h"

 #include
 #include

@@ -219,7 +220,7 @@ void Config::readProperties(const ov::AnyMap& prop,
const ModelType modelType) { ". Expected only true/false"); } if (enable) { - if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) { + if (hasHardwareSupport(ov::element::bf16)) { inferencePrecision = ov::element::bf16; } else { OPENVINO_THROW("Platform doesn't support BF16 format"); @@ -234,12 +235,12 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { auto const prec = val.as(); inferencePrecisionSetExplicitly = true; if (prec == ov::element::bf16) { - if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) { + if (hasHardwareSupport(ov::element::bf16)) { inferencePrecision = ov::element::bf16; } } else if (prec == ov::element::f16) { #if defined(OPENVINO_ARCH_X86_64) - if (mayiuse(avx512_core_fp16) || mayiuse(avx512_core_amx_fp16) || mayiuse(avx2_vnni_2)) { + if (hasHardwareSupport(ov::element::f16)) { inferencePrecision = ov::element::f16; } #elif defined(OV_CPU_ARM_ENABLE_FP16) diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 6fe0b9175c27d8..a1eb6f49e9900f 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -374,8 +374,7 @@ const std::vector& Convolution::getDefaultImplPriority() { priorities.erase(std::remove_if(priorities.begin(), priorities.end(), [](impl_desc_type type) { - return !isBrgConvAvailable() && (type == impl_desc_type::brgconv_avx2_1x1 || - type == impl_desc_type::brgconv_avx2); + return !isBrgConvAvailable() && (type & impl_desc_type::brgconv); }), priorities.end()); diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index ebff119982cc5f..4ed4174b750aad 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -2199,7 +2199,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { if (!fusedWith.empty()) { outputPrecision = fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0); } - if (!mayiuse(avx512_core) && !mayiuse(avx2_vnni_2)) { + if (!hasHardwareSupport(ov::element::bf16)) { bool hasBF16 = false; for (auto &inPrc : inputPrecisions) if (inPrc == ov::element::bf16) diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 909453b8be3b72..b98d7a8979701d 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -283,8 +283,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis }; // @todo should we always convert to f32 regardless of hardware support, as it is done for f16? - if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) && - !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) + if (!hasHardwareSupport(ov::element::bf16)) map.insert({ov::element::bf16, ov::element::f32}); #if defined(OV_CPU_ARM_ENABLE_FP16) if (inferencePrecision != ov::element::f16)
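As a closing reference, the word-to-dword widening that the reworked load emitter performs on the AVX2 path corresponds to the following intrinsics sketch (illustrative only, assuming AVX2 plus F16C; these helpers are not part of the patches):

    #include <immintrin.h>
    #include <cstdint>

    // fp16 widening: what `vcvtph2ps ymm, xmm` does for a full 8-element load.
    static __m256 load_f16x8_as_f32(const std::uint16_t* src) {
        const __m128i half = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
        return _mm256_cvtph_ps(half);
    }

    // bf16 widening: zero-extend each 16-bit word to a dword and shift it into
    // the high half, i.e. uni_vpmovzxwd followed by uni_vpslld(.., 16).
    static __m256 load_bf16x8_as_f32(const std::uint16_t* src) {
        const __m128i half = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
        __m256i widened = _mm256_cvtepu16_epi32(half);
        widened = _mm256_slli_epi32(widened, 16);
        return _mm256_castsi256_ps(widened);
    }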