From 35f2a0c37c914960e6a4535585bb241d91816a64 Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Wed, 25 Dec 2024 21:27:08 +0800 Subject: [PATCH] [CPU] Implement fp8 conversion (#27949) ### Details: - *Implement fp8 conversion with the following combinations:* _fp32<->f8e4m3, fp16<->f8e4m3, bf16<->f8e4m3_ _fp32<->f8e5m2, fp16<->f8e5m2, bf16<->f8e5m2_ ### Tickets: - *[CVS-156962](https://jira.devtools.intel.com/browse/CVS-156962)* --- src/frontends/onnx/tests/__init__.py | 2 +- .../intel_cpu/src/dnnl_extension_utils.cpp | 10 + .../src/nodes/common/cpu_convert.cpp | 312 +++++++++++++++++- src/plugins/intel_cpu/src/plugin.cpp | 2 + .../single_layer_tests/classes/conversion.cpp | 78 ++++- .../single_layer_tests/classes/conversion.hpp | 6 + .../instances/arm/conversion.cpp | 2 + .../instances/common/conversion.cpp | 62 ++++ .../instances/x64/conversion.cpp | 4 + .../single_layer_tests/conversion.cpp | 30 ++ .../skip_tests_config.cpp | 4 + 11 files changed, 492 insertions(+), 20 deletions(-) diff --git a/src/frontends/onnx/tests/__init__.py b/src/frontends/onnx/tests/__init__.py index ef8cebfa361e3f..fdf1295dfd1dbe 100644 --- a/src/frontends/onnx/tests/__init__.py +++ b/src/frontends/onnx/tests/__init__.py @@ -147,7 +147,7 @@ def xfail_test(reason="Mark the test as expected to fail", strict=True): skip_dynamic_model = pytest.mark.skip(reason="CPU plug-in can't load a model with dynamic output shapes via legacy API") # ONNX 1.14 -xfail_issue_119896 = xfail_test(reason="Unsupported element type: FLOAT8") +xfail_issue_119896 = xfail_test(reason="Unsupported element type: FLOAT8", strict=False) xfail_issue_119900 = xfail_test(reason="While validating ONNX node '': " "half_pixel_symmetric - this type of coordinate transformation mode " "is not supported. Choose one of the following modes: " diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp index 457f8368f734dd..1c5598b6d55e26 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp @@ -36,6 +36,8 @@ uint8_t DnnlExtensionUtils::sizeOfDataType(dnnl::memory::data_type dataType) { case dnnl::memory::data_type::s4: case dnnl::memory::data_type::u4: case dnnl::memory::data_type::f8_e8m0: + case dnnl::memory::data_type::f8_e4m3: + case dnnl::memory::data_type::f8_e5m2: case dnnl::memory::data_type::f4_e2m1: return 1; case dnnl::memory::data_type::undef: @@ -70,6 +72,10 @@ dnnl::memory::data_type DnnlExtensionUtils::ElementTypeToDataType(const ov::elem return memory::data_type::u4; case ov::element::f8e8m0: return memory::data_type::f8_e8m0; + case ov::element::f8e4m3: + return memory::data_type::f8_e4m3; + case ov::element::f8e5m2: + return memory::data_type::f8_e5m2; case ov::element::f4e2m1: return memory::data_type::f4_e2m1; case ov::element::undefined: @@ -106,6 +112,10 @@ ov::element::Type DnnlExtensionUtils::DataTypeToElementType(const dnnl::memory:: return ov::element::u4; case memory::data_type::f8_e8m0: return ov::element::f8e8m0; + case memory::data_type::f8_e4m3: + return ov::element::f8e4m3; + case memory::data_type::f8_e5m2: + return ov::element::f8e5m2; case memory::data_type::f4_e2m1: return ov::element::f4e2m1; case memory::data_type::undef: diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp index 0c8cddd905dc2e..f6aabe376d6eec 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp @@ -9,6 +9,7 @@ #include "utils/bfloat16.hpp" #if defined(OPENVINO_ARCH_X86_64) +# include "cpu/x64/jit_avx512_core_fp8cvt.hpp" # include "nodes/kernels/x64/jit_kernel.hpp" #else # include "cpu_memory.h" @@ -27,6 +28,18 @@ using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu::x64; using namespace Xbyak; +enum f8_type { none, f8e4m3, f8e5m2 }; + +template +f8_type get_f8_type() { + if (std::is_same::value || std::is_same::value) { + return f8_type::f8e4m3; + } else if (std::is_same::value || std::is_same::value) { + return f8_type::f8e5m2; + } + return f8_type::none; +} + template void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst); @@ -50,12 +63,14 @@ void convert_vec(jit_generator& gen, const RegExp& src, cons gen.movdqu(gen.xword[dst], f16vec); } +template class jit_convert_array : public jit_kernel { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_convert_array) void generate() override { - constexpr size_t vlen = 8u; - constexpr size_t vlen_log2 = 3; + bool is_fp8 = f8_e4m3_emu_ || f8_e5m2_emu_; + size_t vlen = is_fp8 ? 16u : 8u; + size_t vlen_log2 = is_fp8 ? 4 : 3; preamble(); @@ -84,17 +99,24 @@ class jit_convert_array : public jit_kernel { auto tail_size = var(); tail_size = size; - tail_size <<= static_cast(std::logb(_src_size)) - 1; - copy(tmp.pointer(), src, tail_size); + tail_size <<= static_cast(std::logb(_src_size)); + copy(tmp.pointer(), src, tail_size); _convert_vec(*this, tmp.pointer(), tmp.pointer()); tail_size = size; - tail_size <<= static_cast(std::logb(_dst_size)) - 1; - copy(dst, tmp.pointer(), tail_size); + tail_size <<= static_cast(std::logb(_dst_size)); + copy(dst, tmp.pointer(), tail_size); }); postamble(); + + if (f8_e4m3_emu_) + f8_e4m3_emu_->prepare_table(); + if (f8_e5m2_emu_) + f8_e5m2_emu_->prepare_table(); + if (uni_vcvtneps2bf16_) + uni_vcvtneps2bf16_->emit_data(); } public: @@ -108,16 +130,37 @@ class jit_convert_array : public jit_kernel { typedef void (*convert_vec_t)(jit_generator&, const RegExp&, const RegExp&); - jit_convert_array(convert_vec_t convert_vec, size_t src_size, size_t dst_size) + jit_convert_array(convert_vec_t convert_vec) : jit_kernel(jit_name()), _convert_vec(convert_vec), - _src_size(src_size), - _dst_size(dst_size) {} + _src_size(sizeof(src_t)), + _dst_size(sizeof(dst_t)) { + const auto type = get_f8_type(); + if (type == f8_type::f8e4m3) { + f8_e4m3_emu_ = std::make_shared(this, + fp8_emu_reserv_1_, + fp8_emu_reserv_2_, + fp8_emu_reserv_3_, + fp8_emu_reserv_4_, + fp8_emu_reserv_5_, + fp8_emu_scratch_); + } else if (type == f8_type::f8e5m2) { + f8_e5m2_emu_ = std::make_shared(this, + fp8_emu_reserv_1_, + fp8_emu_reserv_2_, + fp8_emu_reserv_3_, + fp8_emu_kmask_aux_, + fp8_emu_scratch_); + } + const bool is_dst_bf16 = std::is_same::value; + if (is_dst_bf16 && mayiuse(cpu_isa_t::avx512_core)) { + uni_vcvtneps2bf16_ = std::make_shared(this, cpu_isa_t::avx512_core); + } + } - template static fn_t get() { if (mayiuse(cpu_isa_t::avx2) && dnnl::impl::cpu::x64::cpu().has(Xbyak::util::Cpu::tF16C)) { - static jit_convert_array converter(convert_vec, sizeof(src_t), sizeof(dst_t)); + static jit_convert_array converter(convert_vec); auto& generator = static_cast(converter); generator.create_kernel(); return (fn_t)generator.jit_ker(); @@ -125,16 +168,192 @@ class jit_convert_array : public jit_kernel { return nullptr; } + std::shared_ptr get_f8_e4m3_emu() const { + return f8_e4m3_emu_; + } + + std::shared_ptr get_f8_e5m2_emu() const { + return f8_e5m2_emu_; + } + + std::shared_ptr get_uni_vcvtneps2bf16() const { + return uni_vcvtneps2bf16_; + } + private: convert_vec_t _convert_vec; size_t _src_size; size_t _dst_size; + + std::shared_ptr f8_e4m3_emu_; + std::shared_ptr f8_e5m2_emu_; + std::shared_ptr uni_vcvtneps2bf16_; + + const Reg64 fp8_emu_scratch_ = rax; + const Zmm fp8_emu_reserv_1_ = Zmm(9); + const Zmm fp8_emu_reserv_2_ = Zmm(10); + const Zmm fp8_emu_reserv_3_ = Zmm(11); + const Zmm fp8_emu_reserv_4_ = Zmm(12); + const Zmm fp8_emu_reserv_5_ = Zmm(13); + const Opmask fp8_emu_kmask_aux_ = Opmask(1); }; +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovups(f32vec, gen.zword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f32_to_f8(f8vec, f32vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f8_to_f32(f32vec, f8vec); + gen.vmovups(gen.zword[dst], f32vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f16vec, gen.yword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f16_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f8_to_f16(f16vec, f8vec); + gen.vmovdqu(gen.yword[dst], f16vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vpmovzxwd(f16vec, gen.yword[src]); + gen.vpslld(f16vec, f16vec, 16); + cvt.get_f8_e4m3_emu()->vcvt_f32_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e4m3_emu()->vcvt_f8_to_f32(f32vec, f8vec); + cvt.get_uni_vcvtneps2bf16()->emit_code({static_cast(f32vec.getIdx())}, + {static_cast(f16vec.getIdx())}); + gen.vmovdqu(gen.yword[dst], f16vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovups(f32vec, gen.zword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f32_to_f8(f8vec, f32vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f8_to_f32(f32vec, f8vec); + gen.vmovups(gen.zword[dst], f32vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f16vec, gen.yword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f16_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f8_to_f16(f16vec, f8vec); + gen.vmovdqu(gen.yword[dst], f16vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vpmovzxwd(f16vec, gen.yword[src]); + gen.vpslld(f16vec, f16vec, 16); + cvt.get_f8_e5m2_emu()->vcvt_f32_to_f8(f8vec, f16vec); + gen.vmovdqu(gen.xword[dst], f8vec); +} + +template <> +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f8vec = gen.xmm3; + auto const& f16vec = gen.ymm4; + auto const& f32vec = gen.zmm4; + + auto& cvt = dynamic_cast&>(gen); + + gen.vmovdqu(f8vec, gen.xword[src]); + cvt.get_f8_e5m2_emu()->vcvt_f8_to_f32(f32vec, f8vec); + cvt.get_uni_vcvtneps2bf16()->emit_code({static_cast(f32vec.getIdx())}, + {static_cast(f16vec.getIdx())}); + gen.vmovdqu(gen.yword[dst], f16vec); +} + template void jit_convert(const TI* arg, TO* out, size_t count) { - using jit_impl = jit_convert_array; - static auto converter = jit_impl::get(); + using jit_impl = jit_convert_array; + static auto converter = jit_impl::get(); if (converter) { typename jit_impl::args_t args = {arg, out, count}; @@ -185,6 +404,12 @@ const std::tuple& Range::fit(const ov::element::Type& prec) { if (prec.is_real()) { double lbound, ubound; switch (prec) { + case ov::element::f8e4m3: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + case ov::element::f8e5m2: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); case ov::element::bf16: lbound = static_cast(std::numeric_limits::lowest()); ubound = static_cast(std::numeric_limits::max()); @@ -293,6 +518,18 @@ struct ConvertPrecision> { src_t lbound, ubound; std::tie(lbound, ubound) = ctx.range(); + // Align with the behavior of ngraph ref and jit implementation. Conversion from f8e4m3-inf + // to float should output float-inf instead of f8e4m3-max. Proper handling of special values + // (nan, inf, overflow) has already been assured by the conversion process. + if (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value) { + parallel_for(ctx.size, [&](size_t i) { + dst[i] = static_cast(src[i]); + }); + ctx.converted = true; + return; + } + if (std::is_integral::value || ctx.interimPrc.is_real() || std::is_integral::value) { parallel_for(ctx.size, [&](size_t i) { dst[i] = static_cast(std::max(std::min(src[i], ubound), lbound)); @@ -492,6 +729,12 @@ struct ConvertPrecision> { PrecisionInfo::value_type, \ PrecisionInfo::value_type) +#define INTEL_CPU_CVT_FP8_LIST \ + INTEL_CPU_CVT(f32, f8e4m3), INTEL_CPU_CVT(f16, f8e4m3), INTEL_CPU_CVT(bf16, f8e4m3), INTEL_CPU_CVT(f8e4m3, f32), \ + INTEL_CPU_CVT(f8e4m3, f16), INTEL_CPU_CVT(f8e4m3, bf16), INTEL_CPU_CVT(f32, f8e5m2), \ + INTEL_CPU_CVT(f16, f8e5m2), INTEL_CPU_CVT(bf16, f8e5m2), INTEL_CPU_CVT(f8e5m2, f32), \ + INTEL_CPU_CVT(f8e5m2, f16), INTEL_CPU_CVT(f8e5m2, bf16) + #define INTEL_CPU_CVT_LIST \ INTEL_CPU_CVT(u8, i8), INTEL_CPU_CVT(u8, u16), INTEL_CPU_CVT(u8, i16), INTEL_CPU_CVT(u8, u32), \ INTEL_CPU_CVT(u8, i32), INTEL_CPU_CVT(u8, u64), INTEL_CPU_CVT(u8, i64), INTEL_CPU_CVT(u8, f32), \ @@ -535,7 +778,8 @@ struct ConvertPrecision> { INTEL_CPU_CVT(boolean, f16), INTEL_CPU_CVT(boolean, bf16), INTEL_CPU_CVT(boolean, f64), INTEL_CPU_CVT(u8, u8), \ INTEL_CPU_CVT(i8, i8), INTEL_CPU_CVT(u16, u16), INTEL_CPU_CVT(i16, i16), INTEL_CPU_CVT(u32, u32), \ INTEL_CPU_CVT(i32, i32), INTEL_CPU_CVT(u64, u64), INTEL_CPU_CVT(i64, i64), INTEL_CPU_CVT(f32, f32), \ - INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean) + INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean), \ + INTEL_CPU_CVT_FP8_LIST #define INTEL_CPU_CVT_FROM_BIN_LIST \ INTEL_CPU_CVT(u1, f32), INTEL_CPU_CVT(u1, f16), INTEL_CPU_CVT(u1, bf16), INTEL_CPU_CVT(u1, f64), \ @@ -667,6 +911,35 @@ struct ConvertFromByteFPPrecision> { } }; +#if defined(OPENVINO_ARCH_X86_64) +struct ConvertFP8Context { + const void* srcPtr; + void* dstPtr; + size_t size; + bool converted; +}; + +template +struct ConvertFP8Precision; + +template +struct ConvertFP8Precision> { + void operator()(ConvertFP8Context& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); + constexpr size_t batch = 64; + const size_t iterations = ov::intel_cpu::div_up(ctx.size, batch); + parallel_for(iterations, [&](size_t i) { + const size_t offset = i * batch; + const size_t current_batch_size = std::min(ctx.size - offset, batch); + jit_convert(src + offset, dst + offset, current_batch_size); + }); + + ctx.converted = true; + } +}; +#endif + void cpu_convert(const void* srcPtr, void* dstPtr, ov::element::Type srcPrc, @@ -728,7 +1001,7 @@ void cpu_convert(const void* srcPtr, OV_SWITCH(intel_cpu, ConvertFrom4BitPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_4BIT_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); - } else if (srcPrc.bitwidth() == 8u && srcPrc.is_real()) { + } else if (srcPrc == ov::element::f8e8m0) { ConvertFromByteFPContext ctx{srcPrc, srcPtr, dstPtr, size, false}; OV_SWITCH(intel_cpu, ConvertFromByteFPPrecision, @@ -737,6 +1010,15 @@ void cpu_convert(const void* srcPtr, INTEL_CPU_CVT_FROM_BYTE_FP_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); +#if defined(OPENVINO_ARCH_X86_64) + } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_fp16) && + (one_of(srcPrc, ov::element::f8e4m3, ov::element::f8e5m2) || + one_of(dstPrc, ov::element::f8e4m3, ov::element::f8e5m2))) { + ConvertFP8Context ctx{srcPtr, dstPtr, size, false}; + OV_SWITCH(intel_cpu, ConvertFP8Precision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FP8_LIST); + if (!ctx.converted) + OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); +#endif } else { ConvertContext ctx{srcPtr, dstPtr, size, interimPrc, dstPrc, false}; OV_SWITCH(intel_cpu, ConvertPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_LIST); diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index db55c728df725e..b3c2aa0b298a5a 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -218,6 +218,8 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< ov::element::Type_t::i4, ov::element::Type_t::u8, ov::element::Type_t::i8, + ov::element::Type_t::f8e4m3, + ov::element::Type_t::f8e5m2, ov::element::Type_t::u16, ov::element::Type_t::i16, ov::element::Type_t::u32, diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp index 4989fb3a0f04b7..a3c1f9ef7d3544 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp @@ -16,11 +16,45 @@ using namespace CPUTestUtils; namespace ov { namespace test { +static std::string special_value_to_string(const ov::test::SpecialValue& value) { + if (value == SpecialValue::none) { + return "none"; + } else if (value == SpecialValue::nan) { + return "nan"; + } else if (value == SpecialValue::inf) { + return "inf"; + } else if (value == SpecialValue::overflow) { + return "overflow"; + } + return "unknown"; +} + +template +static T set_special_value(T& value, const ov::test::SpecialValue& special_value) { + if (special_value == ov::test::SpecialValue::nan) { + value = NAN; + } else if (special_value == ov::test::SpecialValue::inf) { + value = INFINITY; + } else if (special_value == ov::test::SpecialValue::overflow) { + value = value + std::numeric_limits::max(); + } + return value; +} + +template +static void modify_value(ov::Tensor& tensor, const ov::test::SpecialValue& special_value) { + T* dataPtr = static_cast(tensor.data()); + for (size_t i = 0; i < tensor.get_size(); i++) { + set_special_value(dataPtr[i], special_value); + } +} + std::string ConvertCPULayerTest::getTestCaseName(testing::TestParamInfo obj) { InputShape inputShape; ov::element::Type inPrc, outPrc; + ov::test::SpecialValue special_value; CPUSpecificParams cpuParams; - std::tie(inputShape, inPrc, outPrc, cpuParams) = obj.param; + std::tie(inputShape, inPrc, outPrc, special_value, cpuParams) = obj.param; std::ostringstream result; @@ -30,6 +64,7 @@ std::string ConvertCPULayerTest::getTestCaseName(testing::TestParamInfo(inPrc, shape)); @@ -101,6 +146,31 @@ void ConvertCPULayerTest::SetUp() { function = makeNgraphFunction(inPrc, params, conversion, "ConversionCPU"); } +void ConvertCPULayerTest::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + const auto& funcInputs = function->inputs(); + for (size_t i = 0; i < funcInputs.size(); ++i) { + const auto& funcInput = funcInputs[i]; + ov::Tensor tensor = + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]); + if (special_value != ov::test::SpecialValue::none) { + if (inPrc == ov::element::f32) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::f16) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::bf16) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::f8e4m3) { + modify_value(tensor, special_value); + } else if (inPrc == ov::element::f8e5m2) { + modify_value(tensor, special_value); + } + } + + inputs.insert({funcInput.get_node_shared_ptr(), tensor}); + } +} + void ConvertCPULayerTest::validate_out_prc() const { if (outPrc == ov::element::boolean) FAIL() << "ConvertCPULayerTest supports only non boolean output prc"; diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp index a53f56f873151c..a4f4e0fc56c238 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp @@ -13,9 +13,12 @@ using namespace CPUTestUtils; namespace ov { namespace test { +enum SpecialValue { none, nan, inf, overflow }; + using convertLayerTestParamsSet = std::tuple; class ConvertCPULayerTest : public testing::WithParamInterface, @@ -25,9 +28,12 @@ class ConvertCPULayerTest : public testing::WithParamInterface& targetInputStaticShapes) override; virtual void validate_out_prc() const; ov::element::Type inPrc, outPrc; +private: + ov::test::SpecialValue special_value; }; class ConvertToBooleanCPULayerTest : public ConvertCPULayerTest { diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp index 11e0440b2e3618..e5d87f5cb2f3dd 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/conversion.cpp @@ -16,6 +16,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_7D_Dynamic, ConvertCPULayerTe ::testing::ValuesIn(inShapes_7D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({}, {}, {}, {}))), ConvertCPULayerTest::getTestCaseName); @@ -24,6 +25,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_7D_Static, ConvertCPULayerTes ::testing::ValuesIn(inShapes_7D_static()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({}, {}, {}, {}))), ConvertCPULayerTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp index 59ca1065bf78d9..8181304bf95e7d 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp @@ -31,6 +31,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_4D_Dynamic, ConvertCPULayerTe ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(memForm4D_dynamic)), ConvertCPULayerTest::getTestCaseName); @@ -39,6 +40,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_4bit_Dynamic, ConvertCPULayer ::testing::Combine(::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn({ov::element::u4, ov::element::i4}), ::testing::ValuesIn({ov::element::f32, ov::element::bf16, ov::element::u8, ov::element::i8}), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {"ref"}))), ConvertCPULayerTest::getTestCaseName); @@ -52,9 +54,69 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_4D_Static, ConvertCPULayerTes ::testing::ValuesIn(inShapes_4D_static()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(memForm4D_static_common)), ConvertCPULayerTest::getTestCaseName); +const std::vector float_precisions = { + ov::element::f32, + ov::element::f16, + ov::element::bf16, +}; + +const std::vector f8_precisions = { + ov::element::f8e4m3, + ov::element::f8e5m2, +}; + +const std::vector specialValue = { + ov::test::SpecialValue::none, + ov::test::SpecialValue::nan, + ov::test::SpecialValue::inf, + ov::test::SpecialValue::overflow, +}; + +std::vector memForm4D_fp8 = { + CPUSpecificParams({nchw}, {nchw}, {}, expectedPrimitiveType()), + CPUSpecificParams({nhwc}, {nhwc}, {}, expectedPrimitiveType()), +}; + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_from_fp8_Static, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_static()), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_to_fp8_Static, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_static()), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_from_fp8_Dynamic, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_dynamic()), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_to_fp8_Dynamic, ConvertCPULayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D_dynamic()), + ::testing::ValuesIn(float_precisions), + ::testing::ValuesIn(f8_precisions), + ::testing::ValuesIn(specialValue), + ::testing::ValuesIn(memForm4D_fp8)), + ConvertCPULayerTest::getTestCaseName); + } // namespace Conversion } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp index 9c34d6220d4b2d..ab1e06639c5a3e 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/conversion.cpp @@ -23,6 +23,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_blocked_Dynamic, ConvertCPULa ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(memForm4D_dynamic)), ConvertCPULayerTest::getTestCaseName); @@ -44,6 +45,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_Blocked, ConvertCPULayerTest, ::testing::ValuesIn(inShapes_4D_blocked), ::testing::ValuesIn(precisions()), ::testing::ValuesIn(precisions()), + ::testing::Values(ov::test::SpecialValue::none), ::testing::ValuesIn(filterCPUSpecificParams(memForm4D_static_blocked))), ConvertCPULayerTest::getTestCaseName); @@ -52,6 +54,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_BOOL_Static, ConvertToBoolean ::testing::ValuesIn(inShapes_4D_static()), ::testing::ValuesIn(precisions_floating_point), ::testing::Values(ov::element::boolean), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {}))), ConvertToBooleanCPULayerTest::getTestCaseName); @@ -60,6 +63,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_BOOL_Dynamic, ConvertToBoolea ::testing::ValuesIn(inShapes_4D_dynamic()), ::testing::ValuesIn(precisions_floating_point), ::testing::Values(ov::element::boolean), + ::testing::Values(ov::test::SpecialValue::none), ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {}))), ConvertToBooleanCPULayerTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp index 9ff4d0b989fefa..903b8c083b1a1f 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/conversion.cpp @@ -32,6 +32,17 @@ const std::vector types = { ov::element::f64, }; +const std::vector floatTypes = { + ov::element::f32, + ov::element::f16, + ov::element::bf16, +}; + +const std::vector f8Types = { + ov::element::f8e4m3, + ov::element::f8e5m2, +}; + INSTANTIATE_TEST_SUITE_P(smoke_ConversionLayerTest, ConversionLayerTest, ::testing::Combine(::testing::ValuesIn(conversionOpTypes), @@ -49,4 +60,23 @@ INSTANTIATE_TEST_SUITE_P(smoke_ConversionToBooleanLayerTest, ::testing::Values(ov::element::boolean), ::testing::Values(ov::test::utils::DEVICE_CPU)), ConversionLayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConversionToF8LayerTest, + ConversionLayerTest, + ::testing::Combine(::testing::Values(conversionOpTypes[0]), + ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(shapes)), + ::testing::ValuesIn(floatTypes), + ::testing::ValuesIn(f8Types), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ConversionLayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_ConversionFromF8LayerTest, + ConversionLayerTest, + ::testing::Combine(::testing::Values(conversionOpTypes[0]), + ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(shapes)), + ::testing::ValuesIn(f8Types), + ::testing::ValuesIn(floatTypes), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ConversionLayerTest::getTestCaseName); + } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 7af707df602bfc..4c34b3fd2506ac 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -173,6 +173,8 @@ std::vector disabledTestPatterns() { R"(.*smoke_TopK/TopKLayerTest.Inference.*_k=21_.*_sort=value_modelType=f16_trgDev=CPU.*)", // Issue: 121812 R"(.*ConvertCPULayerTest.*outFmts=(nhwc|nChw8c|nChw16c).*)", + // Issue: MFDNN-12917. The oneDNN emitter of conversion from fp32 to fp8 has rounding issue. + R"(.*ConvertCPULayerTest.*(\[1.1.1080.1920\]|\(2.17.5.4\))_.*_inputPRC=f32_targetPRC=f8e4m3_.*)", // Need to generate sequence exactly in the i64 data type. Enable in scope of i64 enabling. R"(.*RandomUniformLayerTestCPU.*OutPrc=i64.*)", // Issue: 123815 (Tests are sensintive to available thread count on testing machines) @@ -529,6 +531,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)"); retVector.emplace_back(R"(.*ConcatSDPTest.*f16.*)"); + retVector.emplace_back(R"(.*ConvertCPULayerTest.*f16.*)"); } #elif defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_ARM) if (!ov::intel_cpu::hasHardwareSupport(ov::element::f16)) { @@ -536,6 +539,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); retVector.emplace_back(R"(.*Prc=f16.*)"); retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*HasShapeOf=1.*)"); + retVector.emplace_back(R"(.*ConvertCPULayerTest.*f16.*)"); } else { // Issue 117407 retVector.emplace_back(