From 5f4a445c8d588b37f978cd812661b439da2d63e8 Mon Sep 17 00:00:00 2001 From: Nikolay Shchegolev Date: Mon, 21 Oct 2024 19:41:24 +0400 Subject: [PATCH] [CPU] CACHE_DIR hash optimization (#25624) ### Details: - *JIT implementation of the hash function in the ConstantWriter* ### Tickets: - *127331* --- src/core/CMakeLists.txt | 5 +- .../dev_api/openvino/runtime/compute_hash.hpp | 20 + src/core/reference/CMakeLists.txt | 3 - .../reference/utils}/jit_generator.hpp | 70 +- .../reference/utils/registers_pool.hpp | 247 +++++ src/core/reference/src/op/convert.cpp | 6 +- .../src/{op => utils}/jit_generator.cpp | 19 +- .../reference/src/utils/registers_pool.cpp | 106 ++ src/core/src/pass/serialize.cpp | 105 +- src/core/src/runtime/compute_hash.cpp | 918 ++++++++++++++++++ 10 files changed, 1410 insertions(+), 89 deletions(-) create mode 100644 src/core/dev_api/openvino/runtime/compute_hash.hpp rename src/core/reference/{src/op => include/openvino/reference/utils}/jit_generator.hpp (59%) create mode 100644 src/core/reference/include/openvino/reference/utils/registers_pool.hpp rename src/core/reference/src/{op => utils}/jit_generator.cpp (91%) create mode 100644 src/core/reference/src/utils/registers_pool.cpp create mode 100644 src/core/src/runtime/compute_hash.cpp diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index bc42ffca8a3cf6..5ea4a21b705489 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -49,6 +49,9 @@ target_include_directories(openvino_core_dev INTERFACE $ $) +target_include_directories(openvino_core_dev SYSTEM INTERFACE + $:$>>) + target_link_libraries(openvino_core_dev INTERFACE openvino::itt openvino::util) set_target_properties(openvino_core_dev PROPERTIES EXPORT_NAME core::dev) @@ -81,7 +84,7 @@ if(ENABLE_SYSTEM_PUGIXML) set_target_properties(openvino_core_obj PROPERTIES NO_SYSTEM_FROM_IMPORTED ON) endif() -target_compile_definitions(openvino_core_obj PRIVATE IMPLEMENT_OPENVINO_API) +target_compile_definitions(openvino_core_obj PRIVATE IMPLEMENT_OPENVINO_API XBYAK_NO_OP_NAMES XBYAK64) ov_build_target_faster(openvino_core_obj UNITY diff --git a/src/core/dev_api/openvino/runtime/compute_hash.hpp b/src/core/dev_api/openvino/runtime/compute_hash.hpp new file mode 100644 index 00000000000000..47a90d589be4ee --- /dev/null +++ b/src/core/dev_api/openvino/runtime/compute_hash.hpp @@ -0,0 +1,20 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace runtime { + +/** + * @brief Computes the hash value for the input data + * @param src A pointer to the input data + * @param size The length of the input data in bytes + */ +size_t compute_hash(const void* src, size_t size); + +} // namespace runtime +} // namespace ov diff --git a/src/core/reference/CMakeLists.txt b/src/core/reference/CMakeLists.txt index f7874964233cf5..b62cf02f23f4f1 100644 --- a/src/core/reference/CMakeLists.txt +++ b/src/core/reference/CMakeLists.txt @@ -50,9 +50,6 @@ target_include_directories(${TARGET_NAME} PUBLIC $ $) -target_include_directories(${TARGET_NAME} SYSTEM PRIVATE - $:$>>) - find_package(Threads REQUIRED) target_link_libraries(${TARGET_NAME} PRIVATE Threads::Threads openvino::core::dev) diff --git a/src/core/reference/src/op/jit_generator.hpp b/src/core/reference/include/openvino/reference/utils/jit_generator.hpp similarity index 59% rename from src/core/reference/src/op/jit_generator.hpp rename to src/core/reference/include/openvino/reference/utils/jit_generator.hpp index b4b9cd7a60c23f..539f686020049c 100644 --- a/src/core/reference/src/op/jit_generator.hpp +++ b/src/core/reference/include/openvino/reference/utils/jit_generator.hpp @@ -15,7 +15,6 @@ namespace ov { namespace reference { namespace jit { -#ifdef XBYAK64 static const Xbyak::Operand::Code abi_save_gpr_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, @@ -23,28 +22,42 @@ static const Xbyak::Operand::Code abi_save_gpr_regs[] = { Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15, -# ifdef _WIN32 +#ifdef _WIN32 Xbyak::Operand::RDI, Xbyak::Operand::RSI, -# endif +#endif }; -# ifdef _WIN32 -# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX -# else -# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI -# endif -#endif // XBYAK64 +#ifdef _WIN32 +# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX +#else +# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI +#endif -class Generator : public Xbyak::CodeGenerator { - static constexpr size_t xmm_len = 16; +typedef enum { + isa_any, + sse42, + avx, + avx2, + avx512_common, + avx512_core, + avx512_core_vnni, + avx512_mic, + avx512_mic_4ops, + avx512_core_bf16, + avx512_vpopcnt, + fp16, + pclmulqdq, + vpclmulqdq +} cpu_isa_t; +class Generator : public Xbyak::CodeGenerator { #ifdef _WIN32 - static constexpr size_t xmm_to_preserve_start = 6; - static constexpr size_t xmm_to_preserve = 10; + static constexpr size_t xmm_to_preserve_start = 6llu; + static constexpr size_t xmm_to_preserve = 10llu; #else - static constexpr size_t xmm_to_preserve_start = 0; - static constexpr size_t xmm_to_preserve = 0; + static constexpr size_t xmm_to_preserve_start = 0lu; + static constexpr size_t xmm_to_preserve = 0lu; #endif static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); @@ -52,29 +65,19 @@ class Generator : public Xbyak::CodeGenerator { const Xbyak::Reg64 reg_EVEX_max_8b_offt; static constexpr int EVEX_max_8b_offt = 0x200; + size_t m_vlen = ymm_len; public: - const Xbyak::Reg64 param = abi_param1; + static constexpr size_t xmm_len = 16lu; + static constexpr size_t ymm_len = 32lu; + static constexpr size_t zmm_len = 64lu; - typedef enum { - isa_any, - sse42, - avx, - avx2, - avx512_common, - avx512_core, - avx512_core_vnni, - avx512_mic, - avx512_mic_4ops, - avx512_core_bf16, - avx512_vpopcnt, - fp16 - } cpu_isa_t; + const Xbyak::Reg64 param = abi_param1; static bool mayiuse(const cpu_isa_t cpu_isa); static bool is_x64(); - Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024); + Generator(cpu_isa_t isa = avx2, void* code_ptr = nullptr, size_t code_size = 16lu * 1024lu); void preamble(); void postamble(); @@ -85,7 +88,12 @@ class Generator : public Xbyak::CodeGenerator { template void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); + + size_t get_vlen() { + return m_vlen; + } }; + } // namespace jit } // namespace reference } // namespace ov diff --git a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp new file mode 100644 index 00000000000000..62dfe01ec4ef1d --- /dev/null +++ b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp @@ -0,0 +1,247 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "openvino/core/except.hpp" +#include "openvino/reference/utils/jit_generator.hpp" +namespace ov { +namespace reference { +namespace jit { + +class RegistersPool { +public: + using Ptr = std::shared_ptr; + using WeakPtr = std::weak_ptr; + static constexpr int any_idx = -1; + + template + class Reg { + friend class RegistersPool; + + public: + Reg() {} + Reg(const RegistersPool::Ptr& regPool) { + initialize(regPool); + } + Reg(const RegistersPool::Ptr& regPool, int requested_idx) { + initialize(regPool, requested_idx); + } + ~Reg() { + release(); + } + Reg& operator=(Reg&& other) noexcept { + release(); + reg = other.reg; + regPool = std::move(other.regPool); + return *this; + } + Reg(Reg&& other) noexcept : reg(other.reg), regPool(std::move(other.regPool)) {} + operator TReg&() { + ensure_valid(); + return reg; + } + operator const TReg&() const { + ensure_valid(); + return reg; + } + operator Xbyak::RegExp() const { + ensure_valid(); + return reg; + } + int getIdx() const { + ensure_valid(); + return reg.getIdx(); + } + friend Xbyak::RegExp operator+(const Reg& lhs, const Xbyak::RegExp& rhs) { + lhs.ensure_valid(); + return lhs.operator Xbyak::RegExp() + rhs; + } + void release() { + if (auto pool = regPool.lock()) { + pool->return_to_pool(reg); + regPool.reset(); + } + } + bool is_initialized() const { + return !regPool.expired(); + } + + private: + void ensure_valid() const { + if (!is_initialized()) { + OPENVINO_THROW("RegistersPool::Reg is either not initialized or released"); + } + } + + void initialize(const RegistersPool::Ptr& pool, int requested_idx = any_idx) { + release(); + reg = TReg(pool->template get_free(requested_idx)); + regPool = pool; + } + + private: + TReg reg; + RegistersPool::WeakPtr regPool; + }; + + virtual ~RegistersPool() { + check_unique_and_update(false); + } + + template + static Ptr create(std::initializer_list regsToExclude); + + static Ptr create(cpu_isa_t isa, std::initializer_list regsToExclude); + + template + size_t count_free() const { + if (std::is_base_of::value) { + return m_simd_set.count_unused(); + } else if (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value) { + return m_general_set.count_unused(); + } else if (std::is_same::value) { + return count_unused_opmask(); + } + } + +protected: + class PhysicalSet { + public: + PhysicalSet(int size) : m_is_free_index_vector(size, true) {} + + void set_as_used(size_t reg_idx); + + void set_as_unused(size_t reg_idx); + + size_t get_unused(size_t requested_idx); + + void exclude(Xbyak::Reg reg) { + m_is_free_index_vector.at(reg.getIdx()) = false; + } + + size_t count_unused() const; + + private: + size_t get_first_free_index(); + + private: + std::vector m_is_free_index_vector; + }; + + virtual int get_free_opmask(int requested_idx) { + OPENVINO_THROW("get_free_opmask: The Opmask is not supported in current instruction set"); + } + virtual void return_opmask_to_pool(int idx) { + OPENVINO_THROW("return_opmask_to_pool: The Opmask is not supported in current instruction set"); + } + virtual size_t count_unused_opmask() const { + OPENVINO_THROW("count_unused_opmask: The Opmask is not supported in current instruction set"); + } + + RegistersPool(int simd_registers_number); + + RegistersPool(std::initializer_list regsToExclude, int simd_registers_number); + +private: + template + int get_free(int requested_idx) { + if (std::is_base_of::value) { + auto idx = m_simd_set.get_unused(requested_idx); + m_simd_set.set_as_used(idx); + return static_cast(idx); + } else if (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value) { + auto idx = m_general_set.get_unused(requested_idx); + m_general_set.set_as_used(idx); + return static_cast(idx); + } else if (std::is_same::value) { + return get_free_opmask(requested_idx); + } + } + + template + void return_to_pool(const TReg& reg) { + if (std::is_base_of::value) { + m_simd_set.set_as_unused(reg.getIdx()); + } else if (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value) { + m_general_set.set_as_unused(reg.getIdx()); + } else if (std::is_same::value) { + return_opmask_to_pool(reg.getIdx()); + } + } + + void check_unique_and_update(bool isCtor = true); + + PhysicalSet m_general_set; + PhysicalSet m_simd_set; +}; + +template +class IsaRegistersPool : public RegistersPool { +public: + IsaRegistersPool(std::initializer_list regsToExclude) : RegistersPool(regsToExclude, 32) {} +}; + +template <> +class IsaRegistersPool : public RegistersPool { +public: + IsaRegistersPool() : RegistersPool(32) { + m_opmask_set.exclude( + Xbyak::Opmask(0)); // the Opmask(0) has special meaning for some instructions, like gather instruction + } + + IsaRegistersPool(std::initializer_list regsToExclude) : RegistersPool(regsToExclude, 32) { + for (auto& reg : regsToExclude) { + if (reg.isOPMASK()) { + m_opmask_set.exclude(reg); + } + } + } + + int get_free_opmask(int requested_idx) override { + auto idx = static_cast(m_opmask_set.get_unused(requested_idx)); + m_opmask_set.set_as_used(idx); + return idx; + } + + void return_opmask_to_pool(int idx) override { + m_opmask_set.set_as_unused(idx); + } + + size_t count_unused_opmask() const override { + return m_opmask_set.count_unused(); + } + +protected: + PhysicalSet m_opmask_set{8}; +}; + +template +RegistersPool::Ptr RegistersPool::create(std::initializer_list regsToExclude) { + return std::make_shared>(regsToExclude); +} + +inline RegistersPool::Ptr RegistersPool::create(cpu_isa_t isa, std::initializer_list regsToExclude) { +#define ISA_SWITCH_CASE(isa) \ + case isa: \ + return std::make_shared>(regsToExclude); + switch (isa) { + ISA_SWITCH_CASE(avx2) + ISA_SWITCH_CASE(avx512_core) + default: + OPENVINO_THROW("Invalid isa argument in RegistersPool::create(): ", isa); + } +#undef ISA_SWITCH_CASE +} + +} // namespace jit +} // namespace reference +} // namespace ov diff --git a/src/core/reference/src/op/convert.cpp b/src/core/reference/src/op/convert.cpp index 5054121b5615c0..034734afd8fd2a 100644 --- a/src/core/reference/src/op/convert.cpp +++ b/src/core/reference/src/op/convert.cpp @@ -7,7 +7,7 @@ #include "openvino/reference/utils/convert_util.hpp" #ifdef OV_CORE_USE_XBYAK_JIT -# include "jit_generator.hpp" +# include "openvino/reference/utils/jit_generator.hpp" #endif #ifdef OV_CORE_USE_INTRINSICS @@ -256,7 +256,7 @@ class jit_convert_array : public jit::Generator { template static fn_t get() { - if (is_x64() && mayiuse(avx) && mayiuse(avx2) && mayiuse(fp16)) { + if (is_x64() && mayiuse(jit::avx) && mayiuse(jit::avx2) && mayiuse(jit::fp16)) { static const jit_convert_array::context_t context{{sizeof(src_t), &jit::Generator::copy}, {sizeof(dst_t), &jit::Generator::copy}, jit_convert_vec, @@ -460,7 +460,7 @@ class jit_count_out_of_range : public jit::Generator { template static fn_t get() { - if (is_x64() && mayiuse(avx2)) { + if (is_x64() && mayiuse(jit::avx2)) { static const jit_count_out_of_range::context_t context{ {sizeof(data_t), &jit::Generator::copy}, jit_count_out_of_range_vec_prepare, diff --git a/src/core/reference/src/op/jit_generator.cpp b/src/core/reference/src/utils/jit_generator.cpp similarity index 91% rename from src/core/reference/src/op/jit_generator.cpp rename to src/core/reference/src/utils/jit_generator.cpp index 7d7da06d5da8d5..39dc31c0033f9f 100644 --- a/src/core/reference/src/op/jit_generator.cpp +++ b/src/core/reference/src/utils/jit_generator.cpp @@ -11,9 +11,10 @@ # endif # include -# include "jit_generator.hpp" +# include "openvino/core/except.hpp" # include "openvino/core/type/bfloat16.hpp" # include "openvino/core/type/float16.hpp" +# include "openvino/reference/utils/jit_generator.hpp" namespace ov { namespace reference { @@ -51,6 +52,10 @@ bool Generator::mayiuse(const cpu_isa_t cpu_isa) { return true && cpu.has(Cpu::tAVX512_VPOPCNTDQ); case fp16: return cpu.has(Cpu::tF16C); + case cpu_isa_t::pclmulqdq: + return cpu.has(Cpu::tPCLMULQDQ); + case cpu_isa_t::vpclmulqdq: + return cpu.has(Cpu::tVPCLMULQDQ); case isa_any: return true; } @@ -60,10 +65,18 @@ bool Generator::mayiuse(const cpu_isa_t cpu_isa) { bool Generator::is_x64() { return sizeof(void*) == 8; } -Generator::Generator(void* code_ptr, size_t code_size) +Generator::Generator(cpu_isa_t isa, void* code_ptr, size_t code_size) : Xbyak::CodeGenerator(code_size, code_ptr), size_of_abi_save_regs(num_abi_save_gpr_regs * rax.getBit() / 8 + xmm_to_preserve * xmm_len), - reg_EVEX_max_8b_offt(rbp) {} + reg_EVEX_max_8b_offt(rbp) { + if (isa == avx512_core) { + m_vlen = zmm_len; + } else if (isa == avx2) { + m_vlen = ymm_len; + } else { + OPENVINO_THROW("Unsupported isa: ", isa); + } +} void Generator::preamble() { if (xmm_to_preserve) { diff --git a/src/core/reference/src/utils/registers_pool.cpp b/src/core/reference/src/utils/registers_pool.cpp new file mode 100644 index 00000000000000..413fdcc3ed83cf --- /dev/null +++ b/src/core/reference/src/utils/registers_pool.cpp @@ -0,0 +1,106 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/core/visibility.hpp" + +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) +# include "openvino/reference/utils/registers_pool.hpp" + +namespace ov { +namespace reference { +namespace jit { + +RegistersPool::RegistersPool(int simd_registers_number) : m_general_set(16), m_simd_set(simd_registers_number) { + check_unique_and_update(); + m_general_set.exclude(Xbyak::Reg64(Xbyak::Operand::RSP)); + m_general_set.exclude(Xbyak::Reg64(Xbyak::Operand::RAX)); + m_general_set.exclude(Xbyak::Reg64(Xbyak::Operand::RCX)); + m_general_set.exclude(Xbyak::Reg64(Xbyak::Operand::RDI)); + m_general_set.exclude(Xbyak::Reg64(Xbyak::Operand::RBP)); +} + +RegistersPool::RegistersPool(std::initializer_list regsToExclude, int simd_registers_number) + : m_general_set(16), + m_simd_set(simd_registers_number) { + check_unique_and_update(); + for (auto& reg : regsToExclude) { + if (reg.isXMM() || reg.isYMM() || reg.isZMM()) { + m_simd_set.exclude(reg); + } else if (reg.isREG()) { + m_general_set.exclude(reg); + } + } + m_general_set.exclude(Xbyak::Reg64(Xbyak::Operand::RSP)); +} + +void RegistersPool::check_unique_and_update(bool is_ctor) { + static thread_local bool is_created = false; + if (is_ctor) { + if (is_created) { + OPENVINO_THROW("There should be only one instance of RegistersPool per thread"); + } + is_created = true; + } else { + is_created = false; + } +} + +void RegistersPool::PhysicalSet::set_as_used(size_t reg_idx) { + if (reg_idx >= m_is_free_index_vector.size()) { + OPENVINO_THROW("reg_idx is out of bounds in RegistersPool::PhysicalSet::set_as_used()"); + } + if (!m_is_free_index_vector[reg_idx]) { + OPENVINO_THROW("Inconsistency in RegistersPool::PhysicalSet::set_as_used()"); + } + m_is_free_index_vector[reg_idx] = false; +} + +void RegistersPool::PhysicalSet::set_as_unused(size_t reg_idx) { + if (reg_idx >= m_is_free_index_vector.size()) { + OPENVINO_THROW("reg_idx is out of bounds in RegistersPool::PhysicalSet::set_as_used()"); + } + if (m_is_free_index_vector[reg_idx]) { + OPENVINO_THROW("Inconsistency in RegistersPool::PhysicalSet::set_as_unused()"); + } + m_is_free_index_vector[reg_idx] = true; +} + +size_t RegistersPool::PhysicalSet::get_unused(size_t requested_idx) { + if (requested_idx == static_cast(any_idx)) { + return get_first_free_index(); + } else { + if (requested_idx >= m_is_free_index_vector.size()) { + OPENVINO_THROW("requested_idx is out of bounds in RegistersPool::PhysicalSet::get_unused()"); + } + if (!m_is_free_index_vector[requested_idx]) { + OPENVINO_THROW("The register with index #", requested_idx, " already used in the RegistersPool"); + } + return requested_idx; + } +} + +size_t RegistersPool::PhysicalSet::count_unused() const { + size_t count = 0; + for (const auto& isFree : m_is_free_index_vector) { + if (isFree) { + ++count; + } + } + return count; +} + +size_t RegistersPool::PhysicalSet::get_first_free_index() { + for (size_t c = 0; c < m_is_free_index_vector.size(); ++c) { + if (m_is_free_index_vector[c]) { + return c; + } + } + OPENVINO_THROW("Not enough registers in the RegistersPool"); +} + +} // namespace jit +} // namespace reference +} // namespace ov + +#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 diff --git a/src/core/src/pass/serialize.cpp b/src/core/src/pass/serialize.cpp index 409dcad066d7a6..3af6d2c4b5313f 100644 --- a/src/core/src/pass/serialize.cpp +++ b/src/core/src/pass/serialize.cpp @@ -23,6 +23,7 @@ #include "openvino/pass/constant_folding.hpp" #include "openvino/reference/convert.hpp" #include "openvino/runtime/aligned_buffer.hpp" +#include "openvino/runtime/compute_hash.hpp" #include "openvino/runtime/string_aligned_buffer.hpp" #include "openvino/util/file_util.hpp" #include "pugixml.hpp" @@ -30,6 +31,18 @@ #include "transformations/rt_info/disable_fp16_compression.hpp" #include "transformations/rt_info/primitives_priority_attribute.hpp" +namespace ov { +class OstreamHashWrapperBin final : public std::streambuf { + uint64_t m_res = 0lu; + +public: + uint64_t getResult() const { + return m_res; + } + std::streamsize xsputn(const char* s, std::streamsize n) override; +}; +} // namespace ov + namespace { // helpers template std::string join(const Container& c, const char* glue = ", ") { @@ -69,23 +82,6 @@ std::string translate_type_name(const std::string& name) { return name; } -size_t hash_combine(const void* v, int64_t size) { - constexpr auto cel_size = sizeof(size_t); - auto seed = static_cast(size); - const auto data = static_cast(v); - const auto d_end = std::next(data, size / cel_size); - // The constant value used as a magic number has been - // traditionally used e.g. in boost library's hash_combine. - // It happens to be derived from the golden ratio. - for (auto d = data; d != d_end; ++d) { - seed ^= *d + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - size_t last_bytes{0}; - std::memcpy(&last_bytes, d_end, size % cel_size); - seed ^= last_bytes + 0x9e3779b9 + (seed << 6) + (seed >> 2); - return seed; -} - class ConstantWriter { public: using FilePosition = int64_t; @@ -95,16 +91,18 @@ class ConstantWriter { ConstantWriter(std::ostream& bin_data, bool enable_compression = true) : m_binary_output(bin_data), m_enable_compression(enable_compression), - m_blob_offset(bin_data.tellp()) {} + m_blob_offset(bin_data.tellp()) { + m_write_hash_value = (dynamic_cast(bin_data.rdbuf())) ? true : false; + } FilePosition write(const char* ptr, size_t size, - size_t* new_size, + size_t& new_size, bool compress_to_fp16 = false, ov::element::Type src_type = ov::element::dynamic) { const FilePosition write_pos = m_binary_output.tellp(); const auto offset = write_pos - m_blob_offset; - *new_size = size; + new_size = size; if (!m_enable_compression) { if (!compress_to_fp16) { @@ -112,7 +110,7 @@ class ConstantWriter { } else { OPENVINO_ASSERT(size % src_type.size() == 0); auto fp16_buffer = compress_data_to_fp16(ptr, size, src_type, new_size); - m_binary_output.write(fp16_buffer.get(), *new_size); + m_binary_output.write(fp16_buffer.get(), new_size); } return offset; } else { @@ -132,18 +130,24 @@ class ConstantWriter { // the same hash for {2, 2} and {0, 128} arrays. // But even strong hashing algorithms sometimes give collisions. // Therefore we always have to compare values when finding a match in the hash multimap. - const HashValue hash = hash_combine(ptr_to_write, *new_size); + const HashValue hash = ov::runtime::compute_hash(ptr_to_write, new_size); + auto found = m_hash_to_file_positions.find(hash); // iterate over all matches of the key in the multimap while (found != m_hash_to_file_positions.end()) { - if (memcmp(ptr, found->second.second, size) == 0) + if (memcmp(ptr, found->second.second, size) == 0) { return found->second.first; + } found++; } // Since fp16_compressed data will be disposed at exit point and since we cannot reread it from the ostream, // we store pointer to the original uncompressed blob. m_hash_to_file_positions.insert({hash, {offset, static_cast(ptr)}}); - m_binary_output.write(ptr_to_write, *new_size); + if (m_write_hash_value) { + m_binary_output.write(reinterpret_cast(&hash), sizeof(uint64_t)); + } else { + m_binary_output.write(ptr_to_write, new_size); + } } return offset; } @@ -152,17 +156,17 @@ class ConstantWriter { static std::unique_ptr compress_data_to_fp16(const char* ptr, size_t size, ov::element::Type src_type, - size_t* compressed_size) { + size_t& compressed_size) { auto num_src_elements = size / src_type.size(); - *compressed_size = num_src_elements * ov::element::f16.size(); + compressed_size = num_src_elements * ov::element::f16.size(); if (src_type == ov::element::f32) { - auto new_ptr = std::unique_ptr(new char[*compressed_size]); + auto new_ptr = std::unique_ptr(new char[compressed_size]); auto dst_data = reinterpret_cast(new_ptr.get()); auto src_data = reinterpret_cast(ptr); ov::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, num_src_elements); return new_ptr; } else if (src_type == ov::element::f64) { - auto new_ptr = std::unique_ptr(new char[*compressed_size]); + auto new_ptr = std::unique_ptr(new char[compressed_size]); auto dst_data = reinterpret_cast(new_ptr.get()); auto src_data = reinterpret_cast(ptr); @@ -188,6 +192,7 @@ class ConstantWriter { ConstWritePositions m_hash_to_file_positions; std::ostream& m_binary_output; bool m_enable_compression; + bool m_write_hash_value = false; FilePosition m_blob_offset; // blob offset inside output stream }; @@ -531,7 +536,7 @@ class XmlSerializer : public ov::AttributeVisitor { int64_t offset = m_constant_write_handler.write(reinterpret_cast(header_ptr.get()), header_size, - &inter_size, + inter_size, m_compress_to_fp16, m_output_element_type); new_size += inter_size; @@ -554,7 +559,7 @@ class XmlSerializer : public ov::AttributeVisitor { m_constant_write_handler.write(raw_string_ptr, raw_string_size, - &inter_size, + inter_size, m_compress_to_fp16, m_output_element_type); new_size += inter_size; @@ -568,7 +573,7 @@ class XmlSerializer : public ov::AttributeVisitor { size_t new_size; int64_t offset = m_constant_write_handler.write(static_cast(a->get()->get_ptr()), size, - &new_size, + new_size, m_compress_to_fp16, m_output_element_type); @@ -1393,10 +1398,19 @@ bool pass::StreamSerialize::run_on_model(const std::shared_ptr& model /// -------- Hash calculation pass ------------- namespace { -template -static uint64_t hash_combine(uint64_t seed, const T& a) { - // Hash combine formula from boost - return seed ^ (std::hash()(a) + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +// Hash combine formula from boost for uint64_t. +inline uint64_t hash_combine(uint64_t h, uint64_t k) { + constexpr uint64_t m = 0xc6a4a7935bd1e995; + constexpr int r = 47; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + + return h + 0xe6546b64; } class OstreamHashWrapper final : public std::streambuf { @@ -1408,28 +1422,23 @@ class OstreamHashWrapper final : public std::streambuf { } std::streamsize xsputn(const char* s, std::streamsize n) override { - // Reinterpret data as uint32_t and accumulate in uint64_t to avoid overflow fluctuations in parallel_sum. - auto* int_sum = reinterpret_cast(s); - const uint64_t n32 = n / sizeof(uint32_t); - - m_res += parallel_sum(n32, uint64_t(0lu), [&](size_t k) -> uint32_t { - return int_sum[k]; - }); - - const uint64_t rest = n % sizeof(uint32_t); - for (uint64_t i = 0lu; i < rest; i++) { - m_res += s[n - rest + i]; - } + uint64_t h = ov::runtime::compute_hash(s, n); + m_res = hash_combine(m_res, h); return n; } }; } // namespace +std::streamsize OstreamHashWrapperBin::xsputn(const char* s, std::streamsize n) { + m_res = hash_combine(m_res, *reinterpret_cast(s)); + return n; +} + bool pass::Hash::run_on_model(const std::shared_ptr& model) { RUN_ON_MODEL_SCOPE(Hash); OstreamHashWrapper xmlHash; - OstreamHashWrapper binHash; + OstreamHashWrapperBin binHash; std::ostream xml(&xmlHash); std::ostream bin(&binHash); diff --git a/src/core/src/runtime/compute_hash.cpp b/src/core/src/runtime/compute_hash.cpp new file mode 100644 index 00000000000000..c1a5a40c8638de --- /dev/null +++ b/src/core/src/runtime/compute_hash.cpp @@ -0,0 +1,918 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// The CRC computation is used for x86. +// The calculations were taken from the article +// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel (December, 2009)". + +#include "openvino/runtime/compute_hash.hpp" + +#include +#include +#include + +#include "openvino/core/visibility.hpp" + +#if !defined(OS_CHROMEOS) && (defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)) +# define OV_CORE_USE_XBYAK_JIT +#endif + +#ifdef OV_CORE_USE_XBYAK_JIT +# include "openvino/core/parallel.hpp" +# include "openvino/reference/utils/registers_pool.hpp" +#endif // OV_CORE_USE_XBYAK_JIT + +namespace ov { +namespace runtime { + +#ifdef OV_CORE_USE_XBYAK_JIT + +using namespace ov::reference::jit; + +namespace jit { + +# define GET_OFF(field) offsetof(ComputeHashCallArgs, field) +# define getReg64() RegistersPool::Reg(m_registers_pool) +# define getVmm() RegistersPool::Reg(m_registers_pool) +# define getXmm() RegistersPool::Reg(m_registers_pool) + +enum KernelType { SINGLE_THREAD = 0, FIRST_THREAD, N_THREAD, FINAL_FOLD }; + +struct ComputeHashCompileParams { + KernelType type; +}; + +struct ComputeHashCallArgs { + const void* src_ptr = nullptr; + void* dst_ptr = nullptr; + const void* k_ptr = nullptr; + void* intermediate_ptr = nullptr; + uint64_t work_amount = 0lu; + uint64_t size = 0lu; + uint64_t threads_num = 1lu; +}; + +typedef void (*hash_kernel)(const ComputeHashCallArgs*); + +static const uint8_t SHUF_MASK[16] = {0b00001111, + 0b00001110, + 0b00001101, + 0b00001100, + 0b00001011, + 0b00001010, + 0b00001001, + 0b00001000, + 0b00000111, + 0b00000110, + 0b00000101, + 0b00000100, + 0b00000011, + 0b00000010, + 0b00000001, + 0b00000000}; + +constexpr uint64_t CRC_VAL = 0xffffffffffffffff; + +// POLYNOM(x) = 0x42F0E1EBA9EA3693 +constexpr uint64_t K_2 = 0x05f5c3c7eb52fab6; // x^(64*2) +constexpr uint64_t P_1 = 0x578d29d06cc4f872; // floor(x^128/P(x))-x^64 +constexpr uint64_t P_2 = 0x42f0e1eba9ea3693; // P(x)-x^64 +static const uint64_t K_PULL[] = { + K_2, // x^(64*2) + 0x4eb938a7d257740e, // x^(64*3) + 0x571bee0a227ef92b, // x^(64*4) + 0x44bef2a201b5200c, // x^(64*5) + 0x54819d8713758b2c, // x^(64*6) + 0x4a6b90073eb0af5a, // x^(64*7) + 0x5f6843ca540df020, // x^(64*8) + 0xddf4b6981205b83f, // x^(64*9) + 0x097c516e98bd2e73, // x^(64*10) + 0x0b76477b31e22e7b, // x^(64*11) + 0x9af04e1eff82d0dd, // x^(64*12) + 0x6e82e609297f8fe8, // x^(64*13) + 0xe464f4df5fb60ac1, // x^(64*14) + 0xb649c5b35a759cf2, // x^(64*15) + 0x05cf79dea9ac37d6, // x^(64*16) + 0x001067e571d7d5c2 // x^(64*17) +}; + +constexpr uint64_t K_2_3_OFF = 0lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_4_5_OFF = 1lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_6_7_OFF = 2lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_8_9_OFF = 3lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_10_11_OFF = 4lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_12_13_OFF = 5lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_14_15_OFF = 6lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_16_17_OFF = 7lu * 2lu * sizeof(uint64_t); + +class HashBase : public Generator { +protected: + void (*ker_fn)(const ComputeHashCallArgs*); + +public: + HashBase(cpu_isa_t isa) : Generator(isa) {} + + virtual void generate() = 0; + + void operator()(const ComputeHashCallArgs* args) { + ker_fn(args); + } + + virtual void create_kernel() { + generate(); + ker_fn = (decltype(ker_fn))getCode(); + OPENVINO_ASSERT(ker_fn, "[ CORE ] Could not generate kernel code."); + } +}; + +template +class ComputeHash : public HashBase { +public: + explicit ComputeHash(const ComputeHashCompileParams& jcp) : HashBase(isa), m_jcp(jcp) { + if (!mayiuse(cpu_isa_t::pclmulqdq)) { + OPENVINO_THROW( + "The current CPU does not support pclmulqdq instruction, which is required for the CRC algorithm."); + } + if (mayiuse(cpu_isa_t::vpclmulqdq)) { + is_vpclmulqdq = true; + } + } + + void generate() override { + m_registers_pool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0}); + + r64_src_ptr = getReg64(); + r64_dst_ptr = getReg64(); + r64_work_amount = getReg64(); + r64_k_ptr = getReg64(); + r64_aux = getReg64(); + v_k_2_3 = getVmm(); + v_shuf_mask = getVmm(); + auto v_dst = getVmm(); + + this->preamble(); + + initialize(v_dst); + bulk_fold(v_dst); + join(v_dst); + fold_to_128(v_dst); + fold_to_64(v_dst); + + this->postamble(); + m_registers_pool.reset(); + } + + static std::shared_ptr create(const ComputeHashCompileParams& params) { + auto kernel = std::make_shared(params); + OPENVINO_ASSERT(kernel, "[ CORE ] Could not create ComputeHash kernel."); + kernel->create_kernel(); + + return kernel; + } + +private: + using Vmm = typename std::conditional::type; + bool is_vpclmulqdq = false; + + ComputeHashCompileParams m_jcp; + RegistersPool::Ptr m_registers_pool; + + const Xbyak::Reg64 r64_params = abi_param1; + + RegistersPool::Reg r64_src_ptr; + RegistersPool::Reg r64_dst_ptr; + RegistersPool::Reg r64_work_amount; + RegistersPool::Reg r64_k_ptr; + RegistersPool::Reg r64_aux; + + // Vector registers + RegistersPool::Reg v_k_2_3; + RegistersPool::Reg v_shuf_mask; + + void initialize(const Vmm& v_dst); + + void bulk_fold(const Vmm& v_dst); + + void join(const Vmm& v_dst); + + void fold_to_128(const Vmm& v_dst); + + void fold_to_64(const Vmm& v_dst); + + void uni_vpxorq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src_0, const Xbyak::Xmm& v_src_1); + + void uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0); + + void uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0); + + void uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0); + + void partial_load(const Xbyak::Xmm& xmm_dst, const Xbyak::Address& src_addr, const Xbyak::Reg64& r64_load_num); + + void partial_load(const Xbyak::Ymm& ymm_dst, const Xbyak::Address& src_addr, const Xbyak::Reg64& r64_load_num); +}; + +template <> +void ComputeHash::uni_vpxorq(const Xbyak::Xmm& v_dst, + const Xbyak::Xmm& v_src_0, + const Xbyak::Xmm& v_src_1) { + vpxorq(v_dst, v_src_0, v_src_1); +} +template +void ComputeHash::uni_vpxorq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src_0, const Xbyak::Xmm& v_src_1) { + vpxor(v_dst, v_src_0, v_src_1); +} +template <> +void ComputeHash::uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0) { + vmovdqu64(v_dst, v_src_0); +} +template +void ComputeHash::uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0) { + vmovdqu(v_dst, v_src_0); +} +template <> +void ComputeHash::uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0) { + vmovdqu64(v_dst, v_src_0); +} +template +void ComputeHash::uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0) { + vmovdqu(v_dst, v_src_0); +} +template <> +void ComputeHash::uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0) { + vbroadcasti64x2(v_dst, v_src_0); +} +template +void ComputeHash::uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0) { + vbroadcasti128(v_dst, v_src_0); +} +template <> +void ComputeHash::partial_load(const Xbyak::Xmm& xmm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + Xbyak::Label l_mv_mask; + auto rOnes = getReg64(); + auto k_load_mask = RegistersPool::Reg(m_registers_pool); + + mov(rOnes, 0xFFFFFFFFFFFFFFFF); + cmp(r64_load_num, 0x3f); + jg(l_mv_mask); + + shlx(rOnes, rOnes, r64_load_num); + not_(rOnes); + + L(l_mv_mask); + kmovq(k_load_mask, rOnes); + + vmovdqu8(Vmm(xmm_dst.getIdx()) | k_load_mask | T_z, ptr[r64_src_ptr]); +} +template +void ComputeHash::partial_load(const Xbyak::Xmm& xmm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + Xbyak::Label l_partial, l_end; + + cmp(r64_load_num, xmm_len); + jl(l_partial, T_NEAR); + uni_vmovdqu64(xmm_dst, ptr[src_addr.getRegExp()]); + jmp(l_end, T_NEAR); + + L(l_partial); + { + uni_vpxorq(xmm_dst, xmm_dst, xmm_dst); + for (size_t j = 0lu; j < xmm_len - 1; j++) { + cmp(r64_load_num, static_cast(j)); + jle(l_end, T_NEAR); + pinsrb(xmm_dst, ptr[src_addr.getRegExp() + j], static_cast(j)); + } + } + + L(l_end); +} +template <> +void ComputeHash::partial_load(const Xbyak::Ymm& xmm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + partial_load(Xbyak::Xmm(xmm_dst.getIdx()), src_addr, r64_load_num); +} +template +void ComputeHash::partial_load(const Xbyak::Ymm& ymm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + Xbyak::Label l_xmm, l_partial, l_end; + auto xmm_dst = Xbyak::Xmm(ymm_dst.getIdx()); + + cmp(r64_load_num, ymm_len); + jl(l_xmm, T_NEAR); + uni_vmovdqu64(ymm_dst, ptr[src_addr.getRegExp()]); + jmp(l_end, T_NEAR); + + L(l_xmm); + uni_vpxorq(ymm_dst, ymm_dst, ymm_dst); + cmp(r64_load_num, xmm_len); + jl(l_partial, T_NEAR); + uni_vmovdqu64(xmm_dst, ptr[src_addr.getRegExp()]); + je(l_end, T_NEAR); + + { + Xbyak::Label l_rest_loop, l_perm; + + vperm2i128(ymm_dst, ymm_dst, ymm_dst, 0x1); + for (size_t j = 0lu; j < xmm_len - 1lu; j++) { + cmp(r64_load_num, static_cast(xmm_len + j)); + jle(l_perm, T_NEAR); + pinsrb(xmm_dst, ptr[src_addr.getRegExp() + xmm_len + j], static_cast(j)); + } + L(l_perm); + vperm2i128(ymm_dst, ymm_dst, ymm_dst, 0x1); + } + jmp(l_end, T_NEAR); + + L(l_partial); + { + for (size_t j = 0lu; j < xmm_len - 1; j++) { + cmp(r64_load_num, static_cast(j)); + jle(l_end, T_NEAR); + pinsrb(xmm_dst, ptr[src_addr.getRegExp() + j], static_cast(j)); + } + } + + L(l_end); +} + +template +void ComputeHash::initialize(const Vmm& v_dst) { + mov(r64_src_ptr, ptr[r64_params + GET_OFF(src_ptr)]); + mov(r64_dst_ptr, ptr[r64_params + GET_OFF(dst_ptr)]); + mov(r64_k_ptr, ptr[r64_params + GET_OFF(k_ptr)]); + mov(r64_work_amount, ptr[r64_params + GET_OFF(work_amount)]); + + uni_vbroadcasti64x2(v_k_2_3, ptr[r64_k_ptr + K_2_3_OFF]); + + mov(r64_aux, reinterpret_cast(SHUF_MASK)); + uni_vbroadcasti64x2(v_shuf_mask, ptr[r64_aux]); + + if (m_jcp.type == SINGLE_THREAD || m_jcp.type == FIRST_THREAD) { + auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_aux = getXmm(); + + // Initial CRC + mov(r64_aux, ptr[r64_params + GET_OFF(size)]); + vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x0); + mov(r64_aux, CRC_VAL); + vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x1); + + // First xor with source. + partial_load(v_dst, ptr[r64_src_ptr], r64_work_amount); + vpshufb(v_dst, v_dst, v_shuf_mask); + pxor(xmm_dst, xmm_aux); // The SSE version is used to avoid zeroing out the rest of the Vmm. + if (m_jcp.type == SINGLE_THREAD) { + add(r64_src_ptr, xmm_len); + } + } else if (m_jcp.type == N_THREAD) { + uni_vmovdqu64(v_dst, ptr[r64_src_ptr]); + vpshufb(v_dst, v_dst, v_shuf_mask); + } + if (m_jcp.type == SINGLE_THREAD || m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + sub(r64_work_amount, xmm_len); + } +} + +template <> +void ComputeHash::bulk_fold(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FIRST_THREAD && m_jcp.type != N_THREAD) { + return; + } + Xbyak::Label l_fold_loop, l_end; + cmp(r64_work_amount, static_cast(get_vlen() * 2lu - xmm_len)); + jl(l_end, T_NEAR); + + auto v_src_0 = getVmm(); + auto v_dst_0 = getVmm(); + auto v_dst_1 = getVmm(); + auto v_dst_2 = getVmm(); + auto& v_dst_3 = v_dst; + auto v_k_loop = getVmm(); + auto v_aux_0 = getVmm(); + + auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); + auto xmm_src_1 = getXmm(); + auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); + auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); + auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx()); + auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx()); + auto xmm_k_loop = Xbyak::Xmm(v_k_loop.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); + + RegistersPool::Reg r64_bulk_step; + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + r64_bulk_step = getReg64(); + mov(r64_bulk_step, ptr[r64_params + GET_OFF(threads_num)]); + sal(r64_bulk_step, static_cast(std::log2(get_vlen()))); // * vlen + } + + if (m_jcp.type == SINGLE_THREAD) { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_8_9_OFF]); + } else { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_16_17_OFF]); + } + + uni_vmovdqu64(v_dst_0, v_dst); + + if (!is_vpclmulqdq) { + vextracti64x2(xmm_dst_1, v_dst_0, 0x1); + vextracti64x2(xmm_dst_2, v_dst_0, 0x2); + vextracti64x2(xmm_dst_3, v_dst_0, 0x3); + } + + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + add(r64_src_ptr, r64_bulk_step); + prefetcht2(ptr[r64_src_ptr + 16384]); + } else { + add(r64_src_ptr, static_cast(get_vlen() - xmm_len)); + prefetcht2(ptr[r64_src_ptr + 4096]); + } + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + sub(r64_work_amount, static_cast(get_vlen() * 2lu - xmm_len)); + + L(l_fold_loop); + { + uni_vmovdqu64(v_src_0, ptr[r64_src_ptr]); + vpshufb(v_src_0, v_src_0, v_shuf_mask); + + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + add(r64_src_ptr, r64_bulk_step); + prefetcht2(ptr[r64_src_ptr + 16384]); + } else { + add(r64_src_ptr, static_cast(get_vlen())); + prefetcht2(ptr[r64_src_ptr + 4096]); + } + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + if (is_vpclmulqdq) { + vpclmulqdq(v_aux_0, v_dst_0, v_k_loop, 0b00000000); + vpclmulqdq(v_dst_0, v_dst_0, v_k_loop, 0b00010001); + uni_vpxorq(v_aux_0, v_aux_0, v_src_0); + uni_vpxorq(v_dst_0, v_dst_0, v_aux_0); + } else { + // 0 + vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); + uni_vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); + + // 1 + vextracti64x2(xmm_src_1, v_src_0, 0x1); + vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); + + // 2 + vextracti64x2(xmm_src_1, v_src_0, 0x2); + vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_2, xmm_dst_2, xmm_aux_0); + + // 3 + vextracti64x2(xmm_src_1, v_src_0, 0x3); + vpclmulqdq(xmm_aux_0, xmm_dst_3, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_3, xmm_dst_3, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + } + + sub(r64_work_amount, static_cast(get_vlen())); + jge(l_fold_loop, T_NEAR); + } + add(r64_work_amount, static_cast(get_vlen())); + + if (m_jcp.type == SINGLE_THREAD) { + if (is_vpclmulqdq) { + vextracti64x2(xmm_dst_1, v_dst_0, 0x1); + vextracti64x2(xmm_dst_2, v_dst_0, 0x2); + vextracti64x2(xmm_dst_3, v_dst_0, 0x3); + } + + vpclmulqdq(xmm_aux_0, xmm_dst_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0); + + vpclmulqdq(xmm_aux_0, xmm_dst_1, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000); + vpclmulqdq(xmm_dst_1, xmm_dst_1, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_1); + + vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2); + } else { + if (is_vpclmulqdq) { + uni_vmovdqu64(ptr[r64_dst_ptr], v_dst_0); + } else { + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 0lu], xmm_dst_0); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 1lu], xmm_dst_1); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 2lu], xmm_dst_2); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 3lu], xmm_dst_3); + } + } + + L(l_end); +} + +template +void ComputeHash::bulk_fold(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FIRST_THREAD && m_jcp.type != N_THREAD) { + return; + } + Xbyak::Label l_fold_loop, l_end; + cmp(r64_work_amount, static_cast(get_vlen() * 2lu - xmm_len)); + jl(l_end, T_NEAR); + + auto v_src_0 = getVmm(); + auto v_dst_0 = getVmm(); + auto& v_dst_1 = v_dst; + auto v_aux_0 = getVmm(); + auto v_k_loop = getVmm(); + + auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); + auto xmm_src_1 = getXmm(); + auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); + auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); + auto xmm_k_loop = Xbyak::Xmm(v_k_loop.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); + + RegistersPool::Reg r64_bulk_step; + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + r64_bulk_step = getReg64(); + mov(r64_bulk_step, ptr[r64_params + GET_OFF(threads_num)]); + sal(r64_bulk_step, static_cast(std::log2(get_vlen()))); // * vlen + } + + if (m_jcp.type == SINGLE_THREAD) { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_4_5_OFF]); + } else { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_8_9_OFF]); + } + + uni_vmovdqu64(v_dst_0, v_dst); + + if (!is_vpclmulqdq) { + vextracti128(xmm_dst_1, v_dst_0, 0x1); + } + + if (m_jcp.type == SINGLE_THREAD) { + add(r64_src_ptr, static_cast(get_vlen() - xmm_len)); + } else { + add(r64_src_ptr, r64_bulk_step); + } + prefetcht2(ptr[r64_src_ptr + 4096]); + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + sub(r64_work_amount, static_cast(get_vlen() * 2lu - xmm_len)); + + L(l_fold_loop); + { + uni_vmovdqu64(v_src_0, ptr[r64_src_ptr]); + vpshufb(v_src_0, v_src_0, v_shuf_mask); + + if (m_jcp.type == SINGLE_THREAD) { + add(r64_src_ptr, static_cast(get_vlen())); + } else { + add(r64_src_ptr, r64_bulk_step); + } + prefetcht2(ptr[r64_src_ptr + 4096]); + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + if (is_vpclmulqdq) { + vpclmulqdq(v_aux_0, v_dst_0, v_k_loop, 0b00000000); + vpclmulqdq(v_dst_0, v_dst_0, v_k_loop, 0b00010001); + uni_vpxorq(v_aux_0, v_aux_0, v_src_0); + uni_vpxorq(v_dst_0, v_dst_0, v_aux_0); + } else { + // 0 + vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); + uni_vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); + // 1 + vextracti128(xmm_src_1, v_src_0, 0x1); + vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); + } + + sub(r64_work_amount, static_cast(get_vlen())); + jge(l_fold_loop, T_NEAR); + } + add(r64_work_amount, static_cast(get_vlen())); + + if (m_jcp.type == SINGLE_THREAD) { + if (is_vpclmulqdq) { + vextracti128(xmm_dst_1, v_dst_0, 0x1); + } + vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_dst_0); + } else { + if (is_vpclmulqdq) { + uni_vmovdqu64(ptr[r64_dst_ptr], v_dst_0); + } else { + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 0lu], xmm_dst_0); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 1lu], xmm_dst_1); + } + } + + L(l_end); +} + +template <> +void ComputeHash::join(const Vmm& v_dst) { + if (m_jcp.type != FINAL_FOLD) { + return; + } + + mov(r64_aux, ptr[r64_params + GET_OFF(intermediate_ptr)]); + prefetcht0(ptr[r64_aux + 1024]); + + auto xmm_src_0 = getXmm(); + auto xmm_src_last = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_aux_0 = getXmm(); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + + uni_vmovdqu64(xmm_src_last, ptr[r64_aux + xmm_len * 7]); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_14_15_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_14_15_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_12_13_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_12_13_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 2lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_10_11_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_10_11_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 3lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_8_9_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_8_9_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 4lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 5lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 6lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); +} + +template +void ComputeHash::join(const Vmm& v_dst) { + if (m_jcp.type != FINAL_FOLD) { + return; + } + + mov(r64_aux, ptr[r64_params + GET_OFF(intermediate_ptr)]); + prefetcht0(ptr[r64_aux + 1024]); + + auto xmm_src_0 = getXmm(); + auto xmm_src_last = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_aux_0 = getXmm(); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + + uni_vmovdqu64(xmm_src_last, ptr[r64_aux + xmm_len * 3]); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 0lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 1lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 2lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); +} + +template +void ComputeHash::fold_to_128(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FINAL_FOLD) { + return; + } + Xbyak::Label l_fold_loop, l_end; + cmp(r64_work_amount, xmm_len); + jl(l_end, T_NEAR); + + auto xmm_src = getXmm(); + auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); + auto xmm_aux = getXmm(); + + L(l_fold_loop); + { + uni_vmovdqu64(xmm_src, ptr[r64_src_ptr]); + vpshufb(xmm_src, xmm_src, xmm_shuf_mask); + + vpclmulqdq(xmm_aux, xmm_dst, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst, xmm_dst, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + uni_vpxorq(xmm_dst, xmm_dst, xmm_src); + + add(r64_src_ptr, xmm_len); + sub(r64_work_amount, xmm_len); + cmp(r64_work_amount, xmm_len); + jge(l_fold_loop, T_NEAR); + } + + L(l_end); +} + +template +void ComputeHash::fold_to_64(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FINAL_FOLD) { + return; + } + Xbyak::Label l_fold_to_64; + cmp(r64_work_amount, 0); + jle(l_fold_to_64, T_NEAR); + + auto xmm_src = getXmm(); + auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); + auto xmm_aux = getXmm(); + auto xmm_aux_1 = getXmm(); + auto xmm_aux_2 = getXmm(); + + partial_load(xmm_src, ptr[r64_src_ptr], r64_work_amount); + vpshufb(xmm_src, xmm_src, xmm_shuf_mask); + + vpclmulqdq(xmm_aux, xmm_dst, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst, xmm_dst, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_aux, xmm_aux, xmm_src); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + + L(l_fold_to_64); + + mov(r64_aux, K_2); + vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x0); + vpclmulqdq(xmm_aux, xmm_dst, xmm_aux, 0b00000001); + vpslldq(xmm_dst, xmm_dst, 0x8); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + + mov(r64_aux, P_1); + vpinsrq(xmm_aux_2, xmm_aux_2, r64_aux, 0x0); + vpclmulqdq(xmm_aux, xmm_dst, xmm_aux_2, 0b00000001); + mov(r64_aux, 0x0); + vpinsrq(xmm_aux_1, xmm_dst, r64_aux, 0x0); + uni_vpxorq(xmm_aux, xmm_aux, xmm_aux_1); + vpinsrq(xmm_aux_1, xmm_aux, r64_aux, 0x0); + + mov(r64_aux, P_2); + vpinsrq(xmm_aux_2, xmm_aux_2, r64_aux, 0x1); + vpclmulqdq(xmm_aux, xmm_aux, xmm_aux_2, 0b00010001); + uni_vpxorq(xmm_aux, xmm_aux, xmm_aux_1); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + + vpextrq(ptr[r64_dst_ptr], xmm_dst, 0x0); +} + +} // namespace jit +#endif // OV_CORE_USE_XBYAK_JIT + +size_t compute_hash(const void* src, size_t size) { +#ifdef OV_CORE_USE_XBYAK_JIT + if (Generator::mayiuse(avx2)) { + uint64_t result = 0lu; + + // Parallel section + constexpr uint64_t min_wa_per_thread = 131072lu; // 2^17 + const uint64_t size_u64 = static_cast(size); + if (size_u64 >= min_wa_per_thread * 2lu) { + static auto first_thr_kernel = Generator::mayiuse(avx512_core) + ? jit::ComputeHash::create({jit::FIRST_THREAD}) + : jit::ComputeHash::create({jit::FIRST_THREAD}); + static auto n_thr_kernel = Generator::mayiuse(avx512_core) + ? jit::ComputeHash::create({jit::N_THREAD}) + : jit::ComputeHash::create({jit::N_THREAD}); + static auto final_fold_kernel = Generator::mayiuse(avx512_core) + ? jit::ComputeHash::create({jit::FINAL_FOLD}) + : jit::ComputeHash::create({jit::FINAL_FOLD}); + + static const uint64_t max_thr_num = 2lu; + uint64_t thr_num = std::min(size_u64 / min_wa_per_thread, max_thr_num); + const uint64_t el_per_thread = + first_thr_kernel->get_vlen() * ((size_u64 / thr_num) / first_thr_kernel->get_vlen()); + std::vector intermediate(thr_num * first_thr_kernel->get_vlen()); + + parallel_nt_static(static_cast(thr_num), [&](const int ithr, const int nthr) { + uint64_t start = el_per_thread * ithr; + if (start >= size_u64) { + return; + } + uint64_t work_amount = (el_per_thread + start > size_u64) ? size_u64 - start : el_per_thread; + + jit::ComputeHashCallArgs args; + + args.src_ptr = reinterpret_cast(src) + first_thr_kernel->get_vlen() * ithr; + args.dst_ptr = &(intermediate[first_thr_kernel->get_vlen() * ithr]); + args.k_ptr = jit::K_PULL; + args.work_amount = work_amount; + args.size = size_u64; + args.threads_num = thr_num; + + if (ithr == 0) { + (*first_thr_kernel)(&args); + } else { + (*n_thr_kernel)(&args); + } + }); + + jit::ComputeHashCallArgs args; + args.work_amount = size_u64 - el_per_thread * thr_num; + args.src_ptr = reinterpret_cast(src) + size_u64 - args.work_amount; + args.dst_ptr = &result; + args.k_ptr = jit::K_PULL; + args.size = size_u64; + args.intermediate_ptr = intermediate.data(); + + (*final_fold_kernel)(&args); + } else { + static auto single_thr_kernel = Generator::mayiuse(avx512_core) + ? jit::ComputeHash::create({jit::SINGLE_THREAD}) + : jit::ComputeHash::create({jit::SINGLE_THREAD}); + + jit::ComputeHashCallArgs args; + args.src_ptr = src; + args.dst_ptr = &result; + args.k_ptr = jit::K_PULL; + args.work_amount = size_u64; + args.size = size_u64; + + (*single_thr_kernel)(&args); + } + + return result; + } + +#endif // OV_CORE_USE_XBYAK_JIT + + constexpr auto cel_size = sizeof(size_t); + size_t seed = size; + const auto data = static_cast(src); + const auto d_end = std::next(data, size / cel_size); + // The constant value used as a magic number has been + // traditionally used e.g. in boost library's hash_combine. + // It happens to be derived from the golden ratio. + for (auto d = data; d != d_end; ++d) { + seed ^= *d + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + size_t last_bytes{0}; + std::memcpy(&last_bytes, d_end, size % cel_size); + seed ^= last_bytes + 0x9e3779b9 + (seed << 6) + (seed >> 2); + + return seed; +} + +} // namespace runtime +} // namespace ov