From a29a70dc00e6b5479c79b7c6c28e8d396352cc68 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Sun, 14 May 2023 15:42:38 -0400 Subject: [PATCH] buffer: add SIMD Neon optimization for `byteLength` Co-authored-by: Keyhan Vakil Co-authored-by: Daniel Lemire --- node.gyp | 1 + src/node_buffer.cc | 11 +++------ src/node_simd.cc | 58 ++++++++++++++++++++++++++++++++++++++++++++++ src/node_simd.h | 22 ++++++++++++++++++ 4 files changed, 84 insertions(+), 8 deletions(-) create mode 100644 src/node_simd.cc create mode 100644 src/node_simd.h diff --git a/node.gyp b/node.gyp index f9621fc1e15470..9d4ad4a496c494 100644 --- a/node.gyp +++ b/node.gyp @@ -121,6 +121,7 @@ 'src/node_report_utils.cc', 'src/node_sea.cc', 'src/node_serdes.cc', + 'src/node_simd.cc', 'src/node_shadow_realm.cc', 'src/node_snapshotable.cc', 'src/node_sockaddr.cc', diff --git a/src/node_buffer.cc b/src/node_buffer.cc index ff041274f90d24..a2692479ad1416 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -26,6 +26,7 @@ #include "node_external_reference.h" #include "node_i18n.h" #include "node_internals.h" +#include "node_simd.h" #include "env-inl.h" #include "simdutf.h" @@ -743,14 +744,8 @@ void SlowByteLengthUtf8(const FunctionCallbackInfo& args) { uint32_t FastByteLengthUtf8(Local receiver, const v8::FastOneByteString& source) { - uint32_t result = 0; - uint32_t length = source.length; - const uint8_t* data = reinterpret_cast(source.data); - for (uint32_t i = 0; i < length; ++i) { - result += (data[i] >> 7); - } - result += length; - return result; + return node::simd::utf8_byte_length( + reinterpret_cast(source.data), source.length); } static v8::CFunction fast_byte_length_utf8( diff --git a/src/node_simd.cc b/src/node_simd.cc new file mode 100644 index 00000000000000..e8f7acc4644db1 --- /dev/null +++ b/src/node_simd.cc @@ -0,0 +1,58 @@ +#include "node_simd.h" + +#if NODE_HAS_SIMD_NEON +#include +#endif + +namespace node { +namespace simd { + +#if NODE_HAS_SIMD_NEON +uint32_t utf8_byte_length(const uint8_t* data, size_t length) { + uint64_t result{0}; + + const int lanes = sizeof(uint8x16_t); + const int max_sra_count = 256 / lanes; // Avoid overflowing vaddvq_u8. + const int unrolls = max_sra_count; + const int unrolled_lanes = lanes * unrolls; + + const uint8_t *unroll_end = data + (length / unrolled_lanes) * unrolled_lanes; + uint32_t length_after_unroll = length % unrolled_lanes; + for (; data < unroll_end;) { + uint8x16_t acc = {}; + for (int i = 0; i < unrolls; ++i, data += lanes) { + uint8x16_t chunk = vld1q_u8(data); + acc = vsraq_n_u8(acc, chunk, 7); + } + result += vaddvq_u8(acc); + } + + const uint8_t *simd_end = data + (length_after_unroll / lanes) * lanes; + uint32_t length_after_simd = length % lanes; + uint8x16_t acc = {}; + for (; data < simd_end; data += lanes) { + uint8x16_t chunk = vld1q_u8(data); + acc = vsraq_n_u8(acc, chunk, 7); + } + result += vaddvq_u8(acc); + + const uint8_t *scalar_end = data + length_after_simd; + for (; data < scalar_end; data += 1) { + result += *data >> 7; + } + + return result + length; +} +#else +uint32_t utf8_byte_length(const uint8_t* data, size_t length) { + uint32_t result = 0; + for (uint32_t i = 0; i < length; ++i) { + result += (data[i] >> 7); + } + result += length; + return result; +} +#endif + +} // namespace simd +} // namespace node diff --git a/src/node_simd.h b/src/node_simd.h new file mode 100644 index 00000000000000..24398683c69126 --- /dev/null +++ b/src/node_simd.h @@ -0,0 +1,22 @@ +#ifndef SRC_NODE_SIMD_H_ +#define SRC_NODE_SIMD_H_ + +#if defined(__aarch64__) || defined(_M_ARM64) +#define NODE_HAS_SIMD_NEON 1 +#endif + +#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#include + +namespace node { +namespace simd { + +uint32_t utf8_byte_length(const uint8_t* input, size_t length); + +} // namespace simd +} // namespace node + +#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#endif // SRC_NODE_SIMD_H_