From 5566c99801a903ef924a9c8ce735a4c47db738cf Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Wed, 30 Nov 2022 11:05:17 -0500 Subject: [PATCH] deps: add simdutf dependency --- Makefile | 1 + deps/simdutf/README.md | 14 + deps/simdutf/simdutf.cpp | 27967 +++++++++++++++++++++++++ deps/simdutf/simdutf.gyp | 123 + deps/simdutf/simdutf.h | 2435 +++ node.gyp | 5 +- tools/dep_updaters/README.md | 25 + tools/dep_updaters/update-simdutf.sh | 49 + tools/license-builder.sh | 2 + 9 files changed, 30620 insertions(+), 1 deletion(-) create mode 100644 deps/simdutf/README.md create mode 100644 deps/simdutf/simdutf.cpp create mode 100644 deps/simdutf/simdutf.gyp create mode 100644 deps/simdutf/simdutf.h create mode 100755 tools/dep_updaters/update-simdutf.sh diff --git a/Makefile b/Makefile index 05fdc1509844ad6..ab696b45b645ec2 100644 --- a/Makefile +++ b/Makefile @@ -170,6 +170,7 @@ with-code-cache test-code-cache: out/Makefile: config.gypi common.gypi node.gyp \ deps/uv/uv.gyp deps/llhttp/llhttp.gyp deps/zlib/zlib.gyp \ + deps/simdutf/simdutf.gyp \ tools/v8_gypfiles/toolchain.gypi tools/v8_gypfiles/features.gypi \ tools/v8_gypfiles/inspector.gypi tools/v8_gypfiles/v8.gyp $(PYTHON) tools/gyp_node.py -f make diff --git a/deps/simdutf/README.md b/deps/simdutf/README.md new file mode 100644 index 000000000000000..1852968642c8747 --- /dev/null +++ b/deps/simdutf/README.md @@ -0,0 +1,14 @@ +# simdutf + +This project boosts unicode validation and transcoding performance by +utilizing SIMD operations where possible. + +The source is pulled from: https://github.com/simdutf/simdutf + +Active development occurs in the default branch (currently named `master`). + +## Updating + +```sh +$ git clone https://github.com/simdutf/simdutf +``` diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp new file mode 100644 index 000000000000000..cce3e86028a23e9 --- /dev/null +++ b/deps/simdutf/simdutf.cpp @@ -0,0 +1,27967 @@ +/* auto-generated on 2022-12-08 17:36:59 +0000. Do not edit! */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf.cpp +/* begin file src/simdutf.cpp */ +#include "simdutf.h" +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=implementation.cpp +/* begin file src/implementation.cpp */ +#include +#include + +// Useful for debugging purposes +namespace simdutf { +namespace { + +template +std::string toBinaryString(T b) { + std::string binary = ""; + T mask = T(1) << (sizeof(T) * CHAR_BIT - 1); + while (mask > 0) { + binary += ((b & mask) == 0) ? '0' : '1'; + mask >>= 1; + } + return binary; +} +} +} + +// Implementations +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64.h +/* begin file src/simdutf/arm64.h */ +#ifndef SIMDUTF_ARM64_H +#define SIMDUTF_ARM64_H + +#ifdef SIMDUTF_FALLBACK_H +#error "arm64.h must be included before fallback.h" +#endif + + +#ifndef SIMDUTF_IMPLEMENTATION_ARM64 +#define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64) +#endif +#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64 + + + +#if SIMDUTF_IMPLEMENTATION_ARM64 + +namespace simdutf { +/** + * Implementation for NEON (ARMv8). + */ +namespace arm64 { +} // namespace arm64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/implementation.h +/* begin file src/simdutf/arm64/implementation.h */ +#ifndef SIMDUTF_ARM64_IMPLEMENTATION_H +#define SIMDUTF_ARM64_IMPLEMENTATION_H + + +namespace simdutf { +namespace arm64 { + +namespace { +using namespace simdutf; +} + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {} + simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace arm64 +} // namespace simdutf + +#endif // SIMDUTF_ARM64_IMPLEMENTATION_H +/* end file src/simdutf/arm64/implementation.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h +/* begin file src/simdutf/arm64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "arm64" +// #define SIMDUTF_IMPLEMENTATION arm64 +/* end file src/simdutf/arm64/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/intrinsics.h +/* begin file src/simdutf/arm64/intrinsics.h */ +#ifndef SIMDUTF_ARM64_INTRINSICS_H +#define SIMDUTF_ARM64_INTRINSICS_H + + +// This should be the correct header whether +// you use visual studio or other compilers. +#include + +#endif // SIMDUTF_ARM64_INTRINSICS_H +/* end file src/simdutf/arm64/intrinsics.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmanipulation.h +/* begin file src/simdutf/arm64/bitmanipulation.h */ +#ifndef SIMDUTF_ARM64_BITMANIPULATION_H +#define SIMDUTF_ARM64_BITMANIPULATION_H + +namespace simdutf { +namespace arm64 { +namespace { + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int count_ones(uint64_t input_num) { + return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf + +#endif // SIMDUTF_ARM64_BITMANIPULATION_H +/* end file src/simdutf/arm64/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd.h +/* begin file src/simdutf/arm64/simd.h */ +#ifndef SIMDUTF_ARM64_SIMD_H +#define SIMDUTF_ARM64_SIMD_H + +#include + + +namespace simdutf { +namespace arm64 { +namespace { +namespace simd { + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +namespace { +// Start of private section with Visual Studio workaround + + +/** + * make_uint8x16_t initializes a SIMD register (uint8x16_t). + * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...} + * is not recognized under Visual Studio! This is a workaround. + * Using a std::initializer_list as a parameter resulted in + * inefficient code. With the current approach, if the parameters are + * compile-time constants, + * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}. + * You should not use this function except for compile-time constants: + * it is not efficient. + */ +simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, + uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8, + uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12, + uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) { + // Doing a load like so end ups generating worse code. + // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_u8(array); + uint8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_u8(x1, x, 0); + x = vsetq_lane_u8(x2, x, 1); + x = vsetq_lane_u8(x3, x, 2); + x = vsetq_lane_u8(x4, x, 3); + x = vsetq_lane_u8(x5, x, 4); + x = vsetq_lane_u8(x6, x, 5); + x = vsetq_lane_u8(x7, x, 6); + x = vsetq_lane_u8(x8, x, 7); + x = vsetq_lane_u8(x9, x, 8); + x = vsetq_lane_u8(x10, x, 9); + x = vsetq_lane_u8(x11, x, 10); + x = vsetq_lane_u8(x12, x, 11); + x = vsetq_lane_u8(x13, x, 12); + x = vsetq_lane_u8(x14, x, 13); + x = vsetq_lane_u8(x15, x, 14); + x = vsetq_lane_u8(x16, x, 15); + return x; +} + +// We have to do the same work for make_int8x16_t +simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4, + int8_t x5, int8_t x6, int8_t x7, int8_t x8, + int8_t x9, int8_t x10, int8_t x11, int8_t x12, + int8_t x13, int8_t x14, int8_t x15, int8_t x16) { + // Doing a load like so end ups generating worse code. + // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_s8(array); + int8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_s8(x1, x, 0); + x = vsetq_lane_s8(x2, x, 1); + x = vsetq_lane_s8(x3, x, 2); + x = vsetq_lane_s8(x4, x, 3); + x = vsetq_lane_s8(x5, x, 4); + x = vsetq_lane_s8(x6, x, 5); + x = vsetq_lane_s8(x7, x, 6); + x = vsetq_lane_s8(x8, x, 7); + x = vsetq_lane_s8(x9, x, 8); + x = vsetq_lane_s8(x10, x, 9); + x = vsetq_lane_s8(x11, x, 10); + x = vsetq_lane_s8(x12, x, 11); + x = vsetq_lane_s8(x13, x, 12); + x = vsetq_lane_s8(x14, x, 13); + x = vsetq_lane_s8(x15, x, 14); + x = vsetq_lane_s8(x16, x, 15); + return x; +} + +simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, + uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8) { + uint8x8_t x{}; + x = vset_lane_u8(x1, x, 0); + x = vset_lane_u8(x2, x, 1); + x = vset_lane_u8(x3, x, 2); + x = vset_lane_u8(x4, x, 3); + x = vset_lane_u8(x5, x, 4); + x = vset_lane_u8(x6, x, 5); + x = vset_lane_u8(x7, x, 6); + x = vset_lane_u8(x8, x, 7); + return x; +} + +simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1, uint16_t x2, uint16_t x3, uint16_t x4, + uint16_t x5, uint16_t x6, uint16_t x7, uint16_t x8) { + uint16x8_t x{}; + x = vsetq_lane_u16(x1, x, 0); + x = vsetq_lane_u16(x2, x, 1); + x = vsetq_lane_u16(x3, x, 2); + x = vsetq_lane_u16(x4, x, 3); + x = vsetq_lane_u16(x5, x, 4); + x = vsetq_lane_u16(x6, x, 5); + x = vsetq_lane_u16(x7, x, 6); + x = vsetq_lane_u16(x8, x, 7);; + return x; +} + +simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t x3, int16_t x4, + int16_t x5, int16_t x6, int16_t x7, int16_t x8) { + uint16x8_t x{}; + x = vsetq_lane_s16(x1, x, 0); + x = vsetq_lane_s16(x2, x, 1); + x = vsetq_lane_s16(x3, x, 2); + x = vsetq_lane_s16(x4, x, 3); + x = vsetq_lane_s16(x5, x, 4); + x = vsetq_lane_s16(x6, x, 5); + x = vsetq_lane_s16(x7, x, 6); + x = vsetq_lane_s16(x8, x, 7);; + return x; +} + + +// End of private section with Visual Studio workaround +} // namespace +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO + + + template + struct simd8; + + // + // Base class of simd8 and simd8, both of which use uint8x16_t internally. + // + template> + struct base_u8 { + uint8x16_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {} + simdutf_really_inline operator const uint8x16_t&() const { return this->value; } + simdutf_really_inline operator uint8x16_t&() { return this->value; } + simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); } + simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); } + + // Bit operations + simdutf_really_inline simd8 operator|(const simd8 other) const { return vorrq_u8(*this, other); } + simdutf_really_inline simd8 operator&(const simd8 other) const { return vandq_u8(*this, other); } + simdutf_really_inline simd8 operator^(const simd8 other) const { return veorq_u8(*this, other); } + simdutf_really_inline simd8 bit_andnot(const simd8 other) const { return vbicq_u8(*this, other); } + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd8& operator|=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline simd8& operator&=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline simd8& operator^=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } + + simdutf_really_inline Mask operator==(const simd8 other) const { return vceqq_u8(*this, other); } + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_u8(prev_chunk, *this, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base_u8 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + static simdutf_really_inline simd8 splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); } + + simdutf_really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // False constructor + simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {} + simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + + // We return uint32_t instead of uint16_t because that seems to be more efficient for most + // purposes (cutting it down to uint16_t costs performance in some compilers). + simdutf_really_inline uint32_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + auto minput = *this & bit_mask; + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); + } + + // Returns 4-bit out of each byte, alternating between the high 4 bits and low bits + // result it is 64 bit. + // This method is expected to be faster than none() and is equivalent + // when the vector register is the result of a comparison, with byte + // values 0xff and 0x00. + simdutf_really_inline uint64_t to_bitmask64() const { + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0); + } + + simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; } + simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; } + simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; } + + + }; + + // Unsigned bytes + template<> + struct simd8: base_u8 { + static simdutf_really_inline simd8 splat(uint8_t _value) { return vmovq_n_u8(_value); } + static simdutf_really_inline simd8 zero() { return vdupq_n_u8(0); } + static simdutf_really_inline simd8 load(const uint8_t* values) { return vld1q_u8(values); } + simdutf_really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(make_uint8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(uint8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return vqaddq_u8(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return vqsubq_u8(*this, other); } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { return vaddq_u8(*this, other); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return vsubq_u8(*this, other); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // Order-specific operations + simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); } + simdutf_really_inline uint8_t min_val() const { return vminvq_u8(*this); } + simdutf_really_inline simd8 max_val(const simd8 other) const { return vmaxq_u8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return vminq_u8(*this, other); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return vcleq_u8(*this, other); } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return vcgeq_u8(*this, other); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return vcltq_u8(*this, other); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return vcgtq_u8(*this, other); } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return simd8(*this > other); } + // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return simd8(*this < other); } + + // Bit-specific operations + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return vtstq_u8(*this, bits); } + simdutf_really_inline bool is_ascii() const { return this->max_val() < 0b10000000u; } + + simdutf_really_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return (*this & bits).any_bits_set_anywhere(); } + template + simdutf_really_inline simd8 shr() const { return vshrq_n_u8(*this, N); } + template + simdutf_really_inline simd8 shl() const { return vshlq_n_u8(*this, N); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + + + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) const { + return vqtbl1q_u8(*this, simd8(original)); + } + }; + + // Signed bytes + template<> + struct simd8 { + int8x16_t value; + + static simdutf_really_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); } + static simdutf_really_inline simd8 zero() { return vdupq_n_s8(0); } + static simdutf_really_inline simd8 load(const int8_t values[16]) { return vld1q_s8(values); } + template + simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const { + uint16x8_t first = vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))); + uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value)); + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap)); + second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap)); + } + vst1q_u16(reinterpret_cast(p), first); + vst1q_u16(reinterpret_cast(p + 8), second); + } + simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const { + vst1q_u32(reinterpret_cast(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)))))); + vst1q_u32(reinterpret_cast(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))))); + vst1q_u32(reinterpret_cast(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))))); + vst1q_u32(reinterpret_cast(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))); + } + // Conversion from/to SIMD register + simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {} + simdutf_really_inline operator const int8x16_t&() const { return this->value; } + simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); } + simdutf_really_inline operator int8x16_t&() { return this->value; } + + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(make_int8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(int8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdutf_really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, value); } + // Explicit conversion to/from unsigned + // + // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type. + // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14 + // and relatively ugly and hard to read. +#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} +#endif + simdutf_really_inline operator simd8() const { return vreinterpretq_u8_s8(this->value); } + + simdutf_really_inline simd8 operator|(const simd8 other) const { return vorrq_s8(value, other.value); } + simdutf_really_inline simd8 operator&(const simd8 other) const { return vandq_s8(value, other.value); } + simdutf_really_inline simd8 operator^(const simd8 other) const { return veorq_s8(value, other.value); } + simdutf_really_inline simd8 bit_andnot(const simd8 other) const { return vbicq_s8(value, other.value); } + + // Math + simdutf_really_inline simd8 operator+(const simd8 other) const { return vaddq_s8(value, other.value); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return vsubq_s8(value, other.value); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); } + simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); } + simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; } + + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { return vmaxq_s8(value, other.value); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return vminq_s8(value, other.value); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return vcgtq_s8(value, other.value); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return vcltq_s8(value, other.value); } + simdutf_really_inline simd8 operator==(const simd8 other) const { return vceqq_s8(value, other.value); } + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_s8(prev_chunk, *this, 16 - N); + } + + // Perform a lookup assuming no value is larger than 16 + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) { + return vqtbl1q_s8(*this, simd8(original)); + } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd8)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd8)*3/sizeof(T)); + } + + + simdutf_really_inline simd8x64& operator |=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + this->chunks[2] |= other.chunks[2]; + this->chunks[3] |= other.chunks[3]; + return *this; + } + + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return reduce_or().is_ascii(); + } + + template + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr+sizeof(simd8)*0); + this->chunks[1].template store_ascii_as_utf16(ptr+sizeof(simd8)*1); + this->chunks[2].template store_ascii_as_utf16(ptr+sizeof(simd8)*2); + this->chunks[3].template store_ascii_as_utf16(ptr+sizeof(simd8)*3); + } + + simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const { + this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8)*0); + this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8)*1); + this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8)*2); + this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ); +#else + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; +#endif + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask)); + uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask)); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + return simd8x64( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] > mask, + this->chunks[1] > mask, + this->chunks[2] > mask, + this->chunks[3] > mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] >= mask, + this->chunks[1] >= mask, + this->chunks[2] >= mask, + this->chunks[3] >= mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + simd8(uint8x16_t(this->chunks[0])) >= mask, + simd8(uint8x16_t(this->chunks[1])) >= mask, + simd8(uint8x16_t(this->chunks[2])) >= mask, + simd8(uint8x16_t(this->chunks[3])) >= mask + ).to_bitmask(); + } + }; // struct simd8x64 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd16-inl.h +/* begin file src/simdutf/arm64/simd16-inl.h */ +template +struct simd16; + + template> + struct base_u16 { + uint16x8_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + simdutf_really_inline base_u16() = default; + simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {} + simdutf_really_inline operator const uint16x8_t&() const { return this->value; } + simdutf_really_inline operator uint16x8_t&() { return this->value; } + // Bit operations + simdutf_really_inline simd16 operator|(const simd16 other) const { return vorrq_u16(*this, other); } + simdutf_really_inline simd16 operator&(const simd16 other) const { return vandq_u16(*this, other); } + simdutf_really_inline simd16 operator^(const simd16 other) const { return veorq_u16(*this, other); } + simdutf_really_inline simd16 bit_andnot(const simd16 other) const { return vbicq_u16(*this, other); } + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd16& operator|=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline simd16& operator&=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline simd16& operator^=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } + + simdutf_really_inline Mask operator==(const simd16 other) const { return vceqq_u16(*this, other); } + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return vextq_u18(prev_chunk, *this, 8 - N); + } + }; + +template> +struct base16: base_u16 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline base16() : base_u16() {} + simdutf_really_inline base16(const uint16x8_t _value) : base_u16(_value) {} + template + simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {} + + simdutf_really_inline Mask operator==(const simd16 other) const { return vceqq_u16(*this, other); } + + static const int SIZE = sizeof(base_u16::value); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return vextq_u18(prev_chunk, *this, 8 - N); + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template<> +struct simd16: base16 { + static simdutf_really_inline simd16 splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); } + + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const uint16x8_t _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + +}; + +template +struct base16_numeric: base16 { + static simdutf_really_inline simd16 splat(T _value) { return vmovq_n_u16(_value); } + static simdutf_really_inline simd16 zero() { return vdupq_n_u16(0); } + static simdutf_really_inline simd16 load(const T values[8]) { + return vld1q_u16(reinterpret_cast(values)); + } + + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { return vaddq_u8(*this, other); } + simdutf_really_inline simd16 operator-(const simd16 other) const { return vsubq_u8(*this, other); } + simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } +}; + +// Signed words +template<> +struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} +#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric(_value) {} +#endif + simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric(vreinterpretq_u16_s16(_value)) {} + + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + simdutf_really_inline operator simd16() const; + simdutf_really_inline operator const uint16x8_t&() const { return this->value; } + simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); } + + simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); } + simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); } + // Order-sensitive comparisons + simdutf_really_inline simd16 max_val(const simd16 other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } +}; + + + + +// Unsigned words +template<> +struct simd16: base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + + + simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); } + simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); } + // Saturated math + simdutf_really_inline simd16 saturating_add(const simd16 other) const { return vqaddq_u16(*this, other); } + simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return vqsubq_u16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd16 max_val(const simd16 other) const { return vmaxq_u16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return vminq_u16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd16 operator<=(const simd16 other) const { return vcleq_u16(*this, other); } + simdutf_really_inline simd16 operator>=(const simd16 other) const { return vcgeq_u16(*this, other); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return vcgtq_u16(*this, other); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return vcltq_u16(*this, other); } + + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { return *this == uint16_t(0); } + template + simdutf_really_inline simd16 shr() const { return simd16(vshrq_n_u16(*this, N)); } + template + simdutf_really_inline simd16 shl() const { return simd16(vshlq_n_u16(*this, N)); } + + // logical operations + simdutf_really_inline simd16 operator|(const simd16 other) const { return vorrq_u16(*this, other); } + simdutf_really_inline simd16 operator&(const simd16 other) const { return vandq_u16(*this, other); } + simdutf_really_inline simd16 operator^(const simd16 other) const { return veorq_u16(*this, other); } + + // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + return vqmovn_high_u16(vqmovn_u16(v0), v1); + } + + // Change the endianness + simdutf_really_inline simd16 swap_bytes() const { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap)); + } +}; +simdutf_really_inline simd16::operator simd16() const { return this->value; } + + + template + struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32& o) = delete; // no copy allowed + simd16x32& operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1, const simd16 chunk2, const simd16 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T)), simd16::load(ptr+2*sizeof(simd16)/sizeof(T)), simd16::load(ptr+3*sizeof(simd16)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd16)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd16)*3/sizeof(T)); + } + + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ); +#else + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; +#endif + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask))); + uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask))); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); + this->chunks[2] = this->chunks[2].swap_bytes(); + this->chunks[3] = this->chunks[3].swap_bytes(); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + return simd16x32( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + + }; // struct simd16x32 + template<> + simdutf_really_inline uint64_t simd16x32::not_in_range(const uint16_t low, const uint16_t high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + simd16x32 x( + simd16((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)), + simd16((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)), + simd16((this->chunks[2] > mask_high) | (this->chunks[2] < mask_low)), + simd16((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)) + ); + return x.to_bitmask(); + } +/* end file src/simdutf/arm64/simd16-inl.h */ +} // namespace simd +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf + +#endif // SIMDUTF_ARM64_SIMD_H +/* end file src/simdutf/arm64/simd.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h +/* begin file src/simdutf/arm64/end.h */ +/* end file src/simdutf/arm64/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_ARM64 + +#endif // SIMDUTF_ARM64_H +/* end file src/simdutf/arm64.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell.h +/* begin file src/simdutf/haswell.h */ +#ifndef SIMDUTF_HASWELL_H +#define SIMDUTF_HASWELL_H + +#ifdef SIMDUTF_WESTMERE_H +#error "haswell.h must be included before westmere.h" +#endif +#ifdef SIMDUTF_FALLBACK_H +#error "haswell.h must be included before fallback.h" +#endif + + +// Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected +// at runtime. +#ifndef SIMDUTF_IMPLEMENTATION_HASWELL +// +// You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__ +// because we want to rely on *runtime dispatch*. +// +#define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64) +#endif +// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see +// https://github.com/simdutf/simdutf/issues/1247 +#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__)) + +#if SIMDUTF_IMPLEMENTATION_HASWELL + +#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,pclmul,lzcnt") + +namespace simdutf { +/** + * Implementation for Haswell (Intel AVX2). + */ +namespace haswell { +} // namespace haswell +} // namespace simdutf + +// +// These two need to be included outside SIMDUTF_TARGET_REGION +// +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/implementation.h +/* begin file src/simdutf/haswell/implementation.h */ +#ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H +#define SIMDUTF_HASWELL_IMPLEMENTATION_H + + +// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION +namespace simdutf { +namespace haswell { + +using namespace simdutf; + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation( + "haswell", + "Intel/AMD AVX2", + internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 + ) {} + simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_IMPLEMENTATION_H +/* end file src/simdutf/haswell/implementation.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/intrinsics.h +/* begin file src/simdutf/haswell/intrinsics.h */ +#ifndef SIMDUTF_HASWELL_INTRINSICS_H +#define SIMDUTF_HASWELL_INTRINSICS_H + + +#ifdef SIMDUTF_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDUTF_VISUAL_STUDIO + +#ifdef SIMDUTF_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdutf, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. + */ +#include // for _blsr_u64 +#include // for __lzcnt64 +#include // for most things (AVX2, AVX512, _popcnt64) +#include +#include +#include +#include +#include // for _mm_clmulepi64_si128 +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. +#ifndef _blsr_u64 +// we roll our own +#define _blsr_u64(n) ((n - 1) & n) +#endif // _blsr_u64 +#endif // SIMDUTF_CLANG_VISUAL_STUDIO + +#endif // SIMDUTF_HASWELL_INTRINSICS_H +/* end file src/simdutf/haswell/intrinsics.h */ + +// +// The rest need to be inside the region +// +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h +/* begin file src/simdutf/haswell/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "haswell" +// #define SIMDUTF_IMPLEMENTATION haswell +SIMDUTF_TARGET_HASWELL +/* end file src/simdutf/haswell/begin.h */ +// Declarations +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmanipulation.h +/* begin file src/simdutf/haswell/bitmanipulation.h */ +#ifndef SIMDUTF_HASWELL_BITMANIPULATION_H +#define SIMDUTF_HASWELL_BITMANIPULATION_H + +namespace simdutf { +namespace haswell { +namespace { + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdutf_really_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_BITMANIPULATION_H +/* end file src/simdutf/haswell/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd.h +/* begin file src/simdutf/haswell/simd.h */ +#ifndef SIMDUTF_HASWELL_SIMD_H +#define SIMDUTF_HASWELL_SIMD_H + + +namespace simdutf { +namespace haswell { +namespace { +namespace simd { + + // Forward-declared so they can be used by splat and friends. + template + struct base { + __m256i value; + + // Zero constructor + simdutf_really_inline base() : value{__m256i()} {} + + // Conversion from SIMD register + simdutf_really_inline base(const __m256i _value) : value(_value) {} + // Conversion to SIMD register + simdutf_really_inline operator const __m256i&() const { return this->value; } + simdutf_really_inline operator __m256i&() { return this->value; } + template + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this)); + __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1)); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + first = _mm256_shuffle_epi8(first, swap); + second = _mm256_shuffle_epi8(second, swap); + } + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second); + } + simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8)))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8))); + } + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } + simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } + simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } + simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } + simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. + template + struct simd8; + + template> + struct base8: base> { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; + + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m256i _value) : base>(_value) {} + simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); } + simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); } + simdutf_really_inline Mask operator==(const simd8 other) const { return _mm256_cmpeq_epi8(*this, other); } + + static const int SIZE = sizeof(base::value); + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdutf_really_inline simd8 splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); } + + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m256i _value) : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} + + simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); } + simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); } + simdutf_really_inline bool all() const { return static_cast(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; } + simdutf_really_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdutf_really_inline simd8 splat(T _value) { return _mm256_set1_epi8(_value); } + static simdutf_really_inline simd8 zero() { return _mm256_setzero_si256(); } + static simdutf_really_inline simd8 load(const T values[32]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m256i _value) : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { return _mm256_add_epi8(*this, other); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return _mm256_sub_epi8(*this, other); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm256_shuffle_epi8(lookup_table, *this); + } + + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) : base8_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + simdutf_really_inline operator simd8() const; + // Member-by-member initialization + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm256_max_epi8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm256_min_epi8(*this, other); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return _mm256_cmpgt_epi8(*this, other); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return _mm256_cmpgt_epi8(other, *this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm256_adds_epu8(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm256_subs_epu8(*this, other); } + + // Order-specific operations + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm256_max_epu8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm256_min_epu8(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdutf_really_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm256_testz_si256(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd8 shr() const { return simd8(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdutf_really_inline simd8 shl() const { return simd8(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); } + }; + simdutf_really_inline simd8::operator simd8() const { return this->value; } + + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} + simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + + simdutf_really_inline simd8x64& operator|=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + return *this; + } + + simdutf_really_inline simd8 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + template + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr+sizeof(simd8)*0); + this->chunks[1].template store_ascii_as_utf16(ptr+sizeof(simd8)*1); + } + + simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const { + this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8)*0); + this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8)*1); + } + + simdutf_really_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] | mask, + this->chunks[1] | mask + ); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + return simd8x64( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] < mask, + this->chunks[1] < mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] > mask, + this->chunks[1] > mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] >= mask, + this->chunks[1] >= mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + (simd8(__m256i(this->chunks[0])) >= mask), + (simd8(__m256i(this->chunks[1])) >= mask) + ).to_bitmask(); + } + }; // struct simd8x64 + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd16-inl.h +/* begin file src/simdutf/haswell/simd16-inl.h */ +#ifdef __GNUC__ +#if __GNUC__ < 8 +#define _mm256_set_m128i(xmm1, xmm2) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2) +#define _mm256_setr_m128i(xmm2, xmm1) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2) +#endif +#endif + +template +struct simd16; + +template> +struct base16: base> { + using bitmask_type = uint32_t; + + simdutf_really_inline base16() : base>() {} + simdutf_really_inline base16(const __m256i _value) : base>(_value) {} + template + simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast(ptr))) {} + + simdutf_really_inline Mask operator==(const simd16 other) const { return _mm256_cmpeq_epi16(*this, other); } + + /// the size of vector in bytes + static const int SIZE = sizeof(base>::value); + + /// the number of elements of type T a vector can hold + static const int ELEMENTS = SIZE / sizeof(T); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return _mm256_alignr_epi8(*this, prev_chunk, 16 - N); + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template<> +struct simd16: base16 { + static simdutf_really_inline simd16 splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); } + + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const __m256i _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + + simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); } + simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + simdutf_really_inline simd16 operator~() const { return *this ^ true; } +}; + +template +struct base16_numeric: base16 { + static simdutf_really_inline simd16 splat(T _value) { return _mm256_set1_epi16(_value); } + static simdutf_really_inline simd16 zero() { return _mm256_setzero_si256(); } + static simdutf_really_inline simd16 load(const T values[8]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const __m256i _value) : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { return _mm256_add_epi16(*this, other); } + simdutf_really_inline simd16 operator-(const simd16 other) const { return _mm256_sub_epi16(*this, other); } + simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } +}; + +// Signed words +template<> +struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m256i _value) : base16_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + // Order-sensitive comparisons + simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm256_max_epi16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm256_min_epi16(*this, other); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return _mm256_cmpgt_epi16(*this, other); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return _mm256_cmpgt_epi16(other, *this); } +}; + +// Unsigned words +template<> +struct simd16: base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m256i _value) : base16_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + + // Saturated math + simdutf_really_inline simd16 saturating_add(const simd16 other) const { return _mm256_adds_epu16(*this, other); } + simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return _mm256_subs_epu16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm256_max_epu16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm256_min_epu16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd16 operator<=(const simd16 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd16 operator>=(const simd16 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd16 operator>(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { return *this == uint16_t(0); } + simdutf_really_inline simd16 bits_not_set(simd16 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd16 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd16 any_bits_set(simd16 bits) const { return ~this->bits_not_set(bits); } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd16 bits) const { return _mm256_testz_si256(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd16 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd16 shr() const { return simd16(_mm256_srli_epi16(*this, N)); } + template + simdutf_really_inline simd16 shl() const { return simd16(_mm256_slli_epi16(*this, N)); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); } + + // Change the endianness + simdutf_really_inline simd16 swap_bytes() const { + const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + return _mm256_shuffle_epi8(*this, swap); + } + + // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + // Note: the AVX2 variant of pack operates on 128-bit lanes, thus + // we have to shuffle lanes in order to produce bytes in the + // correct order. + + // get the 0th lanes + const __m128i lo_0 = _mm256_extracti128_si256(v0, 0); + const __m128i lo_1 = _mm256_extracti128_si256(v1, 0); + + // get the 1st lanes + const __m128i hi_0 = _mm256_extracti128_si256(v0, 1); + const __m128i hi_1 = _mm256_extracti128_si256(v1, 1); + + // build new vectors (shuffle lanes) + const __m256i t0 = _mm256_set_m128i(lo_1, lo_0); + const __m256i t1 = _mm256_set_m128i(hi_1, hi_0); + + // pack words in linear order from v0 and v1 + return _mm256_packus_epi16(t0, t1); + } +}; + + + template + struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32& o) = delete; // no copy allowed + simd16x32& operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1) : chunks{chunk0, chunk1} {} + simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + + simdutf_really_inline simd16 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)); + } + + simdutf_really_inline simd16x32 bit_or(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] | mask, + this->chunks[1] | mask + ); + } + + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] == mask, + this->chunks[1] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd16x32 &other) const { + return simd16x32( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] <= mask, + this->chunks[1] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(static_cast(low-1)); + const simd16 mask_high = simd16::splat(static_cast(high+1)); + return simd16x32( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] < mask, + this->chunks[1] < mask + ).to_bitmask(); + } + }; // struct simd16x32 +/* end file src/simdutf/haswell/simd16-inl.h */ + +} // namespace simd + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_SIMD_H +/* end file src/simdutf/haswell/simd.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h +/* begin file src/simdutf/haswell/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/haswell/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_HASWELL +#endif // SIMDUTF_HASWELL_COMMON_H +/* end file src/simdutf/haswell.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere.h +/* begin file src/simdutf/westmere.h */ +#ifndef SIMDUTF_WESTMERE_H +#define SIMDUTF_WESTMERE_H + +#ifdef SIMDUTF_FALLBACK_H +#error "westmere.h must be included before fallback.h" +#endif + + +// Default Westmere to on if this is x86-64, unless we'll always select Haswell. +#ifndef SIMDUTF_IMPLEMENTATION_WESTMERE +// +// You do not want to set it to (SIMDUTF_IS_X86_64 && !SIMDUTF_REQUIRES_HASWELL) +// because you want to rely on runtime dispatch! +// +#define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64) +#endif +#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__ && __PCLMUL__) + +#if SIMDUTF_IMPLEMENTATION_WESTMERE + +#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,pclmul") + +namespace simdutf { +/** + * Implementation for Westmere (Intel SSE4.2). + */ +namespace westmere { +} // namespace westmere +} // namespace simdutf + +// +// These two need to be included outside SIMDUTF_TARGET_REGION +// +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/implementation.h +/* begin file src/simdutf/westmere/implementation.h */ +#ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H +#define SIMDUTF_WESTMERE_IMPLEMENTATION_H + + +// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION +namespace simdutf { +namespace westmere { + +namespace { +using namespace simdutf; +} + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42 | internal::instruction_set::PCLMULQDQ) {} + simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace westmere +} // namespace simdutf + +#endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H +/* end file src/simdutf/westmere/implementation.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/intrinsics.h +/* begin file src/simdutf/westmere/intrinsics.h */ +#ifndef SIMDUTF_WESTMERE_INTRINSICS_H +#define SIMDUTF_WESTMERE_INTRINSICS_H + +#ifdef SIMDUTF_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDUTF_VISUAL_STUDIO + + +#ifdef SIMDUTF_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + */ +#include // for _mm_alignr_epi8 +#include // for _mm_clmulepi64_si128 +#endif + + + +#endif // SIMDUTF_WESTMERE_INTRINSICS_H +/* end file src/simdutf/westmere/intrinsics.h */ + +// +// The rest need to be inside the region +// +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h +/* begin file src/simdutf/westmere/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "westmere" +// #define SIMDUTF_IMPLEMENTATION westmere +SIMDUTF_TARGET_WESTMERE +/* end file src/simdutf/westmere/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmanipulation.h +/* begin file src/simdutf/westmere/bitmanipulation.h */ +#ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H +#define SIMDUTF_WESTMERE_BITMANIPULATION_H + +namespace simdutf { +namespace westmere { +namespace { + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdutf_really_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +#endif // SIMDUTF_WESTMERE_BITMANIPULATION_H +/* end file src/simdutf/westmere/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd.h +/* begin file src/simdutf/westmere/simd.h */ +#ifndef SIMDUTF_WESTMERE_SIMD_H +#define SIMDUTF_WESTMERE_SIMD_H + +namespace simdutf { +namespace westmere { +namespace { +namespace simd { + + template + struct base { + __m128i value; + + // Zero constructor + simdutf_really_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdutf_really_inline base(const __m128i _value) : value(_value) {} + // Conversion to SIMD register + simdutf_really_inline operator const __m128i&() const { return this->value; } + simdutf_really_inline operator __m128i&() { return this->value; } + template + simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const { + __m128i first = _mm_cvtepu8_epi16(*this); + __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8)); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + first = _mm_shuffle_epi8(first, swap); + second = _mm_shuffle_epi8(second, swap); + } + _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first); + _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second); + } + simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const { + _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4))); + _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8))); + _mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12))); + } + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); } + simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); } + simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); } + simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); } + simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. + template + struct simd8; + + template> + struct base8: base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); } + simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); } + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m128i _value) : base>(_value) {} + + simdutf_really_inline Mask operator==(const simd8 other) const { return _mm_cmpeq_epi8(*this, other); } + + static const int SIZE = sizeof(base>::value); + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return _mm_alignr_epi8(*this, prev_chunk, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdutf_really_inline simd8 splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); } + + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m128i _value) : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} + + simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } + simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); } + simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; } + simdutf_really_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdutf_really_inline simd8 splat(T _value) { return _mm_set1_epi8(_value); } + static simdutf_really_inline simd8 zero() { return _mm_setzero_si128(); } + static simdutf_really_inline simd8 load(const T values[16]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m128i _value) : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { return _mm_add_epi8(*this, other); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return _mm_sub_epi8(*this, other); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm_shuffle_epi8(lookup_table, *this); + } + + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + simdutf_really_inline operator simd8() const; + simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; } + + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epi8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epi8(*this, other); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return _mm_cmpgt_epi8(*this, other); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return _mm_cmpgt_epi8(other, *this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) : base8_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu8(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu8(*this, other); } + + // Order-specific operations + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epu8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epu8(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdutf_really_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd8 shr() const { return simd8(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdutf_really_inline simd8 shl() const { return simd8(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } + }; + simdutf_really_inline simd8::operator simd8() const { return this->value; } + + // Unsigned bytes + template<> + struct simd8: base { + static simdutf_really_inline simd8 splat(uint16_t _value) { return _mm_set1_epi16(_value); } + static simdutf_really_inline simd8 load(const uint16_t values[8]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + + simdutf_really_inline simd8() : base() {} + simdutf_really_inline simd8(const __m128i _value) : base(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7 + ) : simd8(_mm_setr_epi16( + v0, v1, v2, v3, v4, v5, v6, v7 + )) {} + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu16(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epu16(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epu16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd8 operator==(const simd8 other) const { return _mm_cmpeq_epi16(*this, other); } + simdutf_really_inline simd8 operator&(const simd8 other) const { return _mm_and_si128(*this, other); } + simdutf_really_inline simd8 operator|(const simd8 other) const { return _mm_or_si128(*this, other); } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { return *this == uint16_t(0); } + simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + }; + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd8)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd8)*3/sizeof(T)); + } + + simdutf_really_inline simd8x64& operator |=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + this->chunks[2] |= other.chunks[2]; + this->chunks[3] |= other.chunks[3]; + return *this; + } + + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + template + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr+sizeof(simd8)*0); + this->chunks[1].template store_ascii_as_utf16(ptr+sizeof(simd8)*1); + this->chunks[2].template store_ascii_as_utf16(ptr+sizeof(simd8)*2); + this->chunks[3].template store_ascii_as_utf16(ptr+sizeof(simd8)*3); + } + + simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const { + this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8)*0); + this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8)*1); + this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8)*2); + this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); + uint64_t r1 = this->chunks[1].to_bitmask() ; + uint64_t r2 = this->chunks[2].to_bitmask() ; + uint64_t r3 = this->chunks[3].to_bitmask() ; + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low-1); + const simd8 mask_high = simd8::splat(high+1); + return simd8x64( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low), + (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), + (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] > mask, + this->chunks[1] > mask, + this->chunks[2] > mask, + this->chunks[3] > mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] >= mask, + this->chunks[1] >= mask, + this->chunks[2] >= mask, + this->chunks[3] >= mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + simd8(__m128i(this->chunks[0])) >= mask, + simd8(__m128i(this->chunks[1])) >= mask, + simd8(__m128i(this->chunks[2])) >= mask, + simd8(__m128i(this->chunks[3])) >= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd16-inl.h +/* begin file src/simdutf/westmere/simd16-inl.h */ +template +struct simd16; + +template> +struct base16: base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline base16() : base>() {} + simdutf_really_inline base16(const __m128i _value) : base>(_value) {} + template + simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast(ptr))) {} + + simdutf_really_inline Mask operator==(const simd16 other) const { return _mm_cmpeq_epi16(*this, other); } + + static const int SIZE = sizeof(base>::value); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return _mm_alignr_epi8(*this, prev_chunk, 16 - N); + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template<> +struct simd16: base16 { + static simdutf_really_inline simd16 splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); } + + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const __m128i _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + + simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } + simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); } + simdutf_really_inline simd16 operator~() const { return *this ^ true; } +}; + +template +struct base16_numeric: base16 { + static simdutf_really_inline simd16 splat(T _value) { return _mm_set1_epi16(_value); } + static simdutf_really_inline simd16 zero() { return _mm_setzero_si128(); } + static simdutf_really_inline simd16 load(const T values[8]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const __m128i _value) : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { return _mm_add_epi16(*this, other); } + simdutf_really_inline simd16 operator-(const simd16 other) const { return _mm_sub_epi16(*this, other); } + simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } +}; + +// Signed words +template<> +struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m128i _value) : base16_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + // Member-by-member initialization + simdutf_really_inline simd16( + int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7) + : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {} + simdutf_really_inline operator simd16() const; + + // Order-sensitive comparisons + simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm_max_epi16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm_min_epi16(*this, other); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return _mm_cmpgt_epi16(*this, other); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return _mm_cmpgt_epi16(other, *this); } +}; + +// Unsigned words +template<> +struct simd16: base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m128i _value) : base16_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + // Member-by-member initialization + simdutf_really_inline simd16( + uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7) + : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd16 repeat_16( + uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7 + ) { + return simd16(v0, v1, v2, v3, v4, v5, v6, v7); + } + + // Saturated math + simdutf_really_inline simd16 saturating_add(const simd16 other) const { return _mm_adds_epu16(*this, other); } + simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return _mm_subs_epu16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd16 max_val(const simd16 other) const { return _mm_max_epu16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return _mm_min_epu16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd16 operator<=(const simd16 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd16 operator>=(const simd16 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd16 operator>(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return this->gt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { return *this == uint16_t(0); } + simdutf_really_inline simd16 bits_not_set(simd16 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd16 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd16 any_bits_set(simd16 bits) const { return ~this->bits_not_set(bits); } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd16 bits) const { return _mm_testz_si128(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd16 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd16 shr() const { return simd16(_mm_srli_epi16(*this, N)); } + template + simdutf_really_inline simd16 shl() const { return simd16(_mm_slli_epi16(*this, N)); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } + + // Change the endianness + simdutf_really_inline simd16 swap_bytes() const { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + return _mm_shuffle_epi8(*this, swap); + } + + // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + return _mm_packus_epi16(v0, v1); + } +}; +simdutf_really_inline simd16::operator simd16() const { return this->value; } + +template + struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32& o) = delete; // no copy allowed + simd16x32& operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1, const simd16 chunk2, const simd16 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T)), simd16::load(ptr+2*sizeof(simd16)/sizeof(T)), simd16::load(ptr+3*sizeof(simd16)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd16)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd16)*3/sizeof(T)); + } + + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); + uint64_t r1 = this->chunks[1].to_bitmask() ; + uint64_t r2 = this->chunks[2].to_bitmask() ; + uint64_t r3 = this->chunks[3].to_bitmask() ; + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); + this->chunks[2] = this->chunks[2].swap_bytes(); + this->chunks[3] = this->chunks[3].swap_bytes(); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd16x32 &other) const { + return simd16x32( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(static_cast(low-1)); + const simd16 mask_high = simd16::splat(static_cast(high+1)); + return simd16x32( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low), + (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), + (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + }; // struct simd16x32 +/* end file src/simdutf/westmere/simd16-inl.h */ + +} // namespace simd +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +#endif // SIMDUTF_WESTMERE_SIMD_INPUT_H +/* end file src/simdutf/westmere/simd.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h +/* begin file src/simdutf/westmere/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/westmere/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_WESTMERE +#endif // SIMDUTF_WESTMERE_COMMON_H +/* end file src/simdutf/westmere.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64.h +/* begin file src/simdutf/ppc64.h */ +#ifndef SIMDUTF_PPC64_H +#define SIMDUTF_PPC64_H + +#ifdef SIMDUTF_FALLBACK_H +#error "ppc64.h must be included before fallback.h" +#endif + + +#ifndef SIMDUTF_IMPLEMENTATION_PPC64 +#define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64) +#endif +#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64 + + + +#if SIMDUTF_IMPLEMENTATION_PPC64 + +namespace simdutf { +/** + * Implementation for ALTIVEC (PPC64). + */ +namespace ppc64 { +} // namespace ppc64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/implementation.h +/* begin file src/simdutf/ppc64/implementation.h */ +#ifndef SIMDUTF_PPC64_IMPLEMENTATION_H +#define SIMDUTF_PPC64_IMPLEMENTATION_H + + +namespace simdutf { +namespace ppc64 { + +namespace { +using namespace simdutf; +} // namespace + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() + : simdutf::implementation("ppc64", "PPC64 ALTIVEC", + internal::instruction_set::ALTIVEC) {} + simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace ppc64 +} // namespace simdutf + +#endif // SIMDUTF_PPC64_IMPLEMENTATION_H +/* end file src/simdutf/ppc64/implementation.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h +/* begin file src/simdutf/ppc64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "ppc64" +// #define SIMDUTF_IMPLEMENTATION ppc64 +/* end file src/simdutf/ppc64/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/intrinsics.h +/* begin file src/simdutf/ppc64/intrinsics.h */ +#ifndef SIMDUTF_PPC64_INTRINSICS_H +#define SIMDUTF_PPC64_INTRINSICS_H + + +// This should be the correct header whether +// you use visual studio or other compilers. +#include + +// These are defined by altivec.h in GCC toolchain, it is safe to undef them. +#ifdef bool +#undef bool +#endif + +#ifdef vector +#undef vector +#endif + +#endif // SIMDUTF_PPC64_INTRINSICS_H +/* end file src/simdutf/ppc64/intrinsics.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h +/* begin file src/simdutf/ppc64/bitmanipulation.h */ +#ifndef SIMDUTF_PPC64_BITMANIPULATION_H +#define SIMDUTF_PPC64_BITMANIPULATION_H + +namespace simdutf { +namespace ppc64 { +namespace { + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline int count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num); // Visual Studio wants two underscores +} +#else +simdutf_really_inline int count_ones(uint64_t input_num) { + return __builtin_popcountll(input_num); +} +#endif + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +#endif // SIMDUTF_PPC64_BITMANIPULATION_H +/* end file src/simdutf/ppc64/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/simd.h +/* begin file src/simdutf/ppc64/simd.h */ +#ifndef SIMDUTF_PPC64_SIMD_H +#define SIMDUTF_PPC64_SIMD_H + +#include + +namespace simdutf { +namespace ppc64 { +namespace { +namespace simd { + +using __m128i = __vector unsigned char; + +template struct base { + __m128i value; + + // Zero constructor + simdutf_really_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdutf_really_inline base(const __m128i _value) : value(_value) {} + + // Conversion to SIMD register + simdutf_really_inline operator const __m128i &() const { + return this->value; + } + simdutf_really_inline operator __m128i &() { return this->value; } + + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { + return vec_or(this->value, (__m128i)other); + } + simdutf_really_inline Child operator&(const Child other) const { + return vec_and(this->value, (__m128i)other); + } + simdutf_really_inline Child operator^(const Child other) const { + return vec_xor(this->value, (__m128i)other); + } + simdutf_really_inline Child bit_andnot(const Child other) const { + return vec_andc(this->value, (__m128i)other); + } + simdutf_really_inline Child &operator|=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast | other; + return *this_cast; + } + simdutf_really_inline Child &operator&=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast & other; + return *this_cast; + } + simdutf_really_inline Child &operator^=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast ^ other; + return *this_cast; + } +}; + +// Forward-declared so they can be used by splat and friends. +template struct simd8; + +template > +struct base8 : base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m128i _value) : base>(_value) {} + + simdutf_really_inline Mask operator==(const simd8 other) const { + return (__m128i)vec_cmpeq(this->value, (__m128i)other); + } + + static const int SIZE = sizeof(base>::value); + + template + simdutf_really_inline simd8 prev(simd8 prev_chunk) const { + __m128i chunk = this->value; +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve(this->value); + prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk); +#endif + chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N); +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve((__m128i)chunk); +#endif + return chunk; + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base8 { + static simdutf_really_inline simd8 splat(bool _value) { + return (__m128i)vec_splats((unsigned char)(-(!!_value))); + } + + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m128i _value) + : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) + : base8(splat(_value)) {} + + simdutf_really_inline int to_bitmask() const { + __vector unsigned long long result; + const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, + 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00}; + + result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value, + (__m128i)perm_mask)); +#ifdef __LITTLE_ENDIAN__ + return static_cast(result[1]); +#else + return static_cast(result[0]); +#endif + } + simdutf_really_inline bool any() const { + return !vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdutf_really_inline simd8 operator~() const { + return this->value ^ (__m128i)splat(true); + } +}; + +template struct base8_numeric : base8 { + static simdutf_really_inline simd8 splat(T value) { + (void)value; + return (__m128i)vec_splats(value); + } + static simdutf_really_inline simd8 zero() { return splat(0); } + static simdutf_really_inline simd8 load(const T values[16]) { + return (__m128i)(vec_vsx_ld(0, reinterpret_cast(values))); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15); + } + + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m128i _value) + : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[16]) const { + vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst)); + } + + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { + return (__m128i)((__m128i)this->value + (__m128i)other); + } + simdutf_really_inline simd8 operator-(const simd8 other) const { + return (__m128i)((__m128i)this->value - (__m128i)other); + } + simdutf_really_inline simd8 &operator+=(const simd8 other) { + *this = *this + other; + return *static_cast *>(this); + } + simdutf_really_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *static_cast *>(this); + } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value); + } + + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } +}; + +// Signed bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, + v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Order-sensitive comparisons + simdutf_really_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdutf_really_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return (__m128i)vec_cmpgt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdutf_really_inline simd8 + operator<(const simd8 other) const { + return (__m128i)vec_cmplt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } +}; + +// Unsigned bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Saturated math + simdutf_really_inline simd8 + saturating_add(const simd8 other) const { + return (__m128i)vec_adds(this->value, (__m128i)other); + } + simdutf_really_inline simd8 + saturating_sub(const simd8 other) const { + return (__m128i)vec_subs(this->value, (__m128i)other); + } + + // Order-specific operations + simdutf_really_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max(this->value, (__m128i)other); + } + simdutf_really_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min(this->value, (__m128i)other); + } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return this->saturating_sub(other); + } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + lt_bits(const simd8 other) const { + return other.saturating_sub(*this); + } + simdutf_really_inline simd8 + operator<=(const simd8 other) const { + return other.max_val(*this) == other; + } + simdutf_really_inline simd8 + operator>=(const simd8 other) const { + return other.min_val(*this) == other; + } + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } + simdutf_really_inline simd8 + operator<(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { + return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0))); + } + simdutf_really_inline simd8 bits_not_set(simd8 bits) const { + return (*this & bits).bits_not_set(); + } + simdutf_really_inline simd8 any_bits_set() const { + return ~this->bits_not_set(); + } + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { + return ~this->bits_not_set(bits); + } + + simdutf_really_inline bool is_ascii() const { + return this->saturating_sub(0b01111111u).bits_not_set_anywhere(); + } + + simdutf_really_inline bool bits_not_set_anywhere() const { + return vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdutf_really_inline bool any_bits_set_anywhere() const { + return !bits_not_set_anywhere(); + } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { + return vec_all_eq(vec_and(this->value, (__m128i)bits), + (__m128i)vec_splats(0)); + } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { + return !bits_not_set_anywhere(bits); + } + template simdutf_really_inline simd8 shr() const { + return simd8( + (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N))); + } + template simdutf_really_inline simd8 shl() const { + return simd8( + (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N))); + } +}; + +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, + "PPC64 kernel should use four registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + + simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0/sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd8) * 1/sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd8) * 2/sizeof(T)); + this->chunks[3].store(ptr + sizeof(simd8) * 3/sizeof(T)); + } + + + simdutf_really_inline simd8x64& operator |=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + this->chunks[2] |= other.chunks[2]; + this->chunks[3] |= other.chunks[3]; + return *this; + } + + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); + } + + + simdutf_really_inline bool is_ascii() const { + return input.reduce_or().is_ascii(); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask) + .to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64(this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3]) + .to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask) + .to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + return simd8x64( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask, + this->chunks[2] < mask, this->chunks[3] < mask) + .to_bitmask(); + } + + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] > mask, + this->chunks[1] > mask, + this->chunks[2] > mask, + this->chunks[3] > mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] >= mask, + this->chunks[1] >= mask, + this->chunks[2] >= mask, + this->chunks[3] >= mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + simd8(this->chunks[0]) >= mask, + simd8(this->chunks[1]) >= mask, + simd8(this->chunks[2]) >= mask, + simd8(this->chunks[3]) >= mask + ).to_bitmask(); + } +}; // struct simd8x64 + +} // namespace simd +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +#endif // SIMDUTF_PPC64_SIMD_INPUT_H +/* end file src/simdutf/ppc64/simd.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h +/* begin file src/simdutf/ppc64/end.h */ +/* end file src/simdutf/ppc64/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_PPC64 + +#endif // SIMDUTF_PPC64_H +/* end file src/simdutf/ppc64.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback.h +/* begin file src/simdutf/fallback.h */ +#ifndef SIMDUTF_FALLBACK_H +#define SIMDUTF_FALLBACK_H + + +// Default Fallback to on unless a builtin implementation has already been selected. +#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK +#define SIMDUTF_IMPLEMENTATION_FALLBACK 1 // (!SIMDUTF_CAN_ALWAYS_RUN_ARM64 && !SIMDUTF_CAN_ALWAYS_RUN_HASWELL && !SIMDUTF_CAN_ALWAYS_RUN_WESTMERE && !SIMDUTF_CAN_ALWAYS_RUN_PPC64) +#endif +#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK SIMDUTF_IMPLEMENTATION_FALLBACK + +#if SIMDUTF_IMPLEMENTATION_FALLBACK + +namespace simdutf { +/** + * Fallback implementation (runs on any machine). + */ +namespace fallback { +} // namespace fallback +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/implementation.h +/* begin file src/simdutf/fallback/implementation.h */ +#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H +#define SIMDUTF_FALLBACK_IMPLEMENTATION_H + + +namespace simdutf { +namespace fallback { + +namespace { +using namespace simdutf; +} + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation( + "fallback", + "Generic fallback implementation", + 0 + ) {} + simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace fallback +} // namespace simdutf + +#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H +/* end file src/simdutf/fallback/implementation.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h +/* begin file src/simdutf/fallback/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/bitmanipulation.h +/* begin file src/simdutf/fallback/bitmanipulation.h */ +#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H +#define SIMDUTF_FALLBACK_BITMANIPULATION_H + +#include + +namespace simdutf { +namespace fallback { +namespace { + +#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64) +static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) { + unsigned long x0 = (unsigned long)x, top, bottom; + _BitScanForward(&top, (unsigned long)(x >> 32)); + _BitScanForward(&bottom, x0); + *ret = x0 ? bottom : 32 + top; + return x != 0; +} +static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) { + unsigned long x1 = (unsigned long)(x >> 32), top, bottom; + _BitScanReverse(&top, x1); + _BitScanReverse(&bottom, (unsigned long)x); + *ret = x1 ? top + 32 : bottom; + return x != 0; +} +#endif + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int leading_zeroes(uint64_t input_num) { +#ifdef _MSC_VER + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// _MSC_VER +} + +} // unnamed namespace +} // namespace fallback +} // namespace simdutf + +#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H +/* end file src/simdutf/fallback/bitmanipulation.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_FALLBACK +#endif // SIMDUTF_FALLBACK_H +/* end file src/simdutf/fallback.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake.h +/* begin file src/simdutf/icelake.h */ +#ifndef SIMDUTF_ICELAKE_H +#define SIMDUTF_ICELAKE_H + + + +#ifdef __has_include +// How do we detect that a compiler supports vbmi2? +// For sure if the following header is found, we are ok? +#if __has_include() +#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1 +#endif +#endif + +#ifdef _MSC_VER +#if _MSC_VER >= 1920 +// Visual Studio 2019 and up support VBMI2 under x64 even if the header +// avx512vbmi2intrin.h is not found. +#define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1 +#endif +#endif + +// We allow icelake on x64 as long as the compiler is known to support VBMI2. +#ifndef SIMDUTF_IMPLEMENTATION_ICELAKE +#define SIMDUTF_IMPLEMENTATION_ICELAKE ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2)) +#endif + +// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see +// https://github.com/simdutf/simdutf/issues/1247 +#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && \ + SIMDUTF_HAS_AVX512DQ && \ + SIMDUTF_HAS_AVX512VL && \ + SIMDUTF_HAS_AVX512VBMI2)) + +#if SIMDUTF_IMPLEMENTATION_ICELAKE +#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE +#define SIMDUTF_TARGET_ICELAKE +#define SIMDJSON_UNTARGET_ICELAKE +#else +#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt") +#define SIMDUTF_UNTARGET_ICELAKE SIMDUTF_UNTARGET_REGION +#endif + +namespace simdutf { +namespace icelake { +} // namespace icelake +} // namespace simdutf + + + +// +// These two need to be included outside SIMDUTF_TARGET_REGION +// +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/intrinsics.h +/* begin file src/simdutf/icelake/intrinsics.h */ +#ifndef SIMDUTF_ICELAKE_INTRINSICS_H +#define SIMDUTF_ICELAKE_INTRINSICS_H + + +#ifdef SIMDUTF_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#include +#else +#include // elsewhere +#ifndef _tzcnt_u64 +#define _tzcnt_u64(x) __tzcnt_u64(x) +#endif // _tzcnt_u64 +#endif // SIMDUTF_VISUAL_STUDIO + +#ifdef SIMDUTF_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdutf, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. + */ +#include // for _blsr_u64 +#include // for _pext_u64, _pdep_u64 +#include // for __lzcnt64 +#include // for most things (AVX2, AVX512, _popcnt64) +#include +#include +#include +#include +#include // for _mm_clmulepi64_si128 +// Important: we need the AVX-512 headers: +#include +#include +#include +#include +#include +#include +#include +#include +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. +#ifndef _blsr_u64 +// we roll our own +#define _blsr_u64(n) ((n - 1) & n) +#endif // _blsr_u64 +#endif // SIMDUTF_CLANG_VISUAL_STUDIO + + + +#if defined(__GNUC__) && !defined(__clang__) +#if __GNUC__ == 8 +#define SIMDUTF_GCC8 1 +#endif // __GNUC__ == 8 +#endif // defined(__GNUC__) && !defined(__clang__) + +#if SIMDUTF_GCC8 +/** + * GCC 8 fails to provide _mm512_set_epi8. We roll our own. + */ +inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) { + return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56), + uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56), + uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56), + uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56), + uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56), + uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56), + uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56), + uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56)); +} +#endif // SIMDUTF_GCC8 + +#endif // SIMDUTF_HASWELL_INTRINSICS_H +/* end file src/simdutf/icelake/intrinsics.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/implementation.h +/* begin file src/simdutf/icelake/implementation.h */ +#ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H +#define SIMDUTF_ICELAKE_IMPLEMENTATION_H + + +namespace simdutf { +namespace icelake { + +namespace { +using namespace simdutf; +} + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation( + "icelake", + "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)", + internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 ) {} + simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final; + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final; + void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace icelake +} // namespace simdutf + +#endif // SIMDUTF_ICELAKE_IMPLEMENTATION_H +/* end file src/simdutf/icelake/implementation.h */ + +// +// The rest need to be inside the region +// +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h +/* begin file src/simdutf/icelake/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "icelake" +// #define SIMDUTF_IMPLEMENTATION icelake +SIMDUTF_TARGET_ICELAKE + +/* end file src/simdutf/icelake/begin.h */ +// Declarations +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/bitmanipulation.h +/* begin file src/simdutf/icelake/bitmanipulation.h */ +#ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H +#define SIMDUTF_ICELAKE_BITMANIPULATION_H + +namespace simdutf { +namespace icelake { +namespace { + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdutf_really_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +} // unnamed namespace +} // namespace icelake +} // namespace simdutf + +#endif // SIMDUTF_ICELAKE_BITMANIPULATION_H +/* end file src/simdutf/icelake/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h +/* begin file src/simdutf/icelake/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/icelake/end.h */ + + + +#endif // SIMDUTF_IMPLEMENTATION_ICELAKE +#endif // SIMDUTF_ICELAKE_H +/* end file src/simdutf/icelake.h */ + +namespace simdutf { +bool implementation::supported_by_runtime_system() const { + uint32_t required_instruction_sets = this->required_instruction_sets(); + uint32_t supported_instruction_sets = internal::detect_supported_architectures(); + return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets); +} + +simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + // UTF8 is common, it includes ASCII, and is commonly represented + // without a BOM, so if it fits, go with that. Note that it is still + // possible to get it wrong, we are only 'guessing'. If some has UTF-16 + // data without a BOM, it could pass as UTF-8. + // + // An interesting twist might be to check for UTF-16 ASCII first (every + // other byte is zero). + if(validate_utf8(input, length)) { return encoding_type::UTF8; } + // The next most common encoding that might appear without BOM is probably + // UTF-16LE, so try that next. + if((length % 2) == 0) { + // important: we need to divide by two + if(validate_utf16le(reinterpret_cast(input), length/2)) { return encoding_type::UTF16_LE; } + } + if((length % 4) == 0) { + if(validate_utf32(reinterpret_cast(input), length/4)) { return encoding_type::UTF32_LE; } + } + return encoding_type::unspecified; +} + +namespace internal { + +// Static array of known implementations. We're hoping these get baked into the executable +// without requiring a static initializer. + + +#if SIMDUTF_IMPLEMENTATION_ICELAKE +const icelake::implementation icelake_singleton{}; +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL +const haswell::implementation haswell_singleton{}; +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE +const westmere::implementation westmere_singleton{}; +#endif +#if SIMDUTF_IMPLEMENTATION_ARM64 +const arm64::implementation arm64_singleton{}; +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 +const ppc64::implementation ppc64_singleton{}; +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK +const fallback::implementation fallback_singleton{}; +#endif + +/** + * @private Detects best supported implementation on first use, and sets it + */ +class detect_best_supported_implementation_on_first_use final : public implementation { +public: + const std::string &name() const noexcept final { return set_best()->name(); } + const std::string &description() const noexcept final { return set_best()->description(); } + uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); } + + simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept override { + return set_best()->detect_encodings(input, length); + } + + simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override { + return set_best()->validate_utf8(buf, len); + } + + simdutf_warn_unused result validate_utf8_with_errors(const char * buf, size_t len) const noexcept final override { + return set_best()->validate_utf8_with_errors(buf, len); + } + + simdutf_warn_unused bool validate_ascii(const char * buf, size_t len) const noexcept final override { + return set_best()->validate_ascii(buf, len); + } + + simdutf_warn_unused result validate_ascii_with_errors(const char * buf, size_t len) const noexcept final override { + return set_best()->validate_ascii_with_errors(buf, len); + } + + simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->validate_utf16le(buf, len); + } + + simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->validate_utf16be(buf, len); + } + + simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->validate_utf16le_with_errors(buf, len); + } + + simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->validate_utf16be_with_errors(buf, len); + } + + simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) const noexcept final override { + return set_best()->validate_utf32(buf, len); + } + + simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) const noexcept final override { + return set_best()->validate_utf32_with_errors(buf, len); + } + + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output); + } + + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output); + } + + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override { + return set_best()->convert_utf8_to_utf32(buf, len, utf32_output); + } + + simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override { + return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output); + } + + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output); + } + + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_utf32_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output); + } + + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output); + } + + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output); + } + + simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output); + } + + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output); + } + + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { + return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output); + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override { + return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output); + } + + void change_endianness_utf16(const char16_t * buf, size_t len, char16_t * output) const noexcept final override { + set_best()->change_endianness_utf16(buf, len, output); + } + + simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->count_utf16le(buf, len); + } + + simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->count_utf16be(buf, len); + } + + simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override { + return set_best()->count_utf8(buf, len); + } + + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf16le(buf, len); + } + + simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf16be(buf, len); + } + + simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override { + return set_best()->utf32_length_from_utf16le(buf, len); + } + + simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override { + return set_best()->utf32_length_from_utf16be(buf, len); + } + + simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override { + return set_best()->utf16_length_from_utf8(buf, len); + } + + simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf32(buf, len); + } + + simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * buf, size_t len) const noexcept override { + return set_best()->utf16_length_from_utf32(buf, len); + } + + simdutf_warn_unused size_t utf32_length_from_utf8(const char * buf, size_t len) const noexcept override { + return set_best()->utf32_length_from_utf8(buf, len); + } + + simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} + +private: + const implementation *set_best() const noexcept; +}; + +const detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; + +const std::initializer_list available_implementation_pointers { +#if SIMDUTF_IMPLEMENTATION_ICELAKE + &icelake_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL + &haswell_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE + &westmere_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_ARM64 + &arm64_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 + &ppc64_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK + &fallback_singleton, +#endif +}; // available_implementation_pointers + +// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support +class unsupported_implementation final : public implementation { +public: + simdutf_warn_unused int detect_encodings(const char *, size_t) const noexcept override { + return encoding_type::unspecified; + } + + simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override { + return false; // Just refuse to validate. Given that we have a fallback implementation + // it seems unlikely that unsupported_implementation will ever be used. If it is used, + // then it will flag all strings as invalid. The alternative is to return an error_code + // from which the user has to figure out whether the string is valid UTF-8... which seems + // like a lot of work just to handle the very unlikely case that we have an unsupported + // implementation. And, when it does happen (that we have an unsupported implementation), + // what are the chances that the programmer has a fallback? Given that *we* provide the + // fallback, it implies that the programmer would need a fallback for our fallback. + } + + simdutf_warn_unused result validate_utf8_with_errors(const char *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused bool validate_ascii(const char *, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused result validate_ascii_with_errors(const char *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override { + return 0; + } + + void change_endianness_utf16(const char16_t *, size_t, char16_t *) const noexcept final override { + + } + + simdutf_warn_unused size_t count_utf16le(const char16_t *, size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t count_utf16be(const char16_t *, size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t utf32_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } + + unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} +}; + +const unsupported_implementation unsupported_singleton{}; + +size_t available_implementation_list::size() const noexcept { + return internal::available_implementation_pointers.size(); +} +const implementation * const *available_implementation_list::begin() const noexcept { + return internal::available_implementation_pointers.begin(); +} +const implementation * const *available_implementation_list::end() const noexcept { + return internal::available_implementation_pointers.end(); +} +const implementation *available_implementation_list::detect_best_supported() const noexcept { + // They are prelisted in priority order, so we just go down the list + uint32_t supported_instruction_sets = internal::detect_supported_architectures(); + for (const implementation *impl : internal::available_implementation_pointers) { + uint32_t required_instruction_sets = impl->required_instruction_sets(); + if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; } + } + return &unsupported_singleton; // this should never happen? +} + +const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept { + SIMDUTF_PUSH_DISABLE_WARNINGS + SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); + SIMDUTF_POP_DISABLE_WARNINGS + + if (force_implementation_name) { + auto force_implementation = available_implementations[force_implementation_name]; + if (force_implementation) { + return active_implementation = force_implementation; + } else { + // Note: abort() and stderr usage within the library is forbidden. + return active_implementation = &unsupported_singleton; + } + } + return active_implementation = available_implementations.detect_best_supported(); +} + +} // namespace internal + +SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations{}; +SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton}; + +simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { + return active_implementation->validate_utf8(buf, len); +} +simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept { + return active_implementation->validate_utf8_with_errors(buf, len); +} +simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept { + return active_implementation->validate_ascii(buf, len); +} +simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept { + return active_implementation->validate_ascii_with_errors(buf, len); +} +simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept { + return active_implementation->convert_utf8_to_utf16le(input, length, utf16_output); +} +simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept { + return active_implementation->convert_utf8_to_utf16be(input, length, utf16_output); +} +simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept { + return active_implementation->convert_utf8_to_utf16le_with_errors(input, length, utf16_output); +} +simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept { + return active_implementation->convert_utf8_to_utf16be_with_errors(input, length, utf16_output); +} +simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept { + return active_implementation->convert_utf8_to_utf32(input, length, utf32_output); +} +simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept { + return active_implementation->convert_utf8_to_utf32_with_errors(input, length, utf32_output); +} +simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept { + return active_implementation->validate_utf16le(buf, len); +} +simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept { + return active_implementation->validate_utf16be(buf, len); +} +simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept { + return active_implementation->validate_utf16le_with_errors(buf, len); +} +simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept { + return active_implementation->validate_utf16be_with_errors(buf, len); +} +simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept { + return active_implementation->validate_utf32(buf, len); +} +simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept { + return active_implementation->validate_utf32_with_errors(buf, len); +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_valid_utf8_to_utf16le(input, length, utf16_buffer); +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_valid_utf8_to_utf16be(input, length, utf16_buffer); +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept { + return active_implementation->convert_valid_utf8_to_utf32(input, length, utf32_buffer); +} +simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_utf16le_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_utf16be_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); +} +simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_utf32_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_valid_utf32_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_utf32_to_utf16le(buf, len, utf16_buffer); +} +simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_utf32_to_utf16be(buf, len, utf16_buffer); +} +simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer); +} +simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); +} +simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); +} +simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); +} +simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { + return active_implementation->convert_utf16le_to_utf32(buf, len, utf32_buffer); +} +simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { + return active_implementation->convert_utf16be_to_utf32(buf, len, utf32_buffer); +} +simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { + return active_implementation->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); +} +simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { + return active_implementation->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { + return active_implementation->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept { + return active_implementation->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); +} +void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept { + active_implementation->change_endianness_utf16(input, length, output); +} +simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept { + return active_implementation->count_utf16le(input, length); +} +simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept { + return active_implementation->count_utf16be(input, length); +} +simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept { + return active_implementation->count_utf8(input, length); +} +simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept { + return active_implementation->utf8_length_from_utf16le(input, length); +} +simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept { + return active_implementation->utf8_length_from_utf16be(input, length); +} +simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept { + return active_implementation->utf32_length_from_utf16le(input, length); +} +simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept { + return active_implementation->utf32_length_from_utf16be(input, length); +} +simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept { + return active_implementation->utf16_length_from_utf8(input, length); +} +simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept { + return active_implementation->utf8_length_from_utf32(input, length); +} +simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept { + return active_implementation->utf16_length_from_utf32(input, length); +} +simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept { + return active_implementation->utf32_length_from_utf8(input, length); +} +simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept { + return active_implementation->autodetect_encoding(buf, length); +} +simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept { + return active_implementation->detect_encodings(buf, length); +} + +const implementation * builtin_implementation() { + static const implementation * builtin_impl = available_implementations[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)]; + return builtin_impl; +} + + +} // namespace simdutf + +/* end file src/implementation.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=encoding_types.cpp +/* begin file src/encoding_types.cpp */ + +namespace simdutf { +std::string to_string(encoding_type bom) { + switch (bom) { + case UTF16_LE: return "UTF16 little-endian"; + case UTF16_BE: return "UTF16 big-endian"; + case UTF32_LE: return "UTF32 little-endian"; + case UTF32_BE: return "UTF32 big-endian"; + case UTF8: return "UTF8"; + case unspecified: return "unknown"; + default: return "error"; + } +} + +namespace BOM { +// Note that BOM for UTF8 is discouraged. +encoding_type check_bom(const uint8_t* byte, size_t length) { + if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) { + if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) { + return encoding_type::UTF32_LE; + } else { + return encoding_type::UTF16_LE; + } + } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) { + return encoding_type::UTF16_BE; + } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) { + return encoding_type::UTF32_BE; + } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) { + return encoding_type::UTF8; + } + return encoding_type::unspecified; + } + +encoding_type check_bom(const char* byte, size_t length) { + return check_bom(reinterpret_cast(byte), length); + } + + size_t bom_byte_size(encoding_type bom) { + switch (bom) { + case UTF16_LE: return 2; + case UTF16_BE: return 2; + case UTF32_LE: return 4; + case UTF32_BE: return 4; + case UTF8: return 3; + case unspecified: return 0; + default: return 0; + } +} + +} +} +/* end file src/encoding_types.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=error.cpp +/* begin file src/error.cpp */ +namespace simdutf { + + simdutf_really_inline result::result() : error{error_code::SUCCESS}, count{0} {}; + + simdutf_really_inline result::result(error_code _err, size_t _pos) : error{_err}, count{_pos} {}; + +} +/* end file src/error.cpp */ +// The large tables should be included once and they +// should not depend on a kernel. +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf8_to_utf16_tables.h +/* begin file src/tables/utf8_to_utf16_tables.h */ +#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H +#define SIMDUTF_UTF8_TO_UTF16_TABLES_H +#include + +namespace simdutf { +namespace { +namespace tables { +namespace utf8_to_utf16 { +/** + * utf8bigindex uses about 8 kB + * shufutf8 uses about 3344 B + * + * So we use a bit over 11 kB. It would be + * easy to save about 4 kB by only + * storing the index in utf8bigindex, and + * deriving the consumed bytes otherwise. + * However, this may come at a significant (10% to 20%) + * performance penalty. + */ + +const uint8_t shufutf8[209][16] = +{ {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}}; +/* number of two bytes : 64 */ +/* number of two + three bytes : 145 */ +/* number of two + three + four bytes : 209 */ +const uint8_t utf8bigindex[4096][2] = +{ {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {152, 7}, + {164, 7}, + {145, 3}, + {0, 12}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {171, 8}, + {72, 8}, + {183, 8}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {186, 8}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {187, 9}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {77, 7}, + {95, 7}, + {7, 9}, + {194, 7}, + {83, 7}, + {101, 7}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {192, 11}, + {152, 7}, + {164, 7}, + {145, 3}, + {204, 11}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {207, 11}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {117, 11}, + {72, 8}, + {135, 11}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {141, 11}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {143, 11}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {31, 11}, + {47, 11}, + {7, 9}, + {194, 7}, + {83, 7}, + {55, 11}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {59, 11}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {61, 11}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {62, 11}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {152, 7}, + {164, 7}, + {145, 3}, + {0, 12}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {208, 12}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {171, 8}, + {72, 8}, + {183, 8}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {186, 8}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {144, 12}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {77, 7}, + {95, 7}, + {7, 9}, + {194, 7}, + {83, 7}, + {101, 7}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {63, 12}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {192, 11}, + {152, 7}, + {164, 7}, + {145, 3}, + {204, 11}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {207, 11}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {117, 11}, + {72, 8}, + {135, 11}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {141, 11}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {143, 11}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {31, 11}, + {47, 11}, + {7, 9}, + {194, 7}, + {83, 7}, + {55, 11}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {59, 11}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {61, 11}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {62, 11}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}}; +} // utf8_to_utf16 namespace +} // tables namespace +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H +/* end file src/tables/utf8_to_utf16_tables.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf16_to_utf8_tables.h +/* begin file src/tables/utf16_to_utf8_tables.h */ +// file generated by scripts/sse_convert_utf16_to_utf8.py +#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H +#define SIMDUTF_UTF16_TO_UTF8_TABLES_H + +namespace simdutf { +namespace { +namespace tables { +namespace utf16_to_utf8 { + + // 1 byte for length, 16 bytes for mask + const uint8_t pack_1_2_utf8_bytes[256][17] = { + {16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}, + {15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80}, + {15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80}, + {14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, + {15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80}, + {14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80}, + {14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, + {13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80}, + {14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80}, + {14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, + {13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, + {13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80}, + {14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80}, + {14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, + {13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, + {13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80}, + {13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80} + }; + + // 1 byte for length, 16 bytes for mask + const uint8_t pack_1_2_3_utf8_bytes[256][17] = { + {12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80}, + {9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, + {10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, + {8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, + {8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80}, + {8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80} + }; + +} // utf16_to_utf8 namespace +} // tables namespace +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H +/* end file src/tables/utf16_to_utf8_tables.h */ +// End of tables. + +// The scalar routines should be included once. +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/ascii.h +/* begin file src/scalar/ascii.h */ +#ifndef SIMDUTF_ASCII_H +#define SIMDUTF_ASCII_H + +namespace simdutf { +namespace scalar { +namespace { +namespace ascii { + +inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { + const uint8_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + // process in blocks of 16 bytes when possible + for (;pos + 16 < len; pos += 16) { + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) != 0) { return false; } + } + // process the tail byte-by-byte + for (;pos < len; pos ++) { + if (data[pos] >= 0b10000000) { return false; } + } + return true; +} + +inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + // process in blocks of 16 bytes when possible + for (;pos + 16 < len; pos += 16) { + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) != 0) { + for (;pos < len; pos ++) { + if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); } + } + } + } + // process the tail byte-by-byte + for (;pos < len; pos ++) { + if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); } + } + return result(error_code::SUCCESS, pos); +} + +} // ascii namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/ascii.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8.h +/* begin file src/scalar/utf8.h */ +#ifndef SIMDUTF_UTF8_H +#define SIMDUTF_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8 { +// credit: based on code from Google Fuchsia (Apache Licensed) +inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { + const uint8_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + uint32_t code_point = 0; + while (pos < len) { + // check of the next 8 bytes are ascii. + uint64_t next_pos = pos + 16; + if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + + while (byte < 0b10000000) { + if (++pos == len) { return true; } + byte = data[pos]; + } + + if ((byte & 0b11100000) == 0b11000000) { + next_pos = pos + 2; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } + // range check + code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if ((code_point < 0x80) || (0x7ff < code_point)) { return false; } + } else if ((byte & 0b11110000) == 0b11100000) { + next_pos = pos + 3; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } + // range check + code_point = (byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if ((code_point < 0x800) || (0xffff < code_point) || + (0xd7ff < code_point && code_point < 0xe000)) { + return false; + } + } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; } + // range check + code_point = + (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { return false; } + } else { + // we may have a continuation + return false; + } + pos = next_pos; + } + return true; +} + +inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + uint32_t code_point = 0; + while (pos < len) { + // check of the next 8 bytes are ascii. + uint64_t next_pos = pos + 16; + if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + + while (byte < 0b10000000) { + if (++pos == len) { return result(error_code::SUCCESS, len); } + byte = data[pos]; + } + + if ((byte & 0b11100000) == 0b11000000) { + next_pos = pos + 2; + if (next_pos > len) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + // range check + code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if ((code_point < 0x80) || (0x7ff < code_point)) { return result(error_code::OVERLONG, pos); } + } else if ((byte & 0b11110000) == 0b11100000) { + next_pos = pos + 3; + if (next_pos > len) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + // range check + code_point = (byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);} + if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); } + } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + // range check + code_point = + (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); } + if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); } + } else { + // we either have too many continuation bytes or an invalid leading byte + if ((byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); } + else { return result(error_code::HEADER_BITS, pos); } + } + pos = next_pos; + } + return result(error_code::SUCCESS, len); +} + +// Finds the previous leading byte and validates with errors from there +// Used to pinpoint the location of an error when an invalid chunk is detected +inline simdutf_warn_unused result rewind_and_validate_with_errors(const char *buf, size_t len) noexcept { + size_t extra_len{0}; + // A leading byte cannot be further than 4 bytes away + for(int i = 0; i < 5; i++) { + unsigned char byte = *buf; + if ((byte & 0b11000000) != 0b10000000) { + break; + } else { + buf--; + extra_len++; + } + } + + result res = validate_with_errors(buf, len + extra_len); + res.count -= extra_len; + return res; +} + +inline size_t count_code_points(const char* buf, size_t len) { + const int8_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + // -65 is 0b10111111, anything larger in two-complement's should start a new code point. + if(p[i] > -65) { counter++; } + } + return counter; +} + +inline size_t utf16_length_from_utf8(const char* buf, size_t len) { + const int8_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + if(p[i] > -65) { counter++; } + if(uint8_t(p[i]) >= 240) { counter++; } + } + return counter; +} + +inline size_t utf32_length_from_utf8(const char* buf, size_t len) { + const int8_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + // -65 is 0b10111111, anything larger in two-complement's should start a new code point. + if(p[i] > -65) { counter++; } + } + return counter; +} + +} // utf8 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16.h +/* begin file src/scalar/utf16.h */ +#ifndef SIMDUTF_UTF16_H +#define SIMDUTF_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16 { + +inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) { + return uint16_t((word >> 8) | (word << 8)); +} + +template +inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept { + const uint16_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + while (pos < len) { + uint16_t word = big_endian ? swap_bytes(data[pos]) : data[pos]; + if((word &0xF800) == 0xD800) { + if(pos + 1 >= len) { return false; } + uint16_t diff = uint16_t(word - 0xD800); + if(diff > 0x3FF) { return false; } + uint16_t next_word = big_endian ? uint16_t((data[pos + 1] >> 8) | (data[pos + 1] << 8)) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if(diff2 > 0x3FF) { return false; } + pos += 2; + } else { + pos++; + } + } + return true; +} + +template +inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, size_t len) noexcept { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + while (pos < len) { + uint16_t word = big_endian ? swap_bytes(data[pos]) : data[pos]; + if((word & 0xF800) == 0xD800) { + if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } + uint16_t diff = uint16_t(word - 0xD800); + if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } + uint16_t next_word = big_endian ? uint16_t((data[pos + 1] >> 8) | (data[pos + 1] << 8)) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } + pos += 2; + } else { + pos++; + } + } + return result(error_code::SUCCESS, pos); +} + +template +inline size_t count_code_points(const char16_t* buf, size_t len) { + // We are not BOM aware. + const uint16_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + uint16_t word = big_endian ? swap_bytes(p[i]) : p[i]; + counter += ((word & 0xFC00) != 0xDC00); + } + return counter; +} + +template +inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) { + // We are not BOM aware. + const uint16_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + uint16_t word = big_endian ? swap_bytes(p[i]) : p[i]; + /** ASCII **/ + if(word <= 0x7F) { counter++; } + /** two-byte **/ + else if (word <= 0x7FF) { counter += 2; } + /** three-byte **/ + else if((word <= 0xD7FF) || (word >= 0xE000)) { counter += 3; } + /** surrogates -- 4 bytes **/ + else { counter += 2; } + } + return counter; +} + +template +inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) { + // We are not BOM aware. + const uint16_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + uint16_t word = big_endian ? swap_bytes(p[i]) : p[i]; + counter += ((word & 0xFC00) != 0xDC00); + } + return counter; +} + +simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out) { + const uint16_t * input = reinterpret_cast(in); + uint16_t * output = reinterpret_cast(out); + for (size_t i = 0; i < size; i++) { + *output++ = uint16_t(input[i] >> 8 | input[i] << 8); + } +} + +} // utf16 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32.h +/* begin file src/scalar/utf32.h */ +#ifndef SIMDUTF_UTF32_H +#define SIMDUTF_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32 { + +inline simdutf_warn_unused bool validate(const char32_t *buf, size_t len) noexcept { + const uint32_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + for(;pos < len; pos++) { + uint32_t word = data[pos]; + if(word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) { + return false; + } + } + return true; +} + +inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, size_t len) noexcept { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + for(;pos < len; pos++) { + uint32_t word = data[pos]; + if(word > 0x10FFFF) { + return result(error_code::TOO_LARGE, pos); + } + if(word >= 0xD800 && word <= 0xDFFF) { + return result(error_code::SURROGATE, pos); + } + } + return result(error_code::SUCCESS, pos); +} + +inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len) { + // We are not BOM aware. + const uint32_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + /** ASCII **/ + if(p[i] <= 0x7F) { counter++; } + /** two-byte **/ + else if(p[i] <= 0x7FF) { counter += 2; } + /** three-byte **/ + else if(p[i] <= 0xFFFF) { counter += 3; } + /** four-bytes **/ + else { counter += 4; } + } + return counter; +} + +inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) { + // We are not BOM aware. + const uint32_t * p = reinterpret_cast(buf); + size_t counter{0}; + for(size_t i = 0; i < len; i++) { + /** non-surrogate word **/ + if(p[i] <= 0xFFFF) { counter++; } + /** surrogate pair **/ + else { counter += 2; } + } + return counter; +} + +} // utf32 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/valid_utf32_to_utf8.h +/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ +#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H +#define SIMDUTF_VALID_UTF32_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf8 { + +inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{utf8_output}; + while (pos < len) { + // try to convert the next block of 2 ASCII characters + if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF80FFFFFF80) == 0) { + *utf8_output++ = char(buf[pos]); + *utf8_output++ = char(buf[pos+1]); + pos += 2; + continue; + } + } + uint32_t word = data[pos]; + if((word & 0xFFFFFF80)==0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if((word & 0xFFFFF800)==0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if((word & 0xFFFF0000)==0) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos ++; + } + } + return utf8_output - start; +} + +} // utf32_to_utf8 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/utf32_to_utf8.h +/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */ +#ifndef SIMDUTF_UTF32_TO_UTF8_H +#define SIMDUTF_UTF32_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf8 { + +inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{utf8_output}; + while (pos < len) { + // try to convert the next block of 2 ASCII characters + if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF80FFFFFF80) == 0) { + *utf8_output++ = char(buf[pos]); + *utf8_output++ = char(buf[pos+1]); + pos += 2; + continue; + } + } + uint32_t word = data[pos]; + if((word & 0xFFFFFF80)==0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if((word & 0xFFFFF800)==0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if((word & 0xFFFF0000)==0) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + if (word >= 0xD800 && word <= 0xDFFF) { return 0; } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + if (word > 0x10FFFF) { return 0; } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos ++; + } + } + return utf8_output - start; +} + +inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{utf8_output}; + while (pos < len) { + // try to convert the next block of 2 ASCII characters + if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF80FFFFFF80) == 0) { + *utf8_output++ = char(buf[pos]); + *utf8_output++ = char(buf[pos+1]); + pos += 2; + continue; + } + } + uint32_t word = data[pos]; + if((word & 0xFFFFFF80)==0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if((word & 0xFFFFF800)==0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if((word & 0xFFFF0000)==0) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos ++; + } + } + return result(error_code::SUCCESS, utf8_output - start); +} + +} // utf32_to_utf8 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/valid_utf32_to_utf16.h +/* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ +#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H +#define SIMDUTF_VALID_UTF32_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf16 { + +template +inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{utf16_output}; + while (pos < len) { + uint32_t word = data[pos]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); + pos++; + } else { + // will generate a surrogate pair + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos++; + } + } + return utf16_output - start; +} + +} // utf32_to_utf16 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/utf32_to_utf16.h +/* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ +#ifndef SIMDUTF_UTF32_TO_UTF16_H +#define SIMDUTF_UTF32_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf16 { + +template +inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{utf16_output}; + while (pos < len) { + uint32_t word = data[pos]; + if((word & 0xFFFF0000)==0) { + if (word >= 0xD800 && word <= 0xDFFF) { return 0; } + // will not generate a surrogate pair + *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return 0; } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + pos++; + } + return utf16_output - start; +} + +template +inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{utf16_output}; + while (pos < len) { + uint32_t word = data[pos]; + if((word & 0xFFFF0000)==0) { + if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); } + // will not generate a surrogate pair + *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + pos++; + } + return result(error_code::SUCCESS, utf16_output - start); +} + +} // utf32_to_utf16 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h +/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H +#define SIMDUTF_VALID_UTF16_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf8 { + +template +inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{utf8_output}; + while (pos < len) { + // try to convert the next block of 4 ASCII characters + if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if (big_endian) v = (v >> 8) | (v << (64 - 8)); + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while(pos < final_pos) { + *utf8_output++ = big_endian ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); + pos++; + } + continue; + } + } + uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + if((word & 0xFF80)==0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if((word & 0xF800)==0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if((word &0xF800 ) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if(pos + 1 >= len) { return 0; } // minimal bound checking + uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return utf8_output - start; +} + +} // utf16_to_utf8 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h +/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ +#ifndef SIMDUTF_UTF16_TO_UTF8_H +#define SIMDUTF_UTF16_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf8 { + +template +inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{utf8_output}; + while (pos < len) { + // try to convert the next block of 8 ASCII characters + if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if (big_endian) v = (v >> 8) | (v << (64 - 8)); + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while(pos < final_pos) { + *utf8_output++ = big_endian ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); + pos++; + } + continue; + } + } + uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + if((word & 0xFF80)==0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if((word & 0xF800)==0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if((word &0xF800 ) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + if(pos + 1 >= len) { return 0; } + uint16_t diff = uint16_t(word - 0xD800); + if(diff > 0x3FF) { return 0; } + uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if(diff2 > 0x3FF) { return 0; } + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return utf8_output - start; +} + +template +inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char* start{utf8_output}; + while (pos < len) { + // try to convert the next block of 8 ASCII characters + if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if (big_endian) v = (v >> 8) | (v << (64 - 8)); + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while(pos < final_pos) { + *utf8_output++ = big_endian ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]); + pos++; + } + continue; + } + } + uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + if((word & 0xFF80)==0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if((word & 0xF800)==0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if((word &0xF800 ) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } + uint16_t diff = uint16_t(word - 0xD800); + if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } + uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return result(error_code::SUCCESS, utf8_output - start); +} + +} // utf16_to_utf8 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/valid_utf16_to_utf32.h +/* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H +#define SIMDUTF_VALID_UTF16_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf32 { + +template +inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t* start{utf32_output}; + while (pos < len) { + uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + if((word &0xF800 ) != 0xD800) { + // No surrogate pair, extend 16-bit word to 32-bit word + *utf32_output++ = char32_t(word); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if(pos + 1 >= len) { return 0; } // minimal bound checking + uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + pos += 2; + } + } + return utf32_output - start; +} + +} // utf16_to_utf32 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/utf16_to_utf32.h +/* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ +#ifndef SIMDUTF_UTF16_TO_UTF32_H +#define SIMDUTF_UTF16_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf32 { + +template +inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t* start{utf32_output}; + while (pos < len) { + uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + if((word &0xF800 ) != 0xD800) { + // No surrogate pair, extend 16-bit word to 32-bit word + *utf32_output++ = char32_t(word); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if(diff > 0x3FF) { return 0; } + if(pos + 1 >= len) { return 0; } // minimal bound checking + uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if(diff2 > 0x3FF) { return 0; } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + pos += 2; + } + } + return utf32_output - start; +} + +template +inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t* start{utf32_output}; + while (pos < len) { + uint16_t word = big_endian ? utf16::swap_bytes(data[pos]) : data[pos]; + if((word &0xF800 ) != 0xD800) { + // No surrogate pair, extend 16-bit word to 32-bit word + *utf32_output++ = char32_t(word); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } + if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // minimal bound checking + uint16_t next_word = big_endian ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + pos += 2; + } + } + return result(error_code::SUCCESS, utf32_output - start); +} + +} // utf16_to_utf32 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ +#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H +#define SIMDUTF_VALID_UTF8_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf16 { + +template +inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{utf16_output}; + while (pos < len) { + // try to convert the next block of 8 ASCII bytes + if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 8; + while(pos < final_pos) { + *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 1 >= len) { break; } // minimal bound checking + uint16_t code_point = uint16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111)); + if (big_endian) { + code_point = utf16::swap_bytes(uint16_t(code_point)); + } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 2 >= len) { break; } // minimal bound checking + uint16_t code_point = uint16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111)); + if (big_endian) { + code_point = utf16::swap_bytes(uint16_t(code_point)); + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if(pos + 3 >= len) { break; } // minimal bound checking + uint32_t code_point = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12) + | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111); + code_point -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); + if (big_endian) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos += 4; + } else { + // we may have a continuation but we do not do error checking + return 0; + } + } + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ +#ifndef SIMDUTF_UTF8_TO_UTF16_H +#define SIMDUTF_UTF8_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf16 { + +template +inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{utf16_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while(pos < final_pos) { + *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 1 >= len) { return 0; } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + // range check + uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { return 0; } + if (big_endian) { + code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 2 >= len) { return 0; } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return 0; + } + if (big_endian) { + code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if(pos + 3 >= len) { return 0; } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } + + // range check + uint32_t code_point = + (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } + code_point -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); + if (big_endian) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos += 4; + } else { + return 0; + } + } + return utf16_output - start; +} + +template +inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t* start{utf16_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while(pos < final_pos) { + *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = big_endian ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + // range check + uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); } + if (big_endian) { + code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. + if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);} + if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); } + if (big_endian) { + code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + + // range check + uint32_t code_point = + (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); } + if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); } + code_point -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); + if (big_endian) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos += 4; + } else { + // we either have too many continuation bytes or an invalid leading byte + if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); } + else { return result(error_code::HEADER_BITS, pos); } + } + } + return result(error_code::SUCCESS, utf16_output - start); +} + +template +inline result rewind_and_convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) { + size_t extra_len{0}; + // A leading byte cannot be further than 4 bytes away + for(int i = 0; i < 5; i++) { + unsigned char byte = *buf; + if ((byte & 0b11000000) != 0b10000000) { + break; + } else { + buf--; + extra_len++; + } + } + + result res = convert_with_errors(buf, len + extra_len, utf16_output); + if (res.error) { + res.count -= extra_len; + } + return res; +} + +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/valid_utf8_to_utf32.h +/* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ +#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H +#define SIMDUTF_VALID_UTF8_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf32 { + +inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t* start{utf32_output}; + while (pos < len) { + // try to convert the next block of 8 ASCII bytes + if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 8; + while(pos < final_pos) { + *utf32_output++ = char32_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf32_output++ = char32_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8 + if(pos + 1 >= len) { break; } // minimal bound checking + *utf32_output++ = char32_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111)); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8 + if(pos + 2 >= len) { break; } // minimal bound checking + *utf32_output++ = char32_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111)); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if(pos + 3 >= len) { break; } // minimal bound checking + uint32_t code_word = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12) + | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111); + *utf32_output++ = char32_t(code_word); + pos += 4; + } else { + // we may have a continuation but we do not do error checking + return 0; + } + } + return utf32_output - start; +} + + +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/utf8_to_utf32.h +/* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ +#ifndef SIMDUTF_UTF8_TO_UTF32_H +#define SIMDUTF_UTF8_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf32 { + +inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t* start{utf32_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while(pos < final_pos) { + *utf32_output++ = char32_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf32_output++ = char32_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8 + if(pos + 1 >= len) { return 0; } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + // range check + uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { return 0; } + *utf32_output++ = char32_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8 + if(pos + 2 >= len) { return 0; } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return 0; + } + *utf32_output++ = char32_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if(pos + 3 >= len) { return 0; } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } + + // range check + uint32_t code_point = + (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } + *utf32_output++ = char32_t(code_point); + pos += 4; + } else { + return 0; + } + } + return utf32_output - start; +} + +inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t* start{utf32_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while(pos < final_pos) { + *utf32_output++ = char32_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf32_output++ = char32_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8 + if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + // range check + uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); } + *utf32_output++ = char32_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8 + if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point) { return result(error_code::OVERLONG, pos); } + if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); } + *utf32_output++ = char32_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos);} + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); } + + // range check + uint32_t code_point = + (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); } + if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); } + *utf32_output++ = char32_t(code_point); + pos += 4; + } else { + // we either have too many continuation bytes or an invalid leading byte + if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); } + else { return result(error_code::HEADER_BITS, pos); } + } + } + return result(error_code::SUCCESS, utf32_output - start); +} + +inline result rewind_and_convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) { + size_t extra_len{0}; + // A leading byte cannot be further than 4 bytes away + for(int i = 0; i < 5; i++) { + unsigned char byte = *buf; + if ((byte & 0b11000000) != 0b10000000) { + break; + } else { + buf--; + extra_len++; + } + } + + result res = convert_with_errors(buf, len + extra_len, utf32_output); + if (res.error) { + res.count -= extra_len; + } + return res; +} + +} // utf8_to_utf32 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ +// + + +SIMDUTF_PUSH_DISABLE_WARNINGS +SIMDUTF_DISABLE_UNDESIRED_WARNINGS + + +#if SIMDUTF_IMPLEMENTATION_ARM64 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/implementation.cpp +/* begin file src/arm64/implementation.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h +/* begin file src/simdutf/arm64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "arm64" +// #define SIMDUTF_IMPLEMENTATION arm64 +/* end file src/simdutf/arm64/begin.h */ +namespace simdutf { +namespace arm64 { +namespace { +#ifndef SIMDUTF_ARM64_H +#error "arm64.h must be included" +#endif +using namespace simd; + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + simd8 bits = input.reduce_or(); + return bits.max_val() < 0b10000000u; +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1 >= uint8_t(0b11000000u); + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well. + // This will work fine because we only have to report errors for cases with 0-1 lead bytes. + // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is + // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character. + // The error will be detected there. + return is_second_byte ^ is_third_byte ^ is_fourth_byte; +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; +} + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_detect_encodings.cpp +/* begin file src/arm64/arm_detect_encodings.cpp */ +template +// len is known to be a multiple of 2 when this is called +int arm_detect_encodings(const char * buf, size_t len) { + const char* start = buf; + const char* end = buf + len; + + bool is_utf8 = true; + bool is_utf16 = true; + bool is_utf32 = true; + + int out = 0; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + + uint32x4_t currentmax = vmovq_n_u32(0x0); + + checker check{}; + + while(buf + 64 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + uint16x8_t secondin = vld1q_u16(reinterpret_cast(buf) + simd16::SIZE / sizeof(char16_t)); + uint16x8_t thirdin = vld1q_u16(reinterpret_cast(buf) + 2*simd16::SIZE / sizeof(char16_t)); + uint16x8_t fourthin = vld1q_u16(reinterpret_cast(buf) + 3*simd16::SIZE / sizeof(char16_t)); + + const auto u0 = simd16(in); + const auto u1 = simd16(secondin); + const auto u2 = simd16(thirdin); + const auto u3 = simd16(fourthin); + + const auto v0 = u0.shr<8>(); + const auto v1 = u1.shr<8>(); + const auto v2 = u2.shr<8>(); + const auto v3 = u3.shr<8>(); + + const auto in16 = simd16::pack(v0, v1); + const auto nextin16 = simd16::pack(v2, v3); + + const uint64_t surrogates_wordmask0 = ((in16 & v_f8) == v_d8).to_bitmask64(); + const uint64_t surrogates_wordmask1 = ((nextin16 & v_f8) == v_d8).to_bitmask64(); + + // Check for surrogates + if (surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) { + // Cannot be UTF8 + is_utf8 = false; + // Can still be either UTF-16LE or UTF-32LE depending on the positions of the surrogates + // To be valid UTF-32LE, a surrogate cannot be in the two most significant bytes of any 32-bit word. + // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant + // bytes of a 32-bit word since they always come in pairs in UTF-16LE. + // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words. + + if (((surrogates_wordmask0 | surrogates_wordmask1) & 0xf0f0f0f0f0f0f0f0) != 0) { + is_utf32 = false; + // Code from arm_validate_utf16le.cpp + // Not efficient, we do not process surrogates_wordmask1 + const char16_t * input = reinterpret_cast(buf); + const char16_t* end16 = reinterpret_cast(start) + len/2; + + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + const uint64_t V0 = ~surrogates_wordmask0; + + const auto vH0 = ((in16 & v_fc) == v_dc); + const uint64_t H0 = vH0.to_bitmask64(); + + const uint64_t L0 = ~H0 & surrogates_wordmask0; + + const uint64_t a0 = L0 & (H0 >> 4); + + const uint64_t b0 = a0 << 4; + + const uint64_t c0 = V0 | a0 | b0; + if (c0 == ~0ull) { + input += 16; + } else if (c0 == 0xfffffffffffffffull) { + input += 15; + } else { + is_utf16 = false; + break; + } + + while (input + 16 < end16) { + const auto in0 = simd16(input); + const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8 in_16 = simd16::pack(t0, t1); + + const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64(); + if(surrogates_wordmask == 0) { + input += 16; + } else { + const uint64_t V = ~surrogates_wordmask; + + const auto vH = ((in_16 & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); + + const uint64_t L = ~H & surrogates_wordmask; + + const uint64_t a = L & (H >> 4); + + const uint64_t b = a << 4; + + const uint64_t c = V | a | b; + if (c == ~0ull) { + input += 16; + } else if (c == 0xfffffffffffffffull) { + input += 15; + } else { + is_utf16 = false; + break; + } + } + } + } else { + is_utf16 = false; + // Check for UTF-32LE + if (len % 4 == 0) { + const char32_t * input = reinterpret_cast(buf); + const char32_t* end32 = reinterpret_cast(start) + len/4; + + // Must start checking for surrogates + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); + + const uint32x4_t in32 = vreinterpretq_u32_u16(in); + const uint32x4_t secondin32 = vreinterpretq_u32_u16(secondin); + const uint32x4_t thirdin32 = vreinterpretq_u32_u16(thirdin); + const uint32x4_t fourthin32 = vreinterpretq_u32_u16(fourthin); + + currentmax = vmaxq_u32(in32,currentmax); + currentmax = vmaxq_u32(secondin32,currentmax); + currentmax = vmaxq_u32(thirdin32,currentmax); + currentmax = vmaxq_u32(fourthin32,currentmax); + + currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(thirdin32, offset), currentoffsetmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(fourthin32, offset), currentoffsetmax); + + while (input + 4 < end32) { + const uint32x4_t in_32 = vld1q_u32(reinterpret_cast(input)); + currentmax = vmaxq_u32(in_32,currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax); + input += 4; + } + + uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(vmaxvq_u32(forbidden_words) != 0) { + is_utf32 = false; + } + } else { + is_utf32 = false; + } + } + break; + } + // If no surrogate, validate under other encodings as well + + // UTF-32LE validation + currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax); + currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax); + currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax); + currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax); + + // UTF-8 validation + // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h + simd::simd8x64 in8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(secondin), vreinterpretq_u8_u16(thirdin), vreinterpretq_u8_u16(fourthin)); + check.check_next_input(in8); + + buf += 64; + } + + // Check which encodings are possible + + if (is_utf8) { + if (static_cast(buf - start) != len) { + uint8_t block[64]{}; + std::memset(block, 0x20, 64); + std::memcpy(block, buf, len - (buf - start)); + simd::simd8x64 in(block); + check.check_next_input(in); + } + if (!check.errors()) { + out |= simdutf::encoding_type::UTF8; + } + } + + if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { + out |= simdutf::encoding_type::UTF16_LE; + } + + if (is_utf32 && (len % 4 == 0)) { + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { + out |= simdutf::encoding_type::UTF32_LE; + } + } + + return out; +} +/* end file src/arm64/arm_detect_encodings.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf16.cpp +/* begin file src/arm64/arm_validate_utf16.cpp */ +template +const char16_t* arm_validate_utf16(const char16_t* input, size_t size) { + const char16_t* end = input + size; + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + while (input + 16 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap)); + in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap)); + } + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8 in = simd16::pack(t0, t1); + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); + if(surrogates_wordmask == 0) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate words + // V = not surrogates_wordmask + const uint64_t V = ~surrogates_wordmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = ((in & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint64_t L = ~H & surrogates_wordmask; + + const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint64_t b = a << 4; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint64_t c = V | a | b; // Combine all the masks into the final one. + if (c == ~0ull) { + // The whole input register contains valid UTF-16, i.e., + // either single words or proper surrogate pairs. + input += 16; + } else if (c == 0xfffffffffffffffull) { + // The 15 lower words of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += 15; + } else { + return nullptr; + } + } + } + return input; +} + + +template +const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) { + const char16_t* start = input; + const char16_t* end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + while (input + 16 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap)); + in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap)); + } + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8 in = simd16::pack(t0, t1); + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); + if(surrogates_wordmask == 0) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate words + // V = not surrogates_wordmask + const uint64_t V = ~surrogates_wordmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = ((in & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint64_t L = ~H & surrogates_wordmask; + + const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint64_t b = a << 4; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint64_t c = V | a | b; // Combine all the masks into the final one. + if (c == ~0ull) { + // The whole input register contains valid UTF-16, i.e., + // either single words or proper surrogate pairs. + input += 16; + } else if (c == 0xfffffffffffffffull) { + // The 15 lower words of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += 15; + } else { + return result(error_code::SURROGATE, input - start); + } + } + } + return result(error_code::SUCCESS, input - start); +} +/* end file src/arm64/arm_validate_utf16.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf32le.cpp +/* begin file src/arm64/arm_validate_utf32le.cpp */ + +const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) { + const char32_t* end = input + size; + + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); + uint32x4_t currentmax = vmovq_n_u32(0x0); + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + + while (input + 4 < end) { + const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); + currentmax = vmaxq_u32(in,currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); + input += 4; + } + + uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if(vmaxvq_u32(is_zero) != 0) { + return nullptr; + } + + is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(vmaxvq_u32(is_zero) != 0) { + return nullptr; + } + + return input; +} + + +const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) { + const char32_t* start = input; + const char32_t* end = input + size; + + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); + uint32x4_t currentmax = vmovq_n_u32(0x0); + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + + while (input + 4 < end) { + const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); + currentmax = vmaxq_u32(in,currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); + + uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if(vmaxvq_u32(is_zero) != 0) { + return result(error_code::TOO_LARGE, input - start); + } + + is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(vmaxvq_u32(is_zero) != 0) { + return result(error_code::SURROGATE, input - start); + } + + input += 4; + } + + return result(error_code::SUCCESS, input - start); +} +/* end file src/arm64/arm_validate_utf32le.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp +/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */ +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. + if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { + // We process in chunks of 16 bytes + uint16x8_t ascii_first = vmovl_u8(vget_low_u8 (in)); + uint16x8_t ascii_second = vmovl_high_u8(in); + if (big_endian) { + ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap)); + ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap)); + } + vst1q_u16(reinterpret_cast(utf16_output), ascii_first); + vst1q_u16(reinterpret_cast(utf16_output) + 8, ascii_second); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + uint8x16_t perm = vqtbl1q_u8(in, swap); + uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); + uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); + uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); + if (big_endian) composed = vqtbl1q_u8(composed, swap); + vst1q_u8(reinterpret_cast(utf16_output), composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255); +#else + const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}; +#endif + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits + uint8x16_t middlebyte = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits + uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); + uint32x4_t highbyte = + vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits + uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); + uint32x4_t composed = + vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); + uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed); + if (big_endian) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap)); + vst1q_u16(reinterpret_cast(utf16_output), composed_repacked); + utf16_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + + + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); + uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); + uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); + if (big_endian) composed = vqtbl1q_u8(composed, swap); + vst1q_u8(reinterpret_cast(utf16_output), composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-words + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits + uint8x16_t middlebyte = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits + uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); + uint32x4_t highbyte = + vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits + uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); + uint32x4_t composed = + vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); + uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed); + if (big_endian) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap)); + vst1q_u16(reinterpret_cast(utf16_output), composed_repacked); + utf16_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); + uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); + uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); + uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000))); + // correct for spurious high bit + uint8x16_t correct = + vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1)); + middlehighbyte = veorq_u8(correct, middlehighbyte); + uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4)); + uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000))); + uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6)); + uint8x16_t composed = + vorrq_u8(vorrq_u8(ascii, middlebyte_shifted), + vorrq_u8(highbyte_shifted, middlehighbyte_shifted)); + uint32x4_t composedminus = + vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000)); + uint32x4_t lowtenbits = + vandq_u32(composedminus, vmovq_n_u32(0x3ff)); + uint32x4_t hightenbits = vshrq_n_u32(composedminus, 10); + uint32x4_t lowtenbitsadd = + vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00)); + uint32x4_t hightenbitsadd = + vaddq_u32(hightenbits, vmovq_n_u32(0xD800)); + uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16); + uint32x4_t surrogates = + vorrq_u32(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + uint32_t basic_buffer_swap[4]; + if (big_endian) { + vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap))); + surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap)); + } + vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed)); + uint32_t surrogate_buffer[4]; + vst1q_u32(surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] < 65536) { + utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]); + utf16_output++; + } else { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf32.cpp +/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */ +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_out) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint32_t*& utf32_output = reinterpret_cast(utf32_out); + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. + if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { + // We process in chunks of 16 bytes + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (in))))); + vst1q_u32(utf32_output + 4, vmovl_high_u16(vmovl_u8(vget_low_u8 (in)))); + vst1q_u32(utf32_output + 8, vmovl_u16(vget_low_u16(vmovl_high_u8(in)))); + vst1q_u32(utf32_output + 12, vmovl_high_u16(vmovl_high_u8(in))); + utf32_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words. + // There is probably a more efficient sequence, but the following might do. +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); +#else + //const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; +#endif + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); + uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); + uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed)))); + vst1q_u32(utf32_output+4, vmovl_high_u16(vreinterpretq_u16_u8(composed))); + utf32_output += 8; // We wrote 32 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words. + // There is probably a more efficient sequence, but the following might do. +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255); +#else + const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}; +#endif + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits + uint8x16_t middlebyte = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits + uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); + uint32x4_t highbyte = + vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits + uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); + uint32x4_t composed = + vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); + vst1q_u32(utf32_output, composed); + utf32_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + + + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f))); + uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00))); + uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2))); + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed)))); + vst1q_u32(utf32_output+4, vmovl_high_u16(vreinterpretq_u16_u8(composed))); + utf32_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-words + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits + uint8x16_t middlebyte = + vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits + uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); + uint32x4_t highbyte = + vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits + uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4); + uint32x4_t composed = + vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted); + vst1q_u32(utf32_output, composed); + utf32_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + uint8x16_t sh = vld1q_u8(reinterpret_cast(simdutf::tables::utf8_to_utf16::shufutf8[idx])); + uint8x16_t perm = vqtbl1q_u8(in, sh); + uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); + uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); + uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2)); + uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000))); + // correct for spurious high bit + uint8x16_t correct = + vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1)); + middlehighbyte = veorq_u8(correct, middlehighbyte); + uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4)); + uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000))); + uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6)); + uint8x16_t composed = + vorrq_u8(vorrq_u8(ascii, middlebyte_shifted), + vorrq_u8(highbyte_shifted, middlehighbyte_shifted)); + vst1q_u32(utf32_output, vreinterpretq_u32_u8(composed)); + utf32_output += 3; + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp +/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit words. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit words + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit words, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) { + uint8_t * utf8_output = reinterpret_cast(utf8_out); + const char16_t* end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + + while (buf + 16 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap)); + } + if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII characters. + uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap)); + } + if(vmaxvq_u16(nextin) > 0x7F) { + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); + // 2. store (16 bytes) + vst1q_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + if (vmaxvq_u16(in) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080); +#else + const uint16x8_t mask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080 }; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + + } + const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(in, vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); + const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 ); + const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 ); +#else + const uint16x8_t onemask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 }; + const uint16x8_t twomask = { 0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 }; +#endif + const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} + + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the error. + Otherwise, it is the position of the first unprocessed byte in buf (even if finished). + A scalar routing should carry on the conversion of the tail if needed. +*/ +template +std::pair arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) { + uint8_t * utf8_output = reinterpret_cast(utf8_out); + const char16_t* start = buf; + const char16_t* end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + + while (buf + 16 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap)); + } + if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII characters. + uint16x8_t nextin = vld1q_u16(reinterpret_cast(buf) + 8); + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap)); + } + if(vmaxvq_u16(nextin) > 0x7F) { + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); + // 2. store (16 bytes) + vst1q_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + if (vmaxvq_u16(in) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080); +#else + const uint16x8_t mask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080 }; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + + } + const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(in, vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); + const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 ); + const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 ); +#else + const uint16x8_t onemask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 }; + const uint16x8_t twomask = { 0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 }; +#endif + const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf8_output)); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); +} +/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf32.cpp +/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit words. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit words + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit words, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) { + uint32_t * utf32_output = reinterpret_cast(utf32_out); + const char16_t* end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + + while (buf + 16 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap)); + } + + const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: no surrogate pairs, extend all 16-bit words to 32-bit words + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); + vst1q_u32(utf32_output+4, vmovl_high_u16(in)); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word &0xF800 ) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast(utf32_output)); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(buf, reinterpret_cast(utf32_output)); +} + + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the error. + Otherwise, it is the position of the first unprocessed byte in buf (even if finished). + A scalar routing should carry on the conversion of the tail if needed. +*/ +template +std::pair arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) { + uint32_t * utf32_output = reinterpret_cast(utf32_out); + const char16_t* start = buf; + const char16_t* end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + + while (buf + 16 <= end) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #else + const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + #endif + in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap)); + } + + const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: no surrogate pairs, extend all 16-bit words to 32-bit words + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); + vst1q_u32(utf32_output+4, vmovl_high_u16(in)); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word &0xF800 ) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast(utf32_output)); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf32_output)); +} +/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf8.cpp +/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */ +std::pair arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) { + uint8_t * utf8_output = reinterpret_cast(utf8_out); + const char32_t* end = buf + len; + + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + + uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0); + + while (buf + 16 <= end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf+4)); + + // Check if no bits set above 16th + if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) + uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); + if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } + + if (vmaxvq_u16(utf16_packed) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4)); + // 3. prepare bitmask for 8-bit lookup + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080); + #else + const uint16x8_t mask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080 }; + #endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + + } else { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); + forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask); + + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + #else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; + #endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + #define vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(utf16_packed, vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff); + const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); + #undef vec + + // 4. expand words 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 ); + const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 ); + #else + const uint16x8_t onemask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 }; + const uint16x8_t twomask = { 0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 }; + #endif + const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000)==0) { + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + // check for invalid input + if (vmaxvq_u16(forbidden_bytemask) != 0) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + } + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} + + +std::pair arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) { + uint8_t * utf8_output = reinterpret_cast(utf8_out); + const char32_t* start = buf; + const char32_t* end = buf + len; + + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + + while (buf + 16 <= end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf+4)); + + // Check if no bits set above 16th + if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) + uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); + if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } + + if (vmaxvq_u16(utf16_packed) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4)); + // 3. prepare bitmask for 8-bit lookup + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080); + #else + const uint16x8_t mask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0002, 0x0008, + 0x0020, 0x0080 }; + #endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + + } else { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + + // check for invalid input + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); + const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)); + if (vmaxvq_u16(forbidden_bytemask) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf8_output)); + } + + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + #else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; + #endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + #define vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(utf16_packed, vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff); + const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); + #undef vec + + // 4. expand words 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 ); + const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 ); + #else + const uint16x8_t onemask = { 0x0001, 0x0004, + 0x0010, 0x0040, + 0x0100, 0x0400, + 0x1000, 0x4000 }; + const uint16x8_t twomask = { 0x0002, 0x0008, + 0x0020, 0x0080, + 0x0200, 0x0800, + 0x2000, 0x8000 }; + #endif + const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000)==0) { + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast(utf8_output)); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast(utf8_output)); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); +} +/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf16.cpp +/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */ +template +std::pair arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) { + uint16_t * utf16_output = reinterpret_cast(utf16_out); + const char32_t* end = buf + len; + + uint16x4_t forbidden_bytemask = vmov_n_u16(0x0); + + while(buf + 4 <= end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + + // Check if no bits set above 16th + if(vmaxvq_u32(in) <= 0xFFFF) { + uint16x4_t utf16_packed = vmovn_u32(in); + + const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); + const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); + forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask); + + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6); + #else + const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6}; + #endif + utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap)); + } + vst1_u16(utf16_output, utf16_packed); + utf16_output += 4; + buf += 4; + } else { + size_t forward = 3; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } + *utf16_output++ = big_endian ? char16_t(word >> 8 | word << 8) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + // check for invalid input + if (vmaxv_u16(forbidden_bytemask) != 0) { + return std::make_pair(nullptr, reinterpret_cast(utf16_output)); + } + + return std::make_pair(buf, reinterpret_cast(utf16_output)); +} + + +template +std::pair arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) { + uint16_t * utf16_output = reinterpret_cast(utf16_out); + const char32_t* start = buf; + const char32_t* end = buf + len; + + while(buf + 4 <= end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + + // Check if no bits set above 16th + if(vmaxvq_u32(in) <= 0xFFFF) { + uint16x4_t utf16_packed = vmovn_u32(in); + + const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); + const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); + const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)); + if (vmaxv_u16(forbidden_bytemask) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast(utf16_output)); + } + + if (big_endian) { + #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6); + #else + const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6}; + #endif + utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap)); + } + vst1_u16(utf16_output, utf16_packed); + utf16_output += 4; + buf += 4; + } else { + size_t forward = 3; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast(utf16_output)); } + *utf16_output++ = big_endian ? char16_t(word >> 8 | word << 8) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast(utf16_output)); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf16_output)); +} +/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */ +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace arm64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +/** + * Validates that the string is actual UTF-8 and stops on errors. + */ +template +result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if(c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input + count), length - count); + res.count += count; + return res; + } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_utf8_with_errors(const char * input, size_t length) { + return generic_validate_utf8_with_errors(reinterpret_cast(input),length); +} + +template +bool generic_validate_ascii(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); +} + +bool generic_validate_ascii(const char * input, size_t length) { + return generic_validate_ascii(reinterpret_cast(input),length); +} + +template +result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); + + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_ascii_with_errors(const char * input, size_t length) { + return generic_validate_ascii_with_errors(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + +template +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow path. + // Anything that is not a continuation mask is a 'leading byte', that is, the + // start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16(input + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + template + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + template + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if(pos < size) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf32 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char32_t* utf32_output) noexcept { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + size_t max_starting_point = (pos + 64) - 12; + while(pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32(input + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); + return utf32_output - start; +} + + +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf32 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if(howmany == 0) { return 0; } + utf32_output += howmany; + } + return utf32_output - start; + } + + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if(pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf32 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +// other functions +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { + return count_code_points(in, size); +} +} // utf8 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (big_endian) input.swap_bytes(); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (big_endian) input.swap_bytes(); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { + return count_code_points(in, size); +} + +simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { + size_t pos = 0; + + while (pos + 32 <= size) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf16.h */ +// +// Implementation-specific overrides +// +namespace simdutf { +namespace arm64 { + +simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + if (length % 2 == 0) { + return arm_detect_encodings(input, length); + } else { + if (implementation::validate_utf8(input, length)) { + return simdutf::encoding_type::UTF8; + } else { + return simdutf::encoding_type::unspecified; + } + } +} + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8_with_errors(buf,len); +} + +simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_ascii(buf,len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_ascii_with_errors(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = arm_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = arm_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { + result res = arm_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { + result res = arm_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + const char32_t* tail = arm_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + result res = arm_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, + char32_t* utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = arm_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = arm_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = arm_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = arm_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = arm_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf32_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = arm_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = arm_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = arm_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = arm_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f); + const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff); + const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); + const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); + size_t pos = 0; + size_t count = 0; + for(;pos + 4 <= length; pos += 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); + const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f); + const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff); + const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask); + const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask); + + const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1)); + const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1)); + const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1)); + + const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask); + const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask); + + size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0)); + size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1)); + size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0)); + + count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count; + } + return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); + const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); + size_t pos = 0; + size_t count = 0; + for(;pos + 4 <= length; pos += 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); + const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff); + const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1)); + const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask); + size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0)); + count += 4 + surrogate_count; + } + return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf32_length_from_utf8(input, length); +} + +} // namespace arm64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h +/* begin file src/simdutf/arm64/end.h */ +/* end file src/simdutf/arm64/end.h */ +/* end file src/arm64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=fallback/implementation.cpp +/* begin file src/fallback/implementation.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h +/* begin file src/simdutf/fallback/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ + + + + + + + + +namespace simdutf { +namespace fallback { + +simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + int out = 0; + if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } + if((length % 2) == 0) { + if(validate_utf16le(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } + } + if((length % 4) == 0) { + if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } + } + + return out; +} + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return scalar::utf8::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { + return scalar::utf8::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return scalar::ascii::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { + return scalar::ascii::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate_with_errors(buf, len); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, + char32_t* utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { + scalar::utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { + return scalar::utf32::utf8_length_from_utf32(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { + return scalar::utf32::utf16_length_from_utf32(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::utf32_length_from_utf8(input, length); +} + +} // namespace fallback +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ +/* end file src/fallback/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_ICELAKE +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/implementation.cpp +/* begin file src/icelake/implementation.cpp */ + + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h +/* begin file src/simdutf/icelake/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "icelake" +// #define SIMDUTF_IMPLEMENTATION icelake +SIMDUTF_TARGET_ICELAKE + +/* end file src/simdutf/icelake/begin.h */ +namespace simdutf { +namespace icelake { +namespace { +#ifndef SIMDUTF_ICELAKE_H +#error "icelake.h must be included" +#endif +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_common.inl.cpp +/* begin file src/icelake/icelake_utf8_common.inl.cpp */ +// Common procedures for both validating and non-validating conversions from UTF-8. +enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL}; + +using utf8_to_utf16_result = std::pair; +using utf8_to_utf32_result = std::pair; + +/* + process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8 + to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes) + might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which + indicates how many input bytes are relevant. + + Returns true when the result is correct, otherwise it returns false. + + The provided in and out pointers are advanced according to how many input + bytes have been processed, upon success. +*/ +template +simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) { + // constants + __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0); + __m512i mask_80808080 = _mm512_set1_epi32(0x80808080); + __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0); + __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf); + __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2); + __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff); + __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0); + __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00); + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + // Note that 'tail' is a compile-time constant ! + __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1; + __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in); + __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080); + if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII + // alternatively, we could do 'if (m1 == b) { ' + if (tail == SIMDUTF_FULL) { + in += 64; // consumed 64 bytes + // we convert a full 64-byte block, writing 128 bytes. + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } + _mm512_storeu_si512(out, input1); + out += 32; + __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); + if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); } + _mm512_storeu_si512(out, input2); + out += 32; + return true; // we are done + } else { + in += gap; + if (gap <= 32) { + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1); + out += gap; + } else { + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); } + _mm512_storeu_si512(out, input1); + out += 32; + __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); + if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); } + _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2); + out += gap - 32; + } + return true; // we are done + } + } + // classify characters further + __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input, + _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte + __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input, + _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte + + __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2, + _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence) + // Overlong 2-byte sequence + if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) { + // Overlong 2-byte sequence + return false; + } + if (_ktestz_mask64_u8(m34, m34) == 0) { + // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence! + __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0, + _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes) + + __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b); + + __mmask64 mp1 = _kshiftli_mask64(m234, 1); + __mmask64 mp2 = _kshiftli_mask64(m34, 2); + // We could do it as follows... + // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes + // but GCC generates better code when we do: + if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes + // Fast path with 1,2,3 bytes + __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes + __mmask64 m1234 = _kor_mask64(m1, m234); + // mismatched continuation bytes: + if (tail == SIMDUTF_FULL) { + __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ + // the presence of a 1 bit indicates that they overlap. + // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes. + if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; } + } else { + __mmask64 bxorm1234 = _kxor_mask64(b, m1234); + if (mc != bxorm1234) { return false; } + } + // mend: identifying the last bytes of each sequence to be decoded + __mmask64 mend = _kshiftri_mask64(m1234, 1); + if (tail != SIMDUTF_FULL) { + mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1))); + } + + + __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); + __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); + + __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 + __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII + __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16, + clearedbytes); // the last byte of each character + + __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes + __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes + __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); + __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes, + beforeasciibytes); // the second last bytes (of two, three byte seq, + // surrogates) + secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position + + __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff, + indexofsecondlastbytes); // indices of the second last bytes + __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34, + clearedbytes); // only those that are the third last byte of a sequece + __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes, + thirdlastbyte); // the third last bytes (of three byte sequences, hi + // surrogate) + thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position + __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254); + // the elements of Wout excluding the last element if it happens to be a high surrogate: + + __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output. + + + // Encodings out of range... + { + // the location of 3-byte sequence start bytes in the input + __mmask64 m3 = m34 & (b ^ m4); + // words in Wout corresponding to 3-byte sequences. + __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); + __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); + __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); + __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); + __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); + __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); + if (_kor_mask32(Msmall800, M3s)) { return false; } + } + int64_t nout = _mm_popcnt_u64(mprocessed); + in += 64 - _lzcnt_u64(mprocessed); + if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); + out += nout; + return true; // ok + } + // + // We have a 4-byte sequence, this is the general case. + // Slow! + __mmask64 mp3 = _kshiftli_mask64(m4, 3); + __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes + __mmask64 m1234 = _kor_mask64(m1, m234); + + // mend: identifying the last bytes of each sequence to be decoded + __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3); + if (tail != SIMDUTF_FULL) { + mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1))); + } + __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); + __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); + + __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 + __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII + __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16, + clearedbytes); // the last byte of each character + + __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes + __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes + __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); + __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes, + beforeasciibytes); // the second last bytes (of two, three byte seq, + // surrogates) + secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position + + __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff, + indexofsecondlastbytes); // indices of the second last bytes + __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34, + clearedbytes); // only those that are the third last byte of a sequece + __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes, + thirdlastbyte); // the third last bytes (of three byte sequences, hi + // surrogate) + thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position + __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254); + uint64_t Mlo_uint64 = _pext_u64(mp3, mend); + __mmask32 Mlo = __mmask32(Mlo_uint64); + __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1); + __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo, + mask_dc00dc00); // lo surr: 1101110000000000, other: 0000000000000000 + __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes, + 4); // hi surr: 00000WVUTSRQPNML vuts = WVUTS - 1 + __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes, + lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other: unchanged + __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes, + mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other: unchanged + // the elements of Wout excluding the last element if it happens to be a high surrogate: + __mmask32 Mout = ~(Mhi & 0x80000000); + __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output. + + + // mismatched continuation bytes: + if (tail == SIMDUTF_FULL) { + __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ + // the presence of a 1 bit indicates that they overlap. + // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes. + if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; } + } else { + __mmask64 bxorm1234 = _kxor_mask64(b, m1234); + if (mc != bxorm1234) { return false; } + } + // Encodings out of range... + { + // the location of 3-byte sequence start bytes in the input + __mmask64 m3 = m34 & (b ^ m4); + // words in Wout corresponding to 3-byte sequences. + __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); + __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); + __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); + __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); + __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); + __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); + __m512i mask_04000400 = _mm512_set1_epi32(0x04000400); + __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400); + if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { return false; } + } + in += 64 - _lzcnt_u64(mprocessed); + int64_t nout = _mm_popcnt_u64(mprocessed); + if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); + out += nout; + return true; // ok + } + // Fast path 2: all ASCII or 2 byte + __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b); + // on top of -0xc0 we substract -2 which we get back later of the + // continuation byte tags + __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2); + __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence + if (tail == SIMDUTF_FULL) { + __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading); + if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { return false; } + } else { + __mmask64 bxorleading = _kxor_mask64(b, leading); + if (_kshiftli_mask64(m234, 1) != bxorleading) { return false; } + } + in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); + + __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte); // will contain zero for ascii, and the data + lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead)); // ... zero extended into words + __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence + follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow)); // ... zero extended into words + lead = _mm512_slli_epi16(lead, 6); // shifted into position + __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow + + if(big_endian) { final = _mm512_shuffle_epi8(final, byteflip); } + if (tail == SIMDUTF_FULL) { + // Next part is UTF-16 specific and can be generalized to UTF-32. + _mm512_storeu_si512(out, final); + out += 32; // UTF-8 to UTF-16 is only expansionary in this case. + } else { + int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading))); + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); + out += nout; // UTF-8 to UTF-16 is only expansionary in this case. + } + + return true; // we are fine. +} + + + + +/* + utf32_to_utf16_masked converts `count` lower UTF-32 words + from input `utf32` into UTF-16. It differs from utf32_to_utf16 + in that it 'masks' the writes. + + Returns how many 16-bit words were stored. + + byteflip is used for flipping 16-bit words, and it should be + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + We pass it to the (always inlined) function to encourage the compiler to + keep the value in a (constant) register. +*/ +template +simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) { + + const __mmask16 valid = uint16_t((1 << count) - 1); + // 1. check if we have any surrogate pairs + const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); + const __mmask16 sp_mask = _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff); + + if (sp_mask == 0) { + if(big_endian) { + _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip))); + + } else { + _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32)); + } + return count; + } + + { + // build surrogate pair words in 32-bit lanes + + // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] + const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); + const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); + + // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] + const __m512i t1 = _mm512_slli_epi32(t0, 6); + + // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0 + // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) + const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); + const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); + + // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0 + // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 + const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); + const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); + const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); + const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); + __m512i t5 = _mm512_ror_epi32(t4, 16); + // Here we want to trim all of the upper 16-bit words from the 2-byte + // characters represented as 4-byte values. We can compute it from + // sp_mask or the following... It can be more optimized! + const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); + const __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2*count)) - 1)); + if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4) + __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5); + _mm512_mask_storeu_epi16(output, (1<<(count + static_cast(count_ones(sp_mask)))) - 1, compressed); + //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5); + } + + return count + static_cast(count_ones(sp_mask)); +} + +/* + utf32_to_utf16 converts `count` lower UTF-32 words + from input `utf32` into UTF-16. It may overflow. + + Returns how many 16-bit words were stored. + + byteflip is used for flipping 16-bit words, and it should be + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + We pass it to the (always inlined) function to encourage the compiler to + keep the value in a (constant) register. +*/ +template +simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) { + // check if we have any surrogate pairs + const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); + const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); + + if (sp_mask == 0) { + // technically, it should be _mm256_storeu_epi16 + if(big_endian) { + _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),_mm512_castsi512_si256(byteflip))); + } else { + _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32)); + } + return count; + } + + { + // build surrogate pair words in 32-bit lanes + + // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] + const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); + const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); + + // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] + const __m512i t1 = _mm512_slli_epi32(t0, 6); + + // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 to t0 + // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) + const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); + const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); + + // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 to t0 + // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 + const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); + const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); + const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); + const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); + __m512i t5 = _mm512_ror_epi32(t4, 16); + const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); + if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4) + __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5); + _mm512_mask_storeu_epi16(output, (1<<(count + static_cast(count_ones(sp_mask)))) - 1, compressed); + //_mm512_mask_compressstoreu_epi16(output, nonzero, t5); + } + + return count + static_cast(count_ones(sp_mask)); +} + +/** + * Store the last N bytes of previous followed by 512-N bytes from input. + */ +template +__m512i prev(__m512i input, __m512i previous) { + static_assert(N<=32, "N must be no larger than 32"); + const __m512i movemask = _mm512_setr_epi32(28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11); + const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous); +#if SIMDUTF_GCC8 + constexpr int shift = 16-N; // workaround for GCC8 + return _mm512_alignr_epi8(input, rotated, shift); +#else + return _mm512_alignr_epi8(input, rotated, 16-N); +#endif // SIMDUTF_GCC8 +} + +template +__m512i shuffle_epi128(__m512i v) { + static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3"); + static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3"); + static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3"); + static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3"); + + constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6); + return _mm512_shuffle_i32x4(v, v, shuffle); +} + +template +constexpr __m512i broadcast_epi128(__m512i v) { + return shuffle_epi128(v); +} + +/** + * Current unused. + */ +template +__m512i rotate_by_N_epi8(const __m512i input) { + + // lanes order: 1, 2, 3, 0 => 0b00_11_10_01 + const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39); + + return _mm512_alignr_epi8(permuted, input, N); +} + +/* + expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`) + stored at separate 32-bit lanes. + + For each lane we have also a character class (`char_class), given in form + 0x8080800N, where N is 4 higest bits from the leading byte; 0x80 resets + corresponding bytes during pshufb. +*/ +simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) { + /* + Input: + - utf8: bytes stored at separate 32-bit words + - valid: which words have valid UTF-8 characters + + Bit layout of single word. We show 4 cases for each possible + UTF-8 character encoding. The `?` denotes bits we must not + assume their value. + + |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char + |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char + |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char + |????.????|????.????|????.????|0aaa.aaaa| ASCII char + byte 3 byte 2 byte 1 byte 0 + */ + + /* 1. Reset control bits of continuation bytes and the MSB + of the leading byte; this makes all bytes unsigned (and + does not alter ASCII char). + + |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char + |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char + |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char + |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char + ^^ ^^ ^^ ^ + */ + __m512i values; + const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f); + values = _mm512_and_si512(utf8, v_3f3f_3f7f); + + /* 2. Swap and join fields A-B and C-D + + |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char + |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char + |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char + |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */ + const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140); + values = _mm512_maddubs_epi16(values, v_0140_0140); + + /* 3. Swap and join fields AB & CD + + |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char + |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char + |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char + |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */ + const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000); + values = _mm512_madd_epi16(values, v_0001_1000); + + /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits + |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11 + |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10 + |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9 + |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */ + { + /** pshufb + + continuation = 0 + ascii = 7 + _2_bytes = 9 + _3_bytes = 10 + _4_bytes = 11 + + shift_left_v3 = 4 * [ + ascii, # 0000 + ascii, # 0001 + ascii, # 0010 + ascii, # 0011 + ascii, # 0100 + ascii, # 0101 + ascii, # 0110 + ascii, # 0111 + continuation, # 1000 + continuation, # 1001 + continuation, # 1010 + continuation, # 1011 + _2_bytes, # 1100 + _2_bytes, # 1101 + _3_bytes, # 1110 + _4_bytes, # 1111 + ] */ + const __m512i shift_left_v3 = _mm512_setr_epi64( + 0x0707070707070707, + 0x0b0a090900000000, + 0x0707070707070707, + 0x0b0a090900000000, + 0x0707070707070707, + 0x0b0a090900000000, + 0x0707070707070707, + 0x0b0a090900000000 + ); + + const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class); + values = _mm512_sllv_epi32(values, shift); + } + + /* 5. Shift right the values by variable amounts to reset lowest bits + |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11 + |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16 + |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21 + |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */ + { + // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11] + const __m512i shift_right = _mm512_setr_epi64( + 0x1919191919191919, + 0x0b10151500000000, + 0x1919191919191919, + 0x0b10151500000000, + 0x1919191919191919, + 0x0b10151500000000, + 0x1919191919191919, + 0x0b10151500000000 + ); + + const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class); + values = _mm512_srlv_epi32(values, shift); + } + + return values; +} + + +simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int &count) { + const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1); + const __m512i expand_ver2 = _mm512_setr_epi64( + 0x0403020103020100, + 0x0605040305040302, + 0x0807060507060504, + 0x0a09080709080706, + 0x0c0b0a090b0a0908, + 0x0e0d0c0b0d0c0b0a, + 0x000f0e0d0f0e0d0c, + 0x0201000f01000f0e + ); + const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); + const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); + const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); + const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); + const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); + count = static_cast(count_ones(leading_bytes)); + return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input); +} + +simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) { + __m512i char_class = _mm512_srli_epi32(input, 4); + /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ + const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); + const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); + char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); + return expanded_utf8_to_utf32(char_class, input); +} +/* end file src/icelake/icelake_utf8_common.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_macros.inl.cpp +/* begin file src/icelake/icelake_macros.inl.cpp */ + +/* + This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a UTF-8 string) + and loads all possible 4-byte substring into an AVX512 register. + + For example if we have bytes abcdefgh... we create following 32-bit lanes + + [abcd|bcde|cdef|defg|efgh|...] + ^ ^ + byte 0 of reg byte 63 of reg +*/ +/** pshufb + # lane{0,1,2} have got bytes: [ 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15] + # lane3 has got bytes: [ 16, 17, 18, 19, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15] + + expand_ver2 = [ + # lane 0: + 0, 1, 2, 3, + 1, 2, 3, 4, + 2, 3, 4, 5, + 3, 4, 5, 6, + + # lane 1: + 4, 5, 6, 7, + 5, 6, 7, 8, + 6, 7, 8, 9, + 7, 8, 9, 10, + + # lane 2: + 8, 9, 10, 11, + 9, 10, 11, 12, + 10, 11, 12, 13, + 11, 12, 13, 14, + + # lane 3 order: 13, 14, 15, 16 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19 + 12, 13, 14, 15, + 13, 14, 15, 0, + 14, 15, 0, 1, + 15, 0, 1, 2, + ] +*/ + +#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED) \ + { \ + const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1); \ + const __m512i expand_ver2 = _mm512_setr_epi64( \ + 0x0403020103020100, \ + 0x0605040305040302, \ + 0x0807060507060504, \ + 0x0a09080709080706, \ + 0x0c0b0a090b0a0908, \ + 0x0e0d0c0b0d0c0b0a, \ + 0x000f0e0d0f0e0d0c, \ + 0x0201000f01000f0e \ + ); \ + const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); \ + \ + __mmask16 leading_bytes; \ + const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); \ + const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); \ + const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); \ + leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); \ + \ + __m512i char_class; \ + char_class = _mm512_srli_epi32(input, 4); \ + /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ \ + const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); \ + const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); \ + char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \ + \ + const int valid_count = static_cast(count_ones(leading_bytes)); \ + const __m512i utf32 = expanded_utf8_to_utf32(char_class, input); \ + \ + const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32); \ + \ + if (UTF32) { \ + if(MASKED) { \ + const __mmask16 valid = uint16_t((1 << valid_count) - 1); \ + _mm512_mask_storeu_epi32((__m512i*)output, valid, out); \ + } else { \ + _mm512_storeu_si512((__m512i*)output, out); \ + } \ + output += valid_count; \ + } else { \ + if(MASKED) { \ + output += utf32_to_utf16_masked(byteflip, out, valid_count, reinterpret_cast(output)); \ + } else { \ + output += utf32_to_utf16(byteflip, out, valid_count, reinterpret_cast(output)); \ + } \ + } \ + } + +#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED) \ +{ \ + if (UTF32) { \ + if(MASKED) { \ + const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1); \ + _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, INPUT); \ + } else { \ + _mm512_storeu_si512((__m512i*)output, INPUT); \ + } \ + output += VALID_COUNT; \ + } else { \ + if(MASKED) { \ + output += utf32_to_utf16_masked(byteflip, INPUT, VALID_COUNT, reinterpret_cast(output)); \ + } else { \ + output += utf32_to_utf16(byteflip, INPUT, VALID_COUNT, reinterpret_cast(output)); \ + } \ + } \ +} + + +#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) \ + if (UTF32) { \ + const __m128i t0 = _mm512_castsi512_si128(utf8); \ + const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1); \ + const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2); \ + const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3); \ + _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi32(t0)); \ + _mm512_storeu_si512((__m512i*)(output + 1*16), _mm512_cvtepu8_epi32(t1)); \ + _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi32(t2)); \ + _mm512_storeu_si512((__m512i*)(output + 3*16), _mm512_cvtepu8_epi32(t3)); \ + } else { \ + const __m256i h0 = _mm512_castsi512_si256(utf8); \ + const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1); \ + if(big_endian) { \ + _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \ + _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \ + } else { \ + _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi16(h0)); \ + _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi16(h1)); \ + } \ + } +/* end file src/icelake/icelake_macros.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_valid_utf8.inl.cpp +/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */ +// file included directly + +// File contains conversion procedure from VALID UTF-8 strings. + +/* + valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32. + + The `OUTPUT` template type decides what to do with UTF-32: store + it directly or convert into UTF-16 (with AVX512). + + Input: + - str - valid UTF-8 string + - len - string length + - out_buffer - output buffer + + Result: + - pair.first - the first unprocessed input byte + - pair.second - the first unprocessed output word +*/ +template +std::pair valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) { + constexpr bool UTF32 = std::is_same::value; + constexpr bool UTF16 = std::is_same::value; + static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32"); + + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + const char* ptr = str; + const char* end = ptr + len; + + OUTPUT* output = dwords; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + * We check for ptr + 64 + 64 <= end because + * we want to be do maskless writes without overruns. + */ + while (ptr + 64 + 64 <= end) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); + if(ascii == 0) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } + + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if(valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if(valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if(valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + + ptr += 3*16; + } + } + return {ptr, output}; +} + + +using utf8_to_utf16_result = std::pair; +/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_validation.inl.cpp +/* begin file src/icelake/icelake_utf8_validation.inl.cpp */ +// file included directly + + +simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1) { + __m512i mask1 = _mm512_setr_epi64( + 0x0202020202020202, + 0x4915012180808080, + 0x0202020202020202, + 0x4915012180808080, + 0x0202020202020202, + 0x4915012180808080, + 0x0202020202020202, + 0x4915012180808080); + const __m512i v_0f = _mm512_set1_epi8(0x0f); + __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f); + + __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1); + __m512i mask2 = _mm512_setr_epi64( + 0xcbcbcb8b8383a3e7, + 0xcbcbdbcbcbcbcbcb, + 0xcbcbcb8b8383a3e7, + 0xcbcbdbcbcbcbcbcb, + 0xcbcbcb8b8383a3e7, + 0xcbcbdbcbcbcbcbcb, + 0xcbcbcb8b8383a3e7, + 0xcbcbdbcbcbcbcbcb); + __m512i index2 = _mm512_and_si512(prev1, v_0f); + + __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2); + __m512i mask3 = _mm512_setr_epi64( + 0x101010101010101, + 0x1010101babaaee6, + 0x101010101010101, + 0x1010101babaaee6, + 0x101010101010101, + 0x1010101babaaee6, + 0x101010101010101, + 0x1010101babaaee6 + ); + __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f); + __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3); + return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128); + } + + simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input, + const __m512i prev_input, const __m512i sc) { + __m512i prev2 = prev<2>(input, prev_input); + __m512i prev3 = prev<3>(input, prev_input); + __m512i is_third_byte = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u-1)); // Only 111_____ will be > 0 + __m512i is_fourth_byte = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u-1)); // Only 1111____ will be > 0 + __m512i is_third_or_fourth_byte = _mm512_or_si512(is_third_byte, is_fourth_byte); + const __m512i v_7f = _mm512_set1_epi8(char(0x7f)); + is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte); + // We want to compute (is_third_or_fourth_byte AND v80) XOR sc. + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, 0b1101010); + //__m512i is_third_or_fourth_byte_mask = _mm512_and_si512(is_third_or_fourth_byte, v_80); + //return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc); + } + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline __m512i is_incomplete(const __m512i input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + __m512i max_value = _mm512_setr_epi64( + 0xffffffffffffffff, + 0xffffffffffffffff, + 0xffffffffffffffff, + 0xffffffffffffffff, + 0xffffffffffffffff, + 0xffffffffffffffff, + 0xffffffffffffffff, + 0xbfdfefffffffffff); + return _mm512_subs_epu8(input, max_value); + } + + struct avx512_utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + __m512i error{}; + + // The last input we received + __m512i prev_input_block{}; + // Whether the last input we received was incomplete (used for ASCII fast path) + __m512i prev_incomplete{}; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + __m512i prev1 = prev<1>(input, prev_input); + __m512i sc = check_special_cases(input, prev1); + this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error = _mm512_or_si512(this->error, this->prev_incomplete); + } + + // returns true if ASCII. + simdutf_really_inline bool check_next_input(const __m512i input) { + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80); + if(ascii == 0) { + this->error = _mm512_or_si512(this->error, this->prev_incomplete); + return true; + } else { + this->check_utf8_bytes(input, this->prev_input_block); + this->prev_incomplete = is_incomplete(input); + this->prev_input_block = input; + return false; + } + } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return _mm512_test_epi8_mask(this->error, this->error) != 0; + } + + }; // struct avx512_utf8_checker +/* end file src/icelake/icelake_utf8_validation.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_utf8.inl.cpp +/* begin file src/icelake/icelake_from_utf8.inl.cpp */ +// file included directly + +// File contains conversion procedure from possibly invalid UTF-8 strings. + +/** + * Attempts to convert up to len 1-byte words from in (in UTF-8 format) to + * out. + * Returns the position of the input and output after the processing is + * completed. Upon error, the output is set to null. + */ +template +utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) { + const char *const final_in = in + len; + bool result = true; + while (result) { + if (in + 64 <= final_in) { + result = process_block_utf8_to_utf16(in, out, final_in - in); + } else if(in < final_in) { + result = process_block_utf8_to_utf16(in, out, final_in - in); + } else { break; } + } + if(!result) { out = nullptr; } + return std::make_pair(in, out); +} + +template +simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, size_t len, char16_t *out) { + const char *const init_in = in; + const char16_t *const init_out = out; + const char *const final_in = in + len; + bool result = true; + while (result) { + if (in + 64 <= final_in) { + result = process_block_utf8_to_utf16(in, out, final_in - in); + } else if(in < final_in) { + result = process_block_utf8_to_utf16(in, out, final_in - in); + } else { break; } + } + if(!result) { + simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in, final_in - in, out); + res.count += (in - init_in); + return res; + } else { + return simdutf::result(error_code::SUCCESS,out - init_out); + } +} + + +template +std::pair validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) { + constexpr bool UTF32 = std::is_same::value; + constexpr bool UTF16 = std::is_same::value; + static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32"); + + const char* ptr = str; + const char* end = ptr + len; + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + OUTPUT* output = dwords; + avx512_utf8_checker checker{}; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + * We check for ptr + 64 + 64 <= end because + * we want to be do maskless writes without overruns. + */ + while (ptr + 64 + 64 <= end) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); + if(checker.check_next_input(utf8)) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if(valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if(valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if(valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + + ptr += 3*16; + } + validatedptr += 4*16; + } + { + const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + if(checker.errors()) { + return {ptr, nullptr}; // We found an error. + } + return {ptr, output}; +} + +// Like validating_utf8_to_fixed_length but returns as soon as an error is identified +template +std::tuple validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) { + constexpr bool UTF32 = std::is_same::value; + constexpr bool UTF16 = std::is_same::value; + static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32"); + + const char* ptr = str; + const char* end = ptr + len; + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + OUTPUT* output = dwords; + avx512_utf8_checker checker{}; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + * We check for ptr + 64 + 64 <= end because + * we want to be do maskless writes without overruns. + */ + while (ptr + 64 + 64 <= end) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); + if(checker.check_next_input(utf8)) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } + if(checker.errors()) { + return {ptr, output, false}; // We found an error. + } + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if(valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if(valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if(valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + + ptr += 3*16; + } + validatedptr += 4*16; + } + { + const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + if(checker.errors()) { + return {ptr, output, false}; // We found an error. + } + return {ptr, output, true}; +} +/* end file src/icelake/icelake_from_utf8.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf32.inl.cpp +/* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ +// file included directly + +/* + Returns a pair: the first unprocessed byte from buf and utf32_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::tuple convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) { + const char16_t* end = buf + len; + const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00); + const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); + const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00); + __mmask32 carry{0}; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + while (buf + 32 <= end) { + // Always safe because buf + 32 <= end so that end - buf >= 32 bytes: + __m512i in = _mm512_loadu_si512((__m512i*)buf); + if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } + + // H - bitmask for high surrogates + const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800); + // H - bitmask for low surrogates + const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00); + + if ((H|L)) { + // surrogate pair(s) in a register + const __mmask32 V = (L ^ (carry | (H << 1))); // A high surrogate must be followed by low one and a low one must be preceded by a high one. + // If valid, V should be equal to 0 + + if(V == 0) { + // valid case + /* + Input surrogate pair: + |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb| + low surrogate high surrogate + */ + /* 1. Expand all words to 32-bit words + in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| + */ + const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); + const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1)); + + /* 2. Shift by one 16-bit word to align low surrogates with high surrogates + in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| + shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| + */ + const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1); + const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1); + + /* 3. Align all high surrogates in first and second by shifting to the left by 10 bits + |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| + */ + const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10); + const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H>>16), second, 10); + + /* 4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant + in |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| + shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| + constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000| + */ + const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400); + const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first); + const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant); + + const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H>>16), aligned_second, shifted_second); + const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H>>16), added_second, constant); + + // 5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid) + const __mmask32 valid = ~L & 0x7fffffff; + // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32 + // to ease performance portability to Zen 4. + const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first); + const size_t howmany1 = count_ones((uint16_t)(valid)); + _mm512_storeu_si512((__m512i *) utf32_output, compressed_first); + utf32_output += howmany1; + const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second); + const size_t howmany2 = count_ones((uint16_t)(valid >> 16)); + // The following could be unsafe in some cases? + //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second); + _mm512_mask_storeu_epi32((__m512i *) utf32_output, __mmask16((1<> 30) & 0x1; + } else { + // invalid case + return std::make_tuple(buf+carry, utf32_output, false); + } + } else { + // no surrogates + // extend all thirty-two 16-bit words to thirty-two 32-bit words + _mm512_storeu_si512((__m512i *)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in))); + _mm512_storeu_si512((__m512i *)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1))); + utf32_output += 32; + buf += 32; + carry = 0; + } + } // while + return std::make_tuple(buf+carry, utf32_output, true); +} +/* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf8.inl.cpp +/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ +// file included directly + +// Todo: currently, this is just the haswell code, optimize for icelake kernel. +std::pair avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) { + const char32_t* end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + __m256i running_max = _mm256_setzero_si256(); + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); + running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + + // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) + + if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + + const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be useful. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); + const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); + utf8_output += 12; + buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); + const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); + const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // may require large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { // 2-byte + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000 )==0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + // check for invalid input + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { + return std::make_pair(nullptr, utf8_output); + } + + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); } + + return std::make_pair(buf, utf8_output); +} + +// Todo: currently, this is just the haswell code, optimize for icelake kernel. +std::pair avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) { + const char32_t* end = buf + len; + const char32_t* start = buf; + + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); + // Check for too large input + const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); + if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); + } + + // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) + + if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + + // Check for illegal surrogate words + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); + } + + const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be useful. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); + const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); + utf8_output += 12; + buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); + const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); + const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // may require large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { // 2-byte + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000 )==0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf16.inl.cpp +/* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ +// file included directly + +// Todo: currently, this is just the haswell code, optimize for icelake kernel. +template +std::pair avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { + const char32_t* end = buf + len; + + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + + while (buf + 8 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } + *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + // check for invalid input + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } + + return std::make_pair(buf, utf16_output); +} + +// Todo: currently, this is just the haswell code, optimize for icelake kernel. +template +std::pair avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { + const char32_t* start = buf; + const char32_t* end = buf + len; + + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 8 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); + } + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } + *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +} +/* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_ascii_validation.inl.cpp +/* begin file src/icelake/icelake_ascii_validation.inl.cpp */ +// file included directly + +const char* validate_ascii(const char* buf, size_t len) { + const char* end = buf + len; + const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); + __m512i running_or = _mm512_setzero_si512(); + for (; buf + 64 <= end; buf += 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf); + running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii) + } + if (_mm512_test_epi8_mask(running_or, running_or) != 0) { + return nullptr; + } else { + return buf; + } +} +/* end file src/icelake/icelake_ascii_validation.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf32_validation.inl.cpp +/* begin file src/icelake/icelake_utf32_validation.inl.cpp */ +// file included directly + +const char32_t* validate_utf32(const char32_t* buf, size_t len) { + const char32_t* end = len >= 16 ? buf + len - 16 : nullptr; + + const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000); + __m512i currentmax = _mm512_setzero_si512(); + __m512i currentoffsetmax = _mm512_setzero_si512(); + + while (buf <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf); + buf += 16; + currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax); + currentmax = _mm512_max_epu32(utf32, currentmax); + } + + const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff); + const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff); + __m512i is_zero = _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax); + if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) { + return nullptr; + } + is_zero = _mm512_xor_si512(_mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) { + return nullptr; + } + + return buf; +} +/* end file src/icelake/icelake_utf32_validation.inl.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf8.inl.cpp +/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ +// file included directly + +/** + * This function converts the input (inbuf, inlen), assumed to be valid + * UTF16 (little endian) into UTF-8 (to outbuf). The number of words written + * is written to 'outlen' and the function reports the number of input word + * consumed. + */ + + template + size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen, + unsigned char *outbuf, size_t *outlen) { + __m512i in; + __mmask32 inmask = _cvtu32_mask32(0x7fffffff); + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + const char16_t * const inbuf_orig = inbuf; + const unsigned char * const outbuf_orig = outbuf; + size_t adjust = 0; + int carry = 0; + + while (inlen >= 32) { + in = _mm512_loadu_si512(inbuf); + if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } + inlen -= 31; + lastiteration: + inbuf += 31; + + failiteration: + const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask( + inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT); + + if (_ktestz_mask32_u8(inmask, is234byte)) { + // fast path for ASCII only + _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in); + outbuf += 31; + carry = 0; + + if (inlen < 32) { + goto tail; + } else { + continue; + } + } + + const __mmask32 is12byte = + _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT); + + if (_ktestc_mask32_u8(is12byte, inmask)) { + // fast path for 1 and 2 byte only + + const __m512i twobytes = _mm512_ternarylogic_epi32( + _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6), + _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C + in = _mm512_mask_add_epi16(in, is234byte, twobytes, + _mm512_set1_epi16(int16_t(0x80c0))); + const __m512i cmpmask = + _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)), + _mm512_set1_epi16(0x0800)); + const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT); + const __m512i out = _mm512_maskz_compress_epi8(smoosh, in); + _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))), + out); + outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte)); + carry = 0; + + if (inlen < 32) { + goto tail; + } else { + continue; + } + } + __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); + __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)); + + + __m512i taglo = _mm512_set1_epi32(0x8080e000); + __m512i taghi = taglo; + + const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00))); + const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask( + inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ); + const __mmask32 losurr = _mm512_cmp_epu16_mask( + fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ); + + int carryout = 0; + if (!_kortestz_mask32_u8(hisurr, losurr)) { + // handle surrogates + + __m512i los = _mm512_alignr_epi32(hi, lo, 1); + __m512i his = _mm512_alignr_epi32(lo, hi, 1); + + const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16); + taglo = + _mm512_mask_mov_epi32(taglo,__mmask16(hisurr), _mm512_set1_epi32(0x808080f0)); + taghi = + _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0)); + + lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10); + hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10); + los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400)); + his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400)); + lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los); + hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his); + + carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30)); + + const uint32_t h = _cvtmask32_u32(hisurr); + const uint32_t l = _cvtmask32_u32(losurr); + // check for mismatched surrogates + if ((h + h + carry) ^ l) { + const uint32_t lonohi = l & ~(h + h + carry); + const uint32_t hinolo = h & ~(l >> 1); + inlen = _tzcnt_u32(hinolo | lonohi); + inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1)); + in = _mm512_maskz_mov_epi16(inmask, in); + adjust = (int)inlen - 31; + inlen = 0; + goto failiteration; + } + } + + hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi); + carry = carryout; + + __m512i mslo = + _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo); + + __m512i mshi = + _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi); + + const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask)); + const __mmask64 outmhi = _kshiftri_mask64(outmask, 16); + + const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte)); + const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16); + const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16); + + taglo = + _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000)); + taghi = + _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000)); + __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + + + magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + + mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo, + 0xea); // A&B|C + mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi, + 0xea); + mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24); + + mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24); + + const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT); + const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT); + const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo); + const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi); + const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo); + const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi); + + uint64_t advlo = _mm_popcnt_u64(wantlo_uint64); + uint64_t advhi = _mm_popcnt_u64(wanthi_uint64); + + _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo); + _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi); + outbuf += advlo + advhi; + } + outbuf -= adjust; + +tail: + if (inlen != 0) { + // We must have inlen < 31. + inmask = _cvtu32_mask32((1 << inlen) - 1); + in = _mm512_maskz_loadu_epi16(inmask, inbuf); + if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); } + adjust = inlen - 31; + inlen = 0; + goto lastiteration; + } + *outlen = (outbuf - outbuf_orig) + adjust; + return ((inbuf - inbuf_orig) + adjust); +} +/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ + +} // namespace +} // namespace icelake +} // namespace simdutf + +namespace simdutf { +namespace icelake { + + +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + if (length % 2 == 0) { + const char *buf = input; + + const char *start = buf; + const char *end = input + length; + + bool is_utf8 = true; + bool is_utf16 = true; + bool is_utf32 = true; + + int out = 0; + + avx512_utf8_checker checker{}; + __m512i currentmax = _mm512_setzero_si512(); + while (buf + 64 <= end) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + is_utf8 = false; + + // Can still be either UTF-16LE or UTF-32LE depending on the positions + // of the surrogates To be valid UTF-32LE, a surrogate cannot be in the + // two most significant bytes of any 32-bit word. On the other hand, to + // be valid UTF-16LE, at least one surrogate must be in the two most + // significant bytes of a 32-bit word since they always come in pairs in + // UTF-16LE. Note that we always proceed in multiple of 4 before this + // point so there is no offset in 32-bit words. + + if ((surrogates & 0xaaaaaaaa) != 0) { + is_utf32 = false; + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask( + diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return simdutf::encoding_type::unspecified; + } + + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if (ends_with_high) { + buf += + 31 * + sizeof(char16_t); // advance only by 31 words so that we start + // with the high surrogate on the next round. + } else { + buf += 32 * sizeof(char16_t); + } + is_utf16 = validate_utf16le(reinterpret_cast(buf), + (end - buf) / sizeof(char16_t)); + if (!is_utf16) { + return simdutf::encoding_type::unspecified; + + } else { + return simdutf::encoding_type::UTF16_LE; + } + + } else { + is_utf16 = false; + // Check for UTF-32LE + if (length % 4 == 0) { + const char32_t *input32 = reinterpret_cast(buf); + const char32_t *end32 = + reinterpret_cast(start) + length / 4; + if (validate_utf32(input32, end32 - input32)) { + return simdutf::encoding_type::UTF32_LE; + } + } + return simdutf::encoding_type::unspecified; + } + break; + } + // If no surrogate, validate under other encodings as well + + // UTF-32LE validation + currentmax = _mm512_max_epu32(in, currentmax); + + // UTF-8 validation + checker.check_next_input(in); + + buf += 64; + } + + // Check which encodings are possible + + if (is_utf8) { + size_t current_length = static_cast(buf - start); + if (current_length != length) { + const __m512i utf8 = _mm512_maskz_loadu_epi8( + (1ULL << (length - current_length)) - 1, (const __m512i *)buf); + checker.check_next_input(utf8); + } + checker.check_eof(); + if (!checker.errors()) { + out |= simdutf::encoding_type::UTF8; + } + } + + if (is_utf16 && scalar::utf16::validate( + reinterpret_cast(buf), + (length - (buf - start)) / 2)) { + out |= simdutf::encoding_type::UTF16_LE; + } + + if (is_utf32 && (length % 4 == 0)) { + currentmax = _mm512_max_epu32( + _mm512_maskz_loadu_epi8( + (1ULL << (length - static_cast(buf - start))) - 1, + (const __m512i *)buf), + currentmax); + __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff), + _MM_CMPINT_GT); + if (outside_range == 0) { + out |= simdutf::encoding_type::UTF32_LE; + } + } + + return out; + } else if (implementation::validate_utf8(input, length)) { + return simdutf::encoding_type::UTF8; + } else { + return simdutf::encoding_type::unspecified; + } +} + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + avx512_utf8_checker checker{}; + const char* ptr = buf; + const char* end = ptr + len; + for (; ptr + 64 <= end; ptr += 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); + checker.check_next_input(utf8); + } + { + const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + return ! checker.errors(); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { + avx512_utf8_checker checker{}; + const char* ptr = buf; + const char* end = ptr + len; + size_t count{0}; + for (; ptr + 64 <= end; ptr += 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); + checker.check_next_input(utf8); + if(checker.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(buf + count), len - count); + res.count += count; + return res; + } + count += 64; + } + { + const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr); + checker.check_next_input(utf8); + if(checker.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(buf + count), len - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, len); + } + } +} + +simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { + const char* tail = icelake::validate_ascii(buf, len); + if (tail) { + return scalar::ascii::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { + const char* buf_orig = buf; + const char* end = buf + len; + const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); + for (; buf + 64 <= end; buf += 64) { + const __m512i input = _mm512_loadu_si512((const __m512i*)buf); + __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); + if(notascii) { + return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii)); + } + } + { + const __m512i input = _mm512_maskz_loadu_epi8((1ULL<<(end - buf))-1, (const __m512i*)buf); + __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); + if(notascii) { + return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii)); + } + } + return result(error_code::SUCCESS, len); +} + +simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { + const char16_t *end = buf + len; + + for(;buf + 32 <= end; ) { + __m512i in = _mm512_loadu_si512((__m512i*)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if(surrogates) { + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if(ends_with_high) { + buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round. + } else { + buf += 32; + } + } else { + buf += 32; + } + } + if(buf < end) { + __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if(surrogates) { + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + } + } + return true; +} + +simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { + const char16_t *end = buf + len; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + for(;buf + 32 <= end; ) { + __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if(surrogates) { + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if(ends_with_high) { + buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round. + } else { + buf += 32; + } + } else { + buf += 32; + } + } + if(buf < end) { + __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if(surrogates) { + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + } + } + return true; +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { + const char16_t *start_buf = buf; + const char16_t *end = buf + len; + for(;buf + 32 <= end; ) { + __m512i in = _mm512_loadu_si512((__m512i*)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if(surrogates) { + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1)); + uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high)); + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if(ends_with_high) { + buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round. + } else { + buf += 32; + } + } else { + buf += 32; + } + } + if(buf < end) { + __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if(surrogates) { + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1)); + uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high)); + } + } + } + return result(error_code::SUCCESS, len); +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { + const char16_t *start_buf = buf; + const char16_t *end = buf + len; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + for(;buf + 32 <= end; ) { + __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if(surrogates) { + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1)); + uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high)); + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if(ends_with_high) { + buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round. + } else { + buf += 32; + } + } else { + buf += 32; + } + } + if(buf < end) { + __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if(surrogates) { + __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1)); + uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high)); + } + } + } + return result(error_code::SUCCESS, len); +} + +simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + const char32_t * tail = icelake::validate_utf32(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + + const char32_t* end = len >= 16 ? buf + len - 16 : nullptr; + const char32_t* buf_orig = buf; + while (buf <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf); + __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff), + _MM_CMPINT_GT); + if (outside_range) { + return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range)); + } + + __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); + + __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff), + _MM_CMPINT_GT); + if (surrogate_range) { + return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range)); + } + buf += 16; + } + if(buf < buf_orig + len) { + __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1<<(buf_orig + len - buf))-1),(const __m512i*)buf); + __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff), + _MM_CMPINT_GT); + if (outside_range) { + return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range)); + } + __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); + + __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff), + _MM_CMPINT_GT); + if (surrogate_range) { + return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range)); + } + } + + return result(error_code::SUCCESS, len); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16(buf, len, utf16_output); + if (ret.second == nullptr) { + return 0; + } + return ret.second - utf16_output; +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16(buf, len, utf16_output); + if (ret.second == nullptr) { + return 0; + } + return ret.second - utf16_output; +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return fast_avx512_convert_utf8_to_utf16_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return fast_avx512_convert_utf8_to_utf16_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length(buf, len, utf16_output); + size_t saved_bytes = ret.second - utf16_output; + const char* end = buf + len; + if (ret.first == end) { + return saved_bytes; + } + + // Note: AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outsiede 16-byte window. + // It meas, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. + while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { + ret.first += 1; + } + + if (ret.first != end) { + const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length(buf, len, utf16_output); + size_t saved_bytes = ret.second - utf16_output; + const char* end = buf + len; + if (ret.first == end) { + return saved_bytes; + } + + // Note: AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outsiede 16-byte window. + // It meas, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. + while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { + ret.first += 1; + } + + if (ret.first != end) { + const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + + return saved_bytes; +} + + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept { + uint32_t * utf32_output = reinterpret_cast(utf32_out); + utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length(buf, len, utf32_output); + if (ret.second == nullptr) + return 0; + + size_t saved_bytes = ret.second - utf32_output; + const char* end = buf + len; + if (ret.first == end) { + return saved_bytes; + } + + // Note: the AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outside 16-byte window. + // It means, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. + while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) { + ret.first += 1; + } + + if (ret.first != end) { + const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert( + ret.first, len - (ret.first - buf), utf32_out + saved_bytes); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept { + uint32_t * utf32_output = reinterpret_cast(utf32); + auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks(buf, len, utf32_output); + if (!std::get<2>(ret)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast(std::get<1>(ret))); + res.count += (std::get<0>(ret) - buf); + return res; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + const char* end = buf + len; + if (std::get<0>(ret) == end) { + return {simdutf::SUCCESS, saved_bytes}; + } + + // Note: the AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outside 16-byte window. + // It means, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. + while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) { + std::get<0>(ret) += 1; + } + + if (std::get<0>(ret) != end) { + auto scalar_result = scalar::utf8_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast(utf32_output) + saved_bytes); + if (scalar_result.error != simdutf::SUCCESS) { + scalar_result.count += (std::get<0>(ret) - buf); + } else { + scalar_result.count += saved_bytes; + } + return scalar_result; + } + + return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)}; +} + + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept { + uint32_t * utf32_output = reinterpret_cast(utf32_out); + utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length(buf, len, utf32_output); + size_t saved_bytes = ret.second - utf32_output; + const char* end = buf + len; + if (ret.first == end) { + return saved_bytes; + } + + // Note: AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outsiede 16-byte window. + // It meas, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. + while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { + ret.first += 1; + } + + if (ret.first != end) { + const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid( + ret.first, len - (ret.first - buf), utf32_out + saved_bytes); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i(buf, len, (unsigned char*)utf8_output, &outlen); + if(inlen != len) { return 0; } + return outlen; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i(buf, len, (unsigned char*)utf8_output, &outlen); + if(inlen != len) { return 0; } + return outlen; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i(buf, len, (unsigned char*)utf8_output, &outlen); + if(inlen != len) { + result res = scalar::utf16_to_utf8::convert_with_errors(buf + inlen, len - outlen, utf8_output + outlen); + res.count += inlen; + return res; + } + return {simdutf::SUCCESS, outlen}; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i(buf, len, (unsigned char*)utf8_output, &outlen); + if(inlen != len) { + result res = scalar::utf16_to_utf8::convert_with_errors(buf + inlen, len - outlen, utf8_output + outlen); + res.count += inlen; + return res; + } + return {simdutf::SUCCESS, outlen}; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} + + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf32_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = avx512_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = avx512_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = avx512_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = avx512_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { return 0; } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { return 0; } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_res.error) { + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } else { + scalar_res.count += saved_bytes; + return scalar_res; + } + } + return simdutf::result(simdutf::SUCCESS, saved_bytes); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_res.error) { + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } else { + scalar_res.count += saved_bytes; + return scalar_res; + } + } + return simdutf::result(simdutf::SUCCESS, saved_bytes); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { return 0; } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::tuple ret = icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { return 0; } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { + size_t pos = 0; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + while (pos + 32 <= length) { + __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos)); + utf16 = _mm512_shuffle_epi8(utf16, byteflip); + _mm512_storeu_si512(output + pos, utf16); + pos += 32; + } + if(pos < length) { + __mmask32 m((1<< (length - pos))-1); + __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos)); + utf16 = _mm512_shuffle_epi8(utf16, byteflip); + _mm512_mask_storeu_epi16(output + pos, m, utf16); + } +} + + +simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { + const char16_t* end = length >= 32 ? input + length - 32 : nullptr; + const char16_t* ptr = input; + + const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); + const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); + + size_t count{0}; + + while (ptr <= end) { + __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr); + ptr += 32; + uint64_t not_high_surrogate = static_cast(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low)); + count += count_ones(not_high_surrogate); + } + + return count + scalar::utf16::count_code_points(ptr, length - (ptr - input)); +} + +simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { + const char16_t* end = length >= 32 ? input + length - 32 : nullptr; + const char16_t* ptr = input; + + const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); + const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); + + size_t count{0}; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + while (ptr <= end) { + __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip); + ptr += 32; + uint64_t not_high_surrogate = static_cast(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low)); + count += count_ones(not_high_surrogate); + } + + return count + scalar::utf16::count_code_points(ptr, length - (ptr - input)); +} + + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + const char* end = length >= 64 ? input + length - 64 : nullptr; + const char* ptr = input; + + const __m512i continuation = _mm512_set1_epi8(char(0b10111111)); + + size_t count{0}; + + while (ptr <= end) { + __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr); + ptr += 64; + uint64_t continuation_bitmask = static_cast(_mm512_cmple_epi8_mask(utf8, continuation)); + count += 64 - count_ones(continuation_bitmask); + } + + return count + scalar::utf8::count_code_points(ptr, length - (ptr - input)); +} + + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + const char16_t* end = length >= 32 ? input + length - 32 : nullptr; + const char16_t* ptr = input; + + const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); + const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); + const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); + const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); + + size_t count{0}; + + while (ptr <= end) { + __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr); + ptr += 32; + __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); + __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); + __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); + __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); + + size_t ascii_count = count_ones(ascii_bitmask); + size_t two_bytes_count = count_ones(two_bytes_bitmask); + size_t surrogate_bytes_count = count_ones(surrogates_bitmask); + size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count; + + count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count; + } + + return count + scalar::utf16::utf8_length_from_utf16(ptr, length - (ptr - input)); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + const char16_t* end = length >= 32 ? input + length - 32 : nullptr; + const char16_t* ptr = input; + + const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); + const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); + const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); + const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); + + size_t count{0}; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + while (ptr <= end) { + __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr); + utf16 = _mm512_shuffle_epi8(utf16, byteflip); + ptr += 32; + __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); + __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); + __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); + __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); + + size_t ascii_count = count_ones(ascii_bitmask); + size_t two_bytes_count = count_ones(two_bytes_bitmask); + size_t surrogate_bytes_count = count_ones(surrogates_bitmask); + size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count; + count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count; + } + + return count + scalar::utf16::utf8_length_from_utf16(ptr, length - (ptr - input)); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return implementation::count_utf16le(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return implementation::count_utf16be(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 64 <= length; pos += 64) { + __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos)); + uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65+1)); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240))); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const char32_t* end = length >= 16 ? input + length - 16 : nullptr; + const char32_t* ptr = input; + + const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f); + const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff); + const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); + + size_t count{0}; + + while (ptr <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr); + ptr += 16; + __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f); + __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff); + __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff); + + size_t ascii_count = count_ones(ascii_bitmask); + size_t two_bytes_count = count_ones(two_bytes_bitmask); + size_t three_bytes_count = count_ones(three_bytes_bitmask); + size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count; + count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count; + } + + return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input)); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const char32_t* end = length >= 16 ? input + length - 16 : nullptr; + const char32_t* ptr = input; + + const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); + + size_t count{0}; + + while (ptr <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr); + ptr += 16; + __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); + + count += 16 + count_ones(surrogates_bitmask); + } + + return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input)); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { + return implementation::count_utf8(input, length); +} + +} // namespace icelake +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h +/* begin file src/simdutf/icelake/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/icelake/end.h */ +/* end file src/icelake/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/implementation.cpp +/* begin file src/haswell/implementation.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h +/* begin file src/simdutf/haswell/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "haswell" +// #define SIMDUTF_IMPLEMENTATION haswell +SIMDUTF_TARGET_HASWELL +/* end file src/simdutf/haswell/begin.h */ +namespace simdutf { +namespace haswell { +namespace { +#ifndef SIMDUTF_HASWELL_H +#error "haswell.h must be included" +#endif +using namespace simd; + + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_detect_encodings.cpp +/* begin file src/haswell/avx2_detect_encodings.cpp */ +template +// len is known to be a multiple of 2 when this is called +int avx2_detect_encodings(const char * buf, size_t len) { + const char* start = buf; + const char* end = buf + len; + + bool is_utf8 = true; + bool is_utf16 = true; + bool is_utf32 = true; + + int out = 0; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + + __m256i currentmax = _mm256_setzero_si256(); + + checker check{}; + + while(buf + 64 <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); + + const auto u0 = simd16(in); + const auto u1 = simd16(nextin); + + const auto v0 = u0.shr<8>(); + const auto v1 = u1.shr<8>(); + + const auto in16 = simd16::pack(v0, v1); + + const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8; + uint32_t surrogates_bitmask0 = surrogates_wordmask0.to_bitmask(); + + // Check for surrogates + if (surrogates_bitmask0 != 0x0) { + // Cannot be UTF8 + is_utf8 = false; + // Can still be either UTF-16LE or UTF-32LE depending on the positions of the surrogates + // To be valid UTF-32LE, a surrogate cannot be in the two most significant bytes of any 32-bit word. + // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant + // bytes of a 32-bit word since they always come in pairs in UTF-16LE. + // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words. + + if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) { + is_utf32 = false; + // Code from avx2_validate_utf16le.cpp + const char16_t * input = reinterpret_cast(buf); + const char16_t* end16 = reinterpret_cast(start) + len/2; + + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + const uint32_t V0 = ~surrogates_bitmask0; + + const auto vH0 = (in16 & v_fc) == v_dc; + const uint32_t H0 = vH0.to_bitmask(); + + const uint32_t L0 = ~H0 & surrogates_bitmask0; + + const uint32_t a0 = L0 & (H0 >> 1); + const uint32_t b0 = a0 << 1; + const uint32_t c0 = V0 | a0 | b0; + + if (c0 == 0xffffffff) { + input += simd16::ELEMENTS * 2; + } else if (c0 == 0x7fffffff) { + input += simd16::ELEMENTS * 2 - 1; + } else { + return simdutf::encoding_type::unspecified; + } + + while (input + simd16::ELEMENTS * 2 < end16) { + const auto in0 = simd16(input); + const auto in1 = simd16(input + simd16::ELEMENTS); + + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + const auto in_16 = simd16::pack(t0, t1); + + const auto surrogates_wordmask = (in_16 & v_f8) == v_d8; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + if (surrogates_bitmask == 0x0) { + input += simd16::ELEMENTS * 2; + } else { + const uint32_t V = ~surrogates_bitmask; + + const auto vH = (in_16 & v_fc) == v_dc; + const uint32_t H = vH.to_bitmask(); + + const uint32_t L = ~H & surrogates_bitmask; + + const uint32_t a = L & (H >> 1); + + const uint32_t b = a << 1; + + const uint32_t c = V | a | b; + + if (c == 0xffffffff) { + input += simd16::ELEMENTS * 2; + } else if (c == 0x7fffffff) { + input += simd16::ELEMENTS * 2 - 1; + } else { + return simdutf::encoding_type::unspecified; + } + } + } + } else { + is_utf16 = false; + // Check for UTF-32LE + if (len % 4 == 0) { + const char32_t * input = reinterpret_cast(buf); + const char32_t* end32 = reinterpret_cast(start) + len/4; + + // Must start checking for surrogates + __m256i currentoffsetmax = _mm256_setzero_si256(); + const __m256i offset = _mm256_set1_epi32(0xffff2000); + const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); + + currentmax = _mm256_max_epu32(in, currentmax); + currentmax = _mm256_max_epu32(nextin, currentmax); + + currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); + currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax); + + while (input + 8 < end32) { + const __m256i in32 = _mm256_loadu_si256((__m256i *)input); + currentmax = _mm256_max_epu32(in32,currentmax); + currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax); + input += 8; + } + + __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) { + return simdutf::encoding_type::unspecified; + } + } else { + return simdutf::encoding_type::unspecified; + } + } + break; + } + // If no surrogate, validate under other encodings as well + + // UTF-32LE validation + currentmax = _mm256_max_epu32(in, currentmax); + currentmax = _mm256_max_epu32(nextin, currentmax); + + // UTF-8 validation + // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h + simd::simd8x64 in8(in, nextin); + check.check_next_input(in8); + + buf += 64; + } + + // Check which encodings are possible + + if (is_utf8) { + if (static_cast(buf - start) != len) { + uint8_t block[64]{}; + std::memset(block, 0x20, 64); + std::memcpy(block, buf, len - (buf - start)); + simd::simd8x64 in(block); + check.check_next_input(in); + } + if (!check.errors()) { + out |= simdutf::encoding_type::UTF8; + } + } + + if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { + out |= simdutf::encoding_type::UTF16_LE; + } + + if (is_utf32 && (len % 4 == 0)) { + const __m256i standardmax = _mm256_set1_epi32(0x10ffff); + __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); + if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { + out |= simdutf::encoding_type::UTF32_LE; + } + } + + return out; +} +/* end file src/haswell/avx2_detect_encodings.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf16.cpp +/* begin file src/haswell/avx2_validate_utf16.cpp */ +/* + In UTF-16 words in range 0xD800 to 0xDFFF have special meaning. + + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. If none of highest nibbles + are 0xD (13), than we are sure that UTF-16 chunk in a vector + register is valid. + + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: + + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate + + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate + + We're going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 0xdfff) + + 0 1 2 3 4 5 6 7 <--- word index + [ V | L | H | L | H | V | V | L ] + 1 0 0 0 0 1 1 0 - V = valid masks + 0 1 0 1 0 0 0 1 - L = low surrogate + 0 0 1 0 1 0 0 0 - H high surrogate + + + 1 0 0 0 0 1 1 0 V = valid masks + 0 1 0 1 0 0 0 0 a = L & (H >> 1) + 0 0 1 0 1 0 0 0 b = a << 1 + 1 1 1 1 1 1 1 0 c = V | a | b + ^ + the last bit can be zero, we just consume 7 words + and recheck this word in the next iteration +*/ + +/* Returns: + - pointer to the last unprocessed character (a scalar fallback should check the rest); + - nullptr if an error was detected. +*/ +template +const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) { + const char16_t* end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::ELEMENTS * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::ELEMENTS); + + if (big_endian) { + in0 = in0.swap_bytes(); + in1 = in1.swap_bytes(); + } + + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + const auto in = simd16::pack(t0, t1); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + if (surrogates_bitmask == 0x0) { + input += simd16::ELEMENTS * 2; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate words + // V = not surrogates_wordmask + const uint32_t V = ~surrogates_bitmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint32_t H = vH.to_bitmask(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint32_t L = ~H & surrogates_bitmask; + + const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint32_t b = a << 1; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint32_t c = V | a | b; // Combine all the masks into the final one. + + if (c == 0xffffffff) { + // The whole input register contains valid UTF-16, i.e., + // either single words or proper surrogate pairs. + input += simd16::ELEMENTS * 2; + } else if (c == 0x7fffffff) { + // The 31 lower words of the input register contains valid UTF-16. + // The 31 word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += simd16::ELEMENTS * 2 - 1; + } else { + return nullptr; + } + } + } + + return input; +} + + +template +const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) { + const char16_t* start = input; + const char16_t* end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::ELEMENTS * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::ELEMENTS); + + if (big_endian) { + in0 = in0.swap_bytes(); + in1 = in1.swap_bytes(); + } + + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + const auto in = simd16::pack(t0, t1); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + if (surrogates_bitmask == 0x0) { + input += simd16::ELEMENTS * 2; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate words + // V = not surrogates_wordmask + const uint32_t V = ~surrogates_bitmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint32_t H = vH.to_bitmask(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint32_t L = ~H & surrogates_bitmask; + + const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint32_t b = a << 1; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint32_t c = V | a | b; // Combine all the masks into the final one. + + if (c == 0xffffffff) { + // The whole input register contains valid UTF-16, i.e., + // either single words or proper surrogate pairs. + input += simd16::ELEMENTS * 2; + } else if (c == 0x7fffffff) { + // The 31 lower words of the input register contains valid UTF-16. + // The 31 word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += simd16::ELEMENTS * 2 - 1; + } else { + return result(error_code::SURROGATE, input - start); + } + } + } + + return result(error_code::SUCCESS, input - start); +} +/* end file src/haswell/avx2_validate_utf16.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf32le.cpp +/* begin file src/haswell/avx2_validate_utf32le.cpp */ +/* Returns: + - pointer to the last unprocessed character (a scalar fallback should check the rest); + - nullptr if an error was detected. +*/ +const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) { + const char32_t* end = input + size; + + const __m256i standardmax = _mm256_set1_epi32(0x10ffff); + const __m256i offset = _mm256_set1_epi32(0xffff2000); + const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); + __m256i currentmax = _mm256_setzero_si256(); + __m256i currentoffsetmax = _mm256_setzero_si256(); + + while (input + 8 < end) { + const __m256i in = _mm256_loadu_si256((__m256i *)input); + currentmax = _mm256_max_epu32(in,currentmax); + currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); + input += 8; + } + __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); + if(_mm256_testz_si256(is_zero, is_zero) == 0) { + return nullptr; + } + + is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(_mm256_testz_si256(is_zero, is_zero) == 0) { + return nullptr; + } + + return input; +} + + +const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) { + const char32_t* start = input; + const char32_t* end = input + size; + + const __m256i standardmax = _mm256_set1_epi32(0x10ffff); + const __m256i offset = _mm256_set1_epi32(0xffff2000); + const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); + __m256i currentmax = _mm256_setzero_si256(); + __m256i currentoffsetmax = _mm256_setzero_si256(); + + while (input + 8 < end) { + const __m256i in = _mm256_loadu_si256((__m256i *)input); + currentmax = _mm256_max_epu32(in,currentmax); + currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); + + __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); + if(_mm256_testz_si256(is_zero, is_zero) == 0) { + return result(error_code::TOO_LARGE, input - start); + } + + is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(_mm256_testz_si256(is_zero, is_zero) == 0) { + return result(error_code::SURROGATE, input - start); + } + input += 8; + } + + return result(error_code::SUCCESS, input - start); +} +/* end file src/haswell/avx2_validate_utf32le.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp +/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + + +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { + // We process the data in chunks of 16 bytes. + __m256i ascii = _mm256_cvtepu8_epi16(in); + if (big_endian) { + const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + ascii = _mm256_shuffle_epi8(ascii, swap256); + } + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + return 12; + } + + const uint8_t idx = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // where pdep/pext is fast, we might be able to use a small lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + const __m128i composedminus = + _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); + const __m128i lowtenbits = + _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); + const __m128i hightenbits = _mm_srli_epi32(composedminus, 10); + const __m128i lowtenbitsadd = + _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); + const __m128i hightenbitsadd = + _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); + const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); + __m128i surrogates = + _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + uint32_t basic_buffer_swap[4]; + if (big_endian) { + _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap)); + surrogates = _mm_shuffle_epi8(surrogates, swap); + } + _mm_storeu_si128((__m128i *)basic_buffer, composed); + uint32_t surrogate_buffer[4]; + _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] < 65536) { + utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]); + utf16_output++; + } else { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf32.cpp +/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + + +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { + // We process the data in chunks of 16 bytes. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8))); + utf32_output += 16; // We wrote 16 32-bit characters. + return 16; // We consumed 16 bytes. + } + if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed)); + utf32_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // where pdep/pext is fast, we might be able to use a small lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed)); + utf32_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 3; + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp +/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit words. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit words + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit words, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + + +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) { + const char16_t* end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be useful. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); + const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); + utf8_output += 12; + buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); + const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); + const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + return std::make_pair(buf, utf8_output); +} + + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the error. + Otherwise, it is the position of the first unprocessed byte in buf (even if finished). + A scalar routing should carry on the conversion of the tail if needed. +*/ +template +std::pair avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) { + const char16_t* start = buf; + const char16_t* end = buf + len; + + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be useful. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); + const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); + utf8_output += 12; + buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); + const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); + const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf32.cpp +/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit words. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit words + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit words, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + + +/* + Returns a pair: the first unprocessed byte from buf and utf32_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) { + const char16_t* end = buf + len; + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + + while (buf + 16 <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: we extend all sixteen 16-bit words to sixteen 32-bit words + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1))); + utf32_output += 16; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word &0xF800 ) != 0xD800) { + // No surrogate pair + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf32_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(buf, utf32_output); +} + + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the error. + Otherwise, it is the position of the first unprocessed byte in buf (even if finished). + A scalar routing should carry on the conversion of the tail if needed. +*/ +template +std::pair avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) { + const char16_t* start = buf; + const char16_t* end = buf + len; + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + + while (buf + 16 <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: we extend all sixteen 16-bit words to sixteen 32-bit words + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1))); + utf32_output += 16; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word &0xF800 ) != 0xD800) { + // No surrogate pair + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); +} +/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf8.cpp +/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */ +std::pair avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) { + const char32_t* end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + __m256i running_max = _mm256_setzero_si256(); + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); + running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + + // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) + + if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + + const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be useful. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); + const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); + utf8_output += 12; + buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); + const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); + const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // may require large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { // 2-byte + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000 )==0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + // check for invalid input + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { + return std::make_pair(nullptr, utf8_output); + } + + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); } + + return std::make_pair(buf, utf8_output); +} + + +std::pair avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) { + const char32_t* end = buf + len; + const char32_t* start = buf; + + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); + // Check for too large input + const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); + if(static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); + } + + // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp) + + if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + + // Check for illegal surrogate words + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); + } + + const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be useful. + /*if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle); + const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1)); + utf8_output += 12; + buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1)); + const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2); + + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1)); + const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes. + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // may require large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { // 2-byte + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word & 0xFFFF0000 )==0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf16.cpp +/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */ +template +std::pair avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { + const char32_t* end = buf + len; + + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + __m256i forbidden_bytemask = _mm256_setzero_si256(); + + + while (buf + 8 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } + *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + // check for invalid input + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } + + return std::make_pair(buf, utf16_output); +} + + +template +std::pair avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { + const char32_t* start = buf; + const char32_t* end = buf + len; + + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 8 + safety_margin <= end) { + __m256i in = _mm256_loadu_si256((__m256i*)buf); + + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = static_cast(_mm256_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); + } + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1)); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } + *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +} +/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */ +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace haswell { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +/** + * Validates that the string is actual UTF-8 and stops on errors. + */ +template +result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if(c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input + count), length - count); + res.count += count; + return res; + } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_utf8_with_errors(const char * input, size_t length) { + return generic_validate_utf8_with_errors(reinterpret_cast(input),length); +} + +template +bool generic_validate_ascii(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); +} + +bool generic_validate_ascii(const char * input, size_t length) { + return generic_validate_ascii(reinterpret_cast(input),length); +} + +template +result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); + + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_ascii_with_errors(const char * input, size_t length) { + return generic_validate_ascii_with_errors(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + +template +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow path. + // Anything that is not a continuation mask is a 'leading byte', that is, the + // start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16(input + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + template + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + template + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if(pos < size) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf32 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char32_t* utf32_output) noexcept { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + size_t max_starting_point = (pos + 64) - 12; + while(pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32(input + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); + return utf32_output - start; +} + + +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf32 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if(howmany == 0) { return 0; } + utf32_output += howmany; + } + return utf32_output - start; + } + + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if(pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf32 namespace +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +// other functions +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { + return count_code_points(in, size); +} +} // utf8 namespace +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (big_endian) input.swap_bytes(); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (big_endian) input.swap_bytes(); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { + return count_code_points(in, size); +} + +simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { + size_t pos = 0; + + while (pos + 32 <= size) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf16.h */ + +namespace simdutf { +namespace haswell { + +simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + if (length % 2 == 0) { + return avx2_detect_encodings(input, length); + } else { + if (implementation::validate_utf8(input, length)) { + return simdutf::encoding_type::UTF8; + } else { + return simdutf::encoding_type::unspecified; + } + } +} + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8_with_errors(buf,len); +} + +simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_ascii(buf,len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_ascii_with_errors(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = avx2_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = avx2_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { + result res = avx2_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { + result res = avx2_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + const char32_t* tail = avx2_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + result res = avx2_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, + char32_t* utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = haswell::avx2_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = haswell::avx2_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = haswell::avx2_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = haswell::avx2_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = haswell::avx2_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = haswell::avx2_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = haswell::avx2_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = haswell::avx2_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf32_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = avx2_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = avx2_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = haswell::avx2_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = haswell::avx2_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80); + const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + size_t pos = 0; + size_t count = 0; + for(;pos + 8 <= length; pos += 8) { + __m256i in = _mm256_loadu_si256((__m256i*)(input + pos)); + const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000); + const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000); + const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask); + const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask); + const uint32_t ascii_bytes_bitmask = static_cast(_mm256_movemask_epi8(ascii_bytes_bytemask)); + const uint32_t two_bytes_bitmask = static_cast(_mm256_movemask_epi8(two_bytes_bytemask)); + const uint32_t three_bytes_bitmask = static_cast(_mm256_movemask_epi8(three_bytes_bytemask)); + + size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; + size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4; + size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4; + count += 32 - 3*ascii_count - 2*two_bytes_count - three_bytes_count; + } + return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + size_t pos = 0; + size_t count = 0; + for(;pos + 8 <= length; pos += 8) { + __m256i in = _mm256_loadu_si256((__m256i*)(input + pos)); + const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t surrogate_bitmask = static_cast(_mm256_movemask_epi8(surrogate_bytemask)); + size_t surrogate_count = (32-count_ones(surrogate_bitmask))/4; + count += 8 + surrogate_count; + } + return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf32_length_from_utf8(input, length); +} + +} // namespace haswell +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h +/* begin file src/simdutf/haswell/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/haswell/end.h */ +/* end file src/haswell/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=ppc64/implementation.cpp +/* begin file src/ppc64/implementation.cpp */ + + + + + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h +/* begin file src/simdutf/ppc64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "ppc64" +// #define SIMDUTF_IMPLEMENTATION ppc64 +/* end file src/simdutf/ppc64/begin.h */ +namespace simdutf { +namespace ppc64 { +namespace { +#ifndef SIMDUTF_PPC64_H +#error "ppc64.h must be included" +#endif +using namespace simd; + + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + // careful: 0x80 is not ascii. + return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere(); +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace ppc64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +/** + * Validates that the string is actual UTF-8 and stops on errors. + */ +template +result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if(c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input + count), length - count); + res.count += count; + return res; + } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_utf8_with_errors(const char * input, size_t length) { + return generic_validate_utf8_with_errors(reinterpret_cast(input),length); +} + +template +bool generic_validate_ascii(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); +} + +bool generic_validate_ascii(const char * input, size_t length) { + return generic_validate_ascii(reinterpret_cast(input),length); +} + +template +result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); + + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_ascii_with_errors(const char * input, size_t length) { + return generic_validate_ascii_with_errors(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + +template +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow path. + // Anything that is not a continuation mask is a 'leading byte', that is, the + // start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16(input + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + template + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + template + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if(pos < size) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf32 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char32_t* utf32_output) noexcept { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + size_t max_starting_point = (pos + 64) - 12; + while(pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32(input + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); + return utf32_output - start; +} + + +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf32 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if(howmany == 0) { return 0; } + utf32_output += howmany; + } + return utf32_output - start; + } + + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if(pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf32 namespace +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +// other functions +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { + return count_code_points(in, size); +} +} // utf8 namespace +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (big_endian) input.swap_bytes(); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (big_endian) input.swap_bytes(); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { + return count_code_points(in, size); +} + +simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { + size_t pos = 0; + + while (pos + 32 <= size) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf16.h */ + +// +// Implementation-specific overrides +// +namespace simdutf { +namespace ppc64 { + +simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + int out = 0; + if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } + if((length % 2) == 0) { + if(validate_utf16(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } + } + if((length % 4) == 0) { + if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } + } + + return out; +} + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf,len); +} + +simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_ascii(buf,len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept { + return scalar::utf32::validate(buf, len); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return result(error_code::OTHER, 0); // stub +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return result(error_code::OTHER, 0); // stub +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept { + return result(error_code::OTHER, 0); // stub +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid(buf, len, utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { + scalar::utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { + return scalar::utf32::utf8_length_from_utf32(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { + return scalar::utf32::utf16_length_from_utf32(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::utf32_length_from_utf8(input, length); +} + +} // namespace ppc64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h +/* begin file src/simdutf/ppc64/end.h */ +/* end file src/simdutf/ppc64/end.h */ +/* end file src/ppc64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/implementation.cpp +/* begin file src/westmere/implementation.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h +/* begin file src/simdutf/westmere/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "westmere" +// #define SIMDUTF_IMPLEMENTATION westmere +SIMDUTF_TARGET_WESTMERE +/* end file src/simdutf/westmere/begin.h */ +namespace simdutf { +namespace westmere { +namespace { +#ifndef SIMDUTF_WESTMERE_H +#error "westmere.h must be included" +#endif +using namespace simd; + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_detect_encodings.cpp +/* begin file src/westmere/sse_detect_encodings.cpp */ +template +// len is known to be a multiple of 2 when this is called +int sse_detect_encodings(const char * buf, size_t len) { + const char* start = buf; + const char* end = buf + len; + + bool is_utf8 = true; + bool is_utf16 = true; + bool is_utf32 = true; + + int out = 0; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + + __m128i currentmax = _mm_setzero_si128(); + + checker check{}; + + while(buf + 64 <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + __m128i secondin = _mm_loadu_si128((__m128i*)buf+1); + __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2); + __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3); + + const auto u0 = simd16(in); + const auto u1 = simd16(secondin); + const auto u2 = simd16(thirdin); + const auto u3 = simd16(fourthin); + + const auto v0 = u0.shr<8>(); + const auto v1 = u1.shr<8>(); + const auto v2 = u2.shr<8>(); + const auto v3 = u3.shr<8>(); + + const auto in16 = simd16::pack(v0, v1); + const auto nextin16 = simd16::pack(v2, v3); + + const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8; + const auto surrogates_wordmask1 = (nextin16 & v_f8) == v_d8; + uint16_t surrogates_bitmask0 = static_cast(surrogates_wordmask0.to_bitmask()); + uint16_t surrogates_bitmask1 = static_cast(surrogates_wordmask1.to_bitmask()); + + // Check for surrogates + if (surrogates_bitmask0 != 0x0 || surrogates_bitmask1 != 0x0) { + // Cannot be UTF8 + is_utf8 = false; + // Can still be either UTF-16LE or UTF-32LE depending on the positions of the surrogates + // To be valid UTF-32LE, a surrogate cannot be in the two most significant bytes of any 32-bit word. + // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant + // bytes of a 32-bit word since they always come in pairs in UTF-16LE. + // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit words. + + if (((surrogates_bitmask0 | surrogates_bitmask1) & 0xaaaa) != 0) { + is_utf32 = false; + // Code from sse_validate_utf16le.cpp + // Not efficient, we do not process surrogates_bitmask1 + const char16_t * input = reinterpret_cast(buf); + const char16_t* end16 = reinterpret_cast(start) + len/2; + + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + const uint16_t V0 = static_cast(~surrogates_bitmask0); + + const auto vH0 = (in16 & v_fc) == v_dc; + const uint16_t H0 = static_cast(vH0.to_bitmask()); + + const uint16_t L0 = static_cast(~H0 & surrogates_bitmask0); + + const uint16_t a0 = static_cast(L0 & (H0 >> 1)); + + const uint16_t b0 = static_cast(a0 << 1); + + const uint16_t c0 = static_cast(V0 | a0 | b0); + + if (c0 == 0xffff) { + input += 16; + } else if (c0 == 0x7fff) { + input += 15; + } else { + is_utf16 = false; + break; + } + + while (input + simd16::SIZE * 2 < end16) { + const auto in0 = simd16(input); + const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + const auto in_16 = simd16::pack(t0, t1); + + const auto surrogates_wordmask = (in_16 & v_f8) == v_d8; + const uint16_t surrogates_bitmask = static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0) { + input += 16; + } else { + const uint16_t V = static_cast(~surrogates_bitmask); + + const auto vH = (in_16 & v_fc) == v_dc; + const uint16_t H = static_cast(vH.to_bitmask()); + + const uint16_t L = static_cast(~H & surrogates_bitmask); + + const uint16_t a = static_cast(L & (H >> 1)); + + const uint16_t b = static_cast(a << 1); + + const uint16_t c = static_cast(V | a | b); + + if (c == 0xffff) { + input += 16; + } else if (c == 0x7fff) { + input += 15; + } else { + is_utf16 = false; + break; + } + } + } + } else { + is_utf16 = false; + // Check for UTF-32LE + if (len % 4 == 0) { + const char32_t * input = reinterpret_cast(buf); + const char32_t* end32 = reinterpret_cast(start) + len/4; + + // Must start checking for surrogates + __m128i currentoffsetmax = _mm_setzero_si128(); + const __m128i offset = _mm_set1_epi32(0xffff2000); + const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); + + currentmax = _mm_max_epu32(in, currentmax); + currentmax = _mm_max_epu32(secondin, currentmax); + currentmax = _mm_max_epu32(thirdin, currentmax); + currentmax = _mm_max_epu32(fourthin, currentmax); + + currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); + currentoffsetmax = _mm_max_epu32(_mm_add_epi32(secondin, offset), currentoffsetmax); + currentoffsetmax = _mm_max_epu32(_mm_add_epi32(thirdin, offset), currentoffsetmax); + currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax); + + while (input + 4 < end32) { + const __m128i in32 = _mm_loadu_si128((__m128i *)input); + currentmax = _mm_max_epu32(in32,currentmax); + currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax); + input += 4; + } + + __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) { + is_utf32 = false; + } + } else { + is_utf32 = false; + } + } + break; + } + // If no surrogate, validate under other encodings as well + + // UTF-32LE validation + currentmax = _mm_max_epu32(in, currentmax); + currentmax = _mm_max_epu32(secondin, currentmax); + currentmax = _mm_max_epu32(thirdin, currentmax); + currentmax = _mm_max_epu32(fourthin, currentmax); + + // UTF-8 validation + // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h + simd::simd8x64 in8(in, secondin, thirdin, fourthin); + check.check_next_input(in8); + + buf += 64; + } + + // Check which encodings are possible + + if (is_utf8) { + if (static_cast(buf - start) != len) { + uint8_t block[64]{}; + std::memset(block, 0x20, 64); + std::memcpy(block, buf, len - (buf - start)); + simd::simd8x64 in(block); + check.check_next_input(in); + } + if (!check.errors()) { + out |= simdutf::encoding_type::UTF8; + } + } + + if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { + out |= simdutf::encoding_type::UTF16_LE; + } + + if (is_utf32 && (len % 4 == 0)) { + const __m128i standardmax = _mm_set1_epi32(0x10ffff); + __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); + if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { + out |= simdutf::encoding_type::UTF32_LE; + } + } + + return out; +} +/* end file src/westmere/sse_detect_encodings.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf16.cpp +/* begin file src/westmere/sse_validate_utf16.cpp */ +/* + In UTF-16 words in range 0xD800 to 0xDFFF have special meaning. + + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. If none of highest nibbles + are 0xD (13), than we are sure that UTF-16 chunk in a vector + register is valid. + + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: + + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate + + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate + + We're going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 0xdfff) + + 0 1 2 3 4 5 6 7 <--- word index + [ V | L | H | L | H | V | V | L ] + 1 0 0 0 0 1 1 0 - V = valid masks + 0 1 0 1 0 0 0 1 - L = low surrogate + 0 0 1 0 1 0 0 0 - H high surrogate + + + 1 0 0 0 0 1 1 0 V = valid masks + 0 1 0 1 0 0 0 0 a = L & (H >> 1) + 0 0 1 0 1 0 0 0 b = a << 1 + 1 1 1 1 1 1 1 0 c = V | a | b + ^ + the last bit can be zero, we just consume 7 words + and recheck this word in the next iteration +*/ + +/* Returns: + - pointer to the last unprocessed character (a scalar fallback should check the rest); + - nullptr if an error was detected. +*/ +template +const char16_t* sse_validate_utf16(const char16_t* input, size_t size) { + const char16_t* end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::SIZE * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + if (big_endian) { + in0 = in0.swap_bytes(); + in1 = in1.swap_bytes(); + } + + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + const auto in = simd16::pack(t0, t1); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint16_t surrogates_bitmask = static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0000) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate words + // V = not surrogates_wordmask + const uint16_t V = static_cast(~surrogates_bitmask); + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint16_t H = static_cast(vH.to_bitmask()); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint16_t L = static_cast(~H & surrogates_bitmask); + + const uint16_t a = static_cast(L & (H >> 1)); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint16_t b = static_cast(a << 1); // Just mark that the opinput - startite fact is hold, + // thanks to that we have only two masks for valid case. + const uint16_t c = static_cast(V | a | b); // Combine all the masks into the final one. + + if (c == 0xffff) { + // The whole input register contains valid UTF-16, i.e., + // either single words or proper surrogate pairs. + input += 16; + } else if (c == 0x7fff) { + // The 15 lower words of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += 15; + } else { + return nullptr; + } + } + } + + return input; +} + + +template +const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) { + const char16_t* start = input; + const char16_t* end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::SIZE * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); + + if (big_endian) { + in0 = in0.swap_bytes(); + in1 = in1.swap_bytes(); + } + + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + const auto in = simd16::pack(t0, t1); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint16_t surrogates_bitmask = static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0000) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate words + // V = not surrogates_wordmask + const uint16_t V = static_cast(~surrogates_bitmask); + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint16_t H = static_cast(vH.to_bitmask()); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint16_t L = static_cast(~H & surrogates_bitmask); + + const uint16_t a = static_cast(L & (H >> 1)); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint16_t b = static_cast(a << 1); // Just mark that the opinput - startite fact is hold, + // thanks to that we have only two masks for valid case. + const uint16_t c = static_cast(V | a | b); // Combine all the masks into the final one. + + if (c == 0xffff) { + // The whole input register contains valid UTF-16, i.e., + // either single words or proper surrogate pairs. + input += 16; + } else if (c == 0x7fff) { + // The 15 lower words of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += 15; + } else { + return result(error_code::SURROGATE, input - start); + } + } + } + + return result(error_code::SUCCESS, input - start); +} +/* end file src/westmere/sse_validate_utf16.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf32le.cpp +/* begin file src/westmere/sse_validate_utf32le.cpp */ +/* Returns: + - pointer to the last unprocessed character (a scalar fallback should check the rest); + - nullptr if an error was detected. +*/ +const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) { + const char32_t* end = input + size; + + const __m128i standardmax = _mm_set1_epi32(0x10ffff); + const __m128i offset = _mm_set1_epi32(0xffff2000); + const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); + __m128i currentmax = _mm_setzero_si128(); + __m128i currentoffsetmax = _mm_setzero_si128(); + + while (input + 4 < end) { + const __m128i in = _mm_loadu_si128((__m128i *)input); + currentmax = _mm_max_epu32(in,currentmax); + currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); + input += 4; + } + __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); + if(_mm_test_all_zeros(is_zero, is_zero) == 0) { + return nullptr; + } + + is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(_mm_test_all_zeros(is_zero, is_zero) == 0) { + return nullptr; + } + + return input; +} + + +const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) { + const char32_t* start = input; + const char32_t* end = input + size; + + const __m128i standardmax = _mm_set1_epi32(0x10ffff); + const __m128i offset = _mm_set1_epi32(0xffff2000); + const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); + __m128i currentmax = _mm_setzero_si128(); + __m128i currentoffsetmax = _mm_setzero_si128(); + + while (input + 4 < end) { + const __m128i in = _mm_loadu_si128((__m128i *)input); + currentmax = _mm_max_epu32(in,currentmax); + currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); + + __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); + if(_mm_test_all_zeros(is_zero, is_zero) == 0) { + return result(error_code::TOO_LARGE, input - start); + } + + is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if(_mm_test_all_zeros(is_zero, is_zero) == 0) { + return result(error_code::SURROGATE, input - start); + } + input += 4; + } + + return result(error_code::SUCCESS, input - start); +} +/* end file src/westmere/sse_validate_utf32le.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp +/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + + +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { + // We process the data in chunks of 16 bytes. + __m128i ascii_first = _mm_cvtepu8_epi16(in); + __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8)); + if (big_endian) { + ascii_first = _mm_shuffle_epi8(ascii_first, swap); + ascii_second = _mm_shuffle_epi8(ascii_second, swap); + } + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // where pdep/pext is fast, we might be able to use a small lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + const __m128i composedminus = + _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); + const __m128i lowtenbits = + _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); + const __m128i hightenbits = _mm_srli_epi32(composedminus, 10); + const __m128i lowtenbitsadd = + _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); + const __m128i hightenbitsadd = + _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); + const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); + __m128i surrogates = + _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + uint32_t basic_buffer_swap[4]; + if (big_endian) { + _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap)); + surrogates = _mm_shuffle_epi8(surrogates, swap); + } + _mm_storeu_si128((__m128i *)basic_buffer, composed); + uint32_t surrogate_buffer[4]; + _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] < 65536) { + utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]); + utf16_output++; + } else { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf32.cpp +/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + + +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { + // We process the data in chunks of 16 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4))); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8))); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12))); + utf32_output += 16; // We wrote 16 32-bit characters. + return 16; // We consumed 16 bytes. + } + if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8))); + utf32_output += 8; // We wrote 32 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // where pdep/pext is fast, we might be able to use a small lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8))); + utf32_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 3; + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp +/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit words. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit words + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit words, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) { + + const char16_t* end = buf + len; + + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); + } + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); + if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + nextin = _mm_shuffle_epi8(nextin, swap); + } + if(!_mm_testz_si128(nextin, v_ff80)) { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in,in); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in,nextin); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + // no bits set above 7th bit + const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); + const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + + if (one_or_two_bytes_bitmask == 0xffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = _mm_and_si128(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = _mm_or_si128(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint16_t surrogates_bitmask = static_cast(_mm_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x0000) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = (one_byte_bitmask & 0x5555) | + (one_or_two_bytes_bitmask & 0xaaaa); + if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(buf, utf8_output); +} + + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the error. + Otherwise, it is the position of the first unprocessed byte in buf (even if finished). + A scalar routing should carry on the conversion of the tail if needed. +*/ +template +std::pair sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) { + const char16_t* start = buf; + const char16_t* end = buf + len; + + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); + } + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); + if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + nextin = _mm_shuffle_epi8(nextin, swap); + } + if(!_mm_testz_si128(nextin, v_ff80)) { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in,in); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in,nextin); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + // no bits set above 7th bit + const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); + const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + + if (one_or_two_bytes_bitmask == 0xffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = _mm_and_si128(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = _mm_or_si128(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint16_t surrogates_bitmask = static_cast(_mm_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x0000) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = (one_byte_bitmask & 0x5555) | + (one_or_two_bytes_bitmask & 0xaaaa); + if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf32.cpp +/* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit words. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit words + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit words, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) { + const char16_t* end = buf + len; + + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + + while (buf + 16 <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint16_t surrogates_bitmask = static_cast(_mm_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x0000) { + // case: no surrogate pair, extend 16-bit words to 32-bit words + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8))); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word &0xF800 ) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf32_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(buf, utf32_output); +} + + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the error. + Otherwise, it is the position of the first unprocessed byte in buf (even if finished). + A scalar routing should carry on the conversion of the tail if needed. +*/ +template +std::pair sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) { + const char16_t* start = buf; + const char16_t* end = buf + len; + + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + + while (buf + 16 <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint16_t surrogates_bitmask = static_cast(_mm_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. However, + // it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x0000) { + // case: no surrogate pair, extend 16-bit words to 32-bit words + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8))); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if((word &0xF800 ) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); +} +/* end file src/westmere/sse_convert_utf16_to_utf32.cpp */ + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf8.cpp +/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */ +std::pair sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) { + + const char32_t* end = buf + len; + + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); + const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); + __m128i running_max = _mm_setzero_si128(); + __m128i forbidden_bytemask = _mm_setzero_si128(); + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + running_max = _mm_max_epu32(_mm_max_epu32(in, running_max), nextin); + + // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff)); + + // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp + + // Check for ASCII fast path + if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!! + __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2); + __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3); + running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin); + __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff)); + if(!_mm_testz_si128(nextin_16, v_ff80)) { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + // Proceed with next input + in_16 = nextin_16; + } else { + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + // no bits set above 7th bit + const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000); + const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + + if (one_or_two_bytes_bitmask == 0xffff) { + // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes) + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = _mm_and_si128(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = _mm_or_si128(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } + + + // Check for overflow in packing + const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffff) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800)); + + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = (one_byte_bitmask & 0x5555) | + (one_or_two_bytes_bitmask & 0xaaaa); + if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + } else { + // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xFFFF0000 )==0) { + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + // check for invalid input + const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); + if(static_cast(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) { + return std::make_pair(nullptr, utf8_output); + } + + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); } + + return std::make_pair(buf, utf8_output); +} + + +std::pair sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) { + + const char32_t* end = buf + len; + const char32_t* start = buf; + + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); + const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); + const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); + + const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + + // Check for too large input + __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff); + if(static_cast(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); + } + + // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation + __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff)); + + // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp + + // Check for ASCII fast path + if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!! + __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2); + __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3); + __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff)); + if(!_mm_testz_si128(nextin_16, v_ff80)) { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + // Proceed with next input + in_16 = nextin_16; + __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff); + if(static_cast(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); + } + } else { + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + // no bits set above 7th bit + const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000); + const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8(one_byte_bytemask)); + + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + + if (one_or_two_bytes_bitmask == 0xffff) { + // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes) + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = _mm_and_si128(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = _mm_or_si128(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + + // 5. store bytes + _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } + + + // Check for overflow in packing + const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffff) { + // case: words from register produce either 1, 2 or 3 UTF-8 bytes + + // Check for illegal surrogate words + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800); + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); + } + + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = (one_byte_bitmask & 0x5555) | + (one_or_two_bytes_bitmask & 0xaaaa); + if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + } else { + // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFFFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xFFFFF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xFFFF0000 )==0) { + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); } + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); } + *utf8_output++ = char((word>>18) | 0b11110000); + *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf16.cpp +/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */ +template +std::pair sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { + + const char32_t* end = buf + len; + + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); + __m128i forbidden_bytemask = _mm_setzero_si128(); + + while (buf + 8 <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); + + // Check if no bits set above 16th + if (saturation_bitmask == 0xffff) { + // Pack UTF-32 to UTF-16 + __m128i utf16_packed = _mm_packus_epi32(in, nextin); + + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800)); + + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } + *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + // check for invalid input + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } + + return std::make_pair(buf, utf16_output); +} + + +template +std::pair sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { + const char32_t* start = buf; + const char32_t* end = buf + len; + + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); + + while (buf + 8 <= end) { + __m128i in = _mm_loadu_si128((__m128i*)buf); + __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); + const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = static_cast(_mm_movemask_epi8(saturation_bytemask)); + + // Check if no bits set above 16th + if (saturation_bitmask == 0xffff) { + // Pack UTF-32 to UTF-16 + __m128i utf16_packed = _mm_packus_epi32(in, nextin); + + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800); + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); + } + + if (big_endian) { + const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + + _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint32_t word = buf[k]; + if((word & 0xFFFF0000)==0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } + *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +} +/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */ + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace westmere { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +/** + * Validates that the string is actual UTF-8 and stops on errors. + */ +template +result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if(c.errors()) { + if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input + count), length - count); + res.count += count; + return res; + } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_utf8_with_errors(const char * input, size_t length) { + return generic_validate_utf8_with_errors(reinterpret_cast(input),length); +} + +template +bool generic_validate_ascii(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); +} + +bool generic_validate_ascii(const char * input, size_t length) { + return generic_validate_ascii(reinterpret_cast(input),length); +} + +template +result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); + + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors(reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_ascii_with_errors(const char * input, size_t length) { + return generic_validate_ascii_with_errors(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + +template +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow path. + // Anything that is not a continuation mask is a 'leading byte', that is, the + // start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16(input + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + template + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + template + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if(pos < size) { + result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf32 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char32_t* utf32_output) noexcept { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if(in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + size_t max_starting_point = (pos + 64) - 12; + while(pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32(input + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); + return utf32_output - start; +} + + +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf32 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if(howmany == 0) { return 0; } + utf32_output += howmany; + } + return utf32_output - start; + } + + simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) { + size_t pos = 0; + char32_t* start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32(in + pos, + utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if(pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf32 namespace +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +// other functions +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) { + return count_code_points(in, size); +} +} // utf8 namespace +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (big_endian) input.swap_bytes(); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (big_endian) input.swap_bytes(); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) { + return count_code_points(in, size); +} + +simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) { + size_t pos = 0; + + while (pos + 32 <= size) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf16.h */ +// +// Implementation-specific overrides +// + +namespace simdutf { +namespace westmere { + +simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + if (length % 2 == 0) { + return sse_detect_encodings(input, length); + } else { + if (implementation::validate_utf8(input, length)) { + return simdutf::encoding_type::UTF8; + } else { + return simdutf::encoding_type::unspecified; + } + } +} + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return westmere::utf8_validation::generic_validate_utf8(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept { + return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len); +} + +simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return westmere::utf8_validation::generic_validate_ascii(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept { + return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = sse_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = sse_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept { + result res = sse_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept { + result res = sse_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + const char32_t* tail = sse_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + result res = sse_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size, + char32_t* utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = sse_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = sse_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = westmere::sse_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = westmere::sse_convert_utf16_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = sse_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = sse_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + std::pair ret = sse_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = westmere::sse_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = westmere::sse_convert_utf16_to_utf32_with_errors(buf, len, utf32_output); + if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf32_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = sse_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + std::pair ret = sse_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = westmere::sse_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of words written even if finished + std::pair ret = westmere::sse_convert_utf32_to_utf16_with_errors(buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const __m128i v_00000000 = _mm_setzero_si128(); + const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80); + const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800); + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); + size_t pos = 0; + size_t count = 0; + for(;pos + 4 <= length; pos += 4) { + __m128i in = _mm_loadu_si128((__m128i*)(input + pos)); + const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000); + const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000); + const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask); + const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); + const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask); + const uint16_t ascii_bytes_bitmask = static_cast(_mm_movemask_epi8(ascii_bytes_bytemask)); + const uint16_t two_bytes_bitmask = static_cast(_mm_movemask_epi8(two_bytes_bytemask)); + const uint16_t three_bytes_bitmask = static_cast(_mm_movemask_epi8(three_bytes_bytemask)); + + size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; + size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4; + size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4; + count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count; + } + return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept { + const __m128i v_00000000 = _mm_setzero_si128(); + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); + size_t pos = 0; + size_t count = 0; + for(;pos + 4 <= length; pos += 4) { + __m128i in = _mm_loadu_si128((__m128i*)(input + pos)); + const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); + const uint16_t surrogate_bitmask = static_cast(_mm_movemask_epi8(surrogate_bytemask)); + size_t surrogate_count = (16-count_ones(surrogate_bitmask))/4; + count += 4 + surrogate_count; + } + return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf32_length_from_utf8(input, length); +} + +} // namespace westmere +} // namespace simdutf + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h +/* begin file src/simdutf/westmere/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/westmere/end.h */ +/* end file src/westmere/implementation.cpp */ +#endif + +SIMDUTF_POP_DISABLE_WARNINGS +/* end file src/simdutf.cpp */ diff --git a/deps/simdutf/simdutf.gyp b/deps/simdutf/simdutf.gyp new file mode 100644 index 000000000000000..fe57c680a17d57f --- /dev/null +++ b/deps/simdutf/simdutf.gyp @@ -0,0 +1,123 @@ +{ + 'variables': { + 'arm_fpu%': '', + 'target_arch%': '', + }, + + 'targets': [ + { + 'target_name': 'simdutf', + 'type': 'static_library', + 'include_dirs': ['.'], + 'sources': ['simdutf.cpp'], + 'conditions': [ + [ 'arm_fpu=="neon" and target_arch=="arm"', { + 'dependencies': [ 'simdutf_neon32' ], + }], + + # arm64 requires NEON, so it's safe to always use it + [ 'target_arch=="arm64"', { + 'dependencies': [ 'simdutf_neon64' ], + }], + + # Runtime detection will happen for x86 CPUs + [ 'target_arch in "ia32 x64 x32"', { + 'dependencies': [ + 'simdutf_sse42', + 'simdutf_avx', + 'simdutf_avx2', + ], + }], + ], + }, + + { + 'target_name': 'simdutf_neon32', + 'type': 'static_library', + 'include_dirs': ['.'], + 'sources': ['simdutf.cpp'], + 'cflags': [ '-Wno-unused-function' ], + 'conditions': [ + [ 'OS!="win"', { + 'cflags': [ '-mfpu=neon' ], + 'xcode_settings': { + 'OTHER_CFLAGS': [ '-mfpu=neon' ] + }, + }], + ], + }, + + { + 'target_name': 'simdutf_neon64', + 'type': 'static_library', + 'include_dirs': ['.'], + 'sources': ['simdutf.cpp'], + 'cflags': [ '-Wno-unused-function' ], + # NEON is required in arm64, so no -mfpu flag is needed + }, + + { + 'target_name': 'simdutf_avx', + 'type': 'static_library', + 'include_dirs': ['.'], + 'sources': ['simdutf.cpp'], + 'cflags': [ '-Wno-unused-function' ], + 'conditions': [ + [ 'OS!="win"', { + 'cflags': [ '-mavx' ], + 'xcode_settings': { + 'OTHER_CFLAGS': [ '-mavx' ] + }, + }, { + 'msvs_settings': { + 'VCCLCompilerTool': { + 'AdditionalOptions': [ + '/arch:AVX' + ], + }, + }, + }], + ], + }, + + { + 'target_name': 'simdutf_avx2', + 'type': 'static_library', + 'include_dirs': ['.'], + 'sources': ['simdutf.cpp'], + 'cflags': [ '-Wno-unused-function' ], + 'conditions': [ + [ 'OS!="win"', { + 'cflags': [ '-mavx2' ], + 'xcode_settings': { + 'OTHER_CFLAGS': [ '-mavx2' ] + }, + }, { + 'msvs_settings': { + 'VCCLCompilerTool': { + 'AdditionalOptions': [ + '/arch:AVX2' + ], + }, + }, + }], + ], + }, + + { + 'target_name': 'simdutf_sse42', + 'type': 'static_library', + 'include_dirs': ['.'], + 'sources': ['simdutf.cpp'], + 'cflags': [ '-Wno-unused-function' ], + 'conditions': [ + [ 'OS!="win"', { + 'cflags': [ '-msse4.2' ], + 'xcode_settings': { + 'OTHER_CFLAGS': [ '-msse4.2' ] + }, + }], + ], + }, + ] +} diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h new file mode 100644 index 000000000000000..a9527fe4cb7784e --- /dev/null +++ b/deps/simdutf/simdutf.h @@ -0,0 +1,2435 @@ +/* auto-generated on 2022-12-08 17:36:59 +0000. Do not edit! */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf.h +/* begin file include/simdutf.h */ +#ifndef SIMDUTF_H +#define SIMDUTF_H +#include + +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/compiler_check.h +/* begin file include/simdutf/compiler_check.h */ +#ifndef SIMDUTF_COMPILER_CHECK_H +#define SIMDUTF_COMPILER_CHECK_H + +#ifndef __cplusplus +#error simdutf requires a C++ compiler +#endif + +#ifndef SIMDUTF_CPLUSPLUS +#if defined(_MSVC_LANG) && !defined(__clang__) +#define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG) +#else +#define SIMDUTF_CPLUSPLUS __cplusplus +#endif +#endif + +// C++ 17 +#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L) +#define SIMDUTF_CPLUSPLUS17 1 +#endif + +// C++ 14 +#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L) +#define SIMDUTF_CPLUSPLUS14 1 +#endif + +// C++ 11 +#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L) +#define SIMDUTF_CPLUSPLUS11 1 +#endif + +#ifndef SIMDUTF_CPLUSPLUS11 +#error simdutf requires a compiler compliant with the C++11 standard +#endif + +#endif // SIMDUTF_COMPILER_CHECK_H +/* end file include/simdutf/compiler_check.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/common_defs.h +/* begin file include/simdutf/common_defs.h */ +#ifndef SIMDUTF_COMMON_DEFS_H +#define SIMDUTF_COMMON_DEFS_H + +#include +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/portability.h +/* begin file include/simdutf/portability.h */ +#ifndef SIMDUTF_PORTABILITY_H +#define SIMDUTF_PORTABILITY_H + +#include +#include +#include +#include +#include +#ifndef _WIN32 +// strcasecmp, strncasecmp +#include +#endif + +#ifdef _MSC_VER +#define SIMDUTF_VISUAL_STUDIO 1 +/** + * We want to differentiate carefully between + * clang under visual studio and regular visual + * studio. + * + * Under clang for Windows, we enable: + * * target pragmas so that part and only part of the + * code gets compiled for advanced instructions. + * + */ +#ifdef __clang__ +// clang under visual studio +#define SIMDUTF_CLANG_VISUAL_STUDIO 1 +#else +// just regular visual studio (best guess) +#define SIMDUTF_REGULAR_VISUAL_STUDIO 1 +#endif // __clang__ +#endif // _MSC_VER + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +// https://en.wikipedia.org/wiki/C_alternative_tokens +// This header should have no effect, except maybe +// under Visual Studio. +#include +#endif + +#if defined(__x86_64__) || defined(_M_AMD64) +#define SIMDUTF_IS_X86_64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define SIMDUTF_IS_ARM64 1 +#elif defined(__PPC64__) || defined(_M_PPC64) +//#define SIMDUTF_IS_PPC64 1 +#pragma message("The simdutf library does yet support SIMD acceleration under\ +POWER processors. Please see https://github.com/lemire/simdutf/issues/51") +#else +// The simdutf library is designed +// for 64-bit processors and it seems that you are not +// compiling for a known 64-bit platform. Please +// use a 64-bit target such as x64 or 64-bit ARM for best performance. +#define SIMDUTF_IS_32BITS 1 + +// We do not support 32-bit platforms, but it can be +// handy to identify them. +#if defined(_M_IX86) || defined(__i386__) +#define SIMDUTF_IS_X86_32BITS 1 +#elif defined(__arm__) || defined(_M_ARM) +#define SIMDUTF_IS_ARM_32BITS 1 +#elif defined(__PPC__) || defined(_M_PPC) +#define SIMDUTF_IS_PPC_32BITS 1 +#endif + +#endif // defined(__x86_64__) || defined(_M_AMD64) + +#ifdef SIMDUTF_IS_32BITS +#ifndef SIMDUTF_NO_PORTABILITY_WARNING +#pragma message("The simdutf library is designed \ +for 64-bit processors and it seems that you are not \ +compiling for a known 64-bit platform. All fast kernels \ +will be disabled and performance may be poor. Please \ +use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.") +#endif // SIMDUTF_NO_PORTABILITY_WARNING +#endif // SIMDUTF_IS_32BITS + +// this is almost standard? +#define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a +#define SIMDUTF_STRINGIFY(a) SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) + +// Our fast kernels require 64-bit systems. +// +// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions. +// Furthermore, the number of SIMD registers is reduced. +// +// On 32-bit ARM, we would have smaller registers. +// +// The simdutf users should still have the fallback kernel. It is +// slower, but it should run everywhere. + +// +// Enable valid runtime implementations, and select SIMDUTF_BUILTIN_IMPLEMENTATION +// + +// We are going to use runtime dispatch. +#ifdef SIMDUTF_IS_X86_64 +#ifdef __clang__ +// clang does not have GCC push pop +// warning: clang attribute push can't be used within a namespace in clang up +// til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a +// namespace. +#define SIMDUTF_TARGET_REGION(T) \ + _Pragma(SIMDUTF_STRINGIFY( \ + clang attribute push(__attribute__((target(T))), apply_to = function))) +#define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop") +#elif defined(__GNUC__) +// GCC is easier +#define SIMDUTF_TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T))) +#define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options") +#endif // clang then gcc + +#endif // x86 + +// Default target region macros don't do anything. +#ifndef SIMDUTF_TARGET_REGION +#define SIMDUTF_TARGET_REGION(T) +#define SIMDUTF_UNTARGET_REGION +#endif + +// Is threading enabled? +#if defined(_REENTRANT) || defined(_MT) +#ifndef SIMDUTF_THREADS_ENABLED +#define SIMDUTF_THREADS_ENABLED +#endif +#endif + +// workaround for large stack sizes under -O0. +// https://github.com/simdutf/simdutf/issues/691 +#ifdef __APPLE__ +#ifndef __OPTIMIZE__ +// Apple systems have small stack sizes in secondary threads. +// Lack of compiler optimization may generate high stack usage. +// Users may want to disable threads for safety, but only when +// in debug mode which we detect by the fact that the __OPTIMIZE__ +// macro is not defined. +#undef SIMDUTF_THREADS_ENABLED +#endif +#endif + +#ifdef SIMDUTF_VISUAL_STUDIO +// This is one case where we do not distinguish between +// regular visual studio and clang under visual studio. +// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has) +#define simdutf_strcasecmp _stricmp +#define simdutf_strncasecmp _strnicmp +#else +// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8). +// So they are only useful for ASCII in our context. +// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings +#define simdutf_strcasecmp strcasecmp +#define simdutf_strncasecmp strncasecmp +#endif + +#ifdef NDEBUG + +#ifdef SIMDUTF_VISUAL_STUDIO +#define SIMDUTF_UNREACHABLE() __assume(0) +#define SIMDUTF_ASSUME(COND) __assume(COND) +#else +#define SIMDUTF_UNREACHABLE() __builtin_unreachable(); +#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0) +#endif + +#else // NDEBUG + +#define SIMDUTF_UNREACHABLE() assert(0); +#define SIMDUTF_ASSUME(COND) assert(COND) + +#endif + +#endif // SIMDUTF_PORTABILITY_H +/* end file include/simdutf/portability.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/avx512.h +/* begin file include/simdutf/avx512.h */ +#ifndef SIMDUTF_AVX512_H_ +#define SIMDUTF_AVX512_H_ + +/* + It's possible to override AVX512 settings with cmake DCMAKE_CXX_FLAGS. + + All preprocessor directives has form `SIMDUTF_HAS_AVX512{feature}`, + where a feature is a code name for extensions. + + Please see the listing below to find which are supported. +*/ + +#ifndef SIMDUTF_HAS_AVX512F +# if defined(__AVX512F__) && __AVX512F__ == 1 +# define SIMDUTF_HAS_AVX512F 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512DQ +# if defined(__AVX512DQ__) && __AVX512DQ__ == 1 +# define SIMDUTF_HAS_AVX512DQ 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512IFMA +# if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1 +# define SIMDUTF_HAS_AVX512IFMA 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512CD +# if defined(__AVX512CD__) && __AVX512CD__ == 1 +# define SIMDUTF_HAS_AVX512CD 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512BW +# if defined(__AVX512BW__) && __AVX512BW__ == 1 +# define SIMDUTF_HAS_AVX512BW 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512VL +# if defined(__AVX512VL__) && __AVX512VL__ == 1 +# define SIMDUTF_HAS_AVX512VL 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512VBMI +# if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1 +# define SIMDUTF_HAS_AVX512VBMI 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512VBMI2 +# if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1 +# define SIMDUTF_HAS_AVX512VBMI2 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512VNNI +# if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1 +# define SIMDUTF_HAS_AVX512VNNI 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512BITALG +# if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1 +# define SIMDUTF_HAS_AVX512BITALG 1 +# endif +#endif + +#ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ +# if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1 +# define SIMDUTF_HAS_AVX512VPOPCNTDQ 1 +# endif +#endif + +#endif // SIMDUTF_AVX512_H_ +/* end file include/simdutf/avx512.h */ + + +#if defined(__GNUC__) + // Marks a block with a name so that MCA analysis can see it. + #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name); + #define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name); + #define SIMDUTF_DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name); +#else + #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) + #define SIMDUTF_END_DEBUG_BLOCK(name) + #define SIMDUTF_DEBUG_BLOCK(name, block) +#endif + +// Align to N-byte boundary +#define SIMDUTF_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) +#define SIMDUTF_ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) + +#define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) + +#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO) + + #define simdutf_really_inline __forceinline + #define simdutf_never_inline __declspec(noinline) + + #define simdutf_unused + #define simdutf_warn_unused + + #ifndef simdutf_likely + #define simdutf_likely(x) x + #endif + #ifndef simdutf_unlikely + #define simdutf_unlikely(x) x + #endif + + #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push )) + #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 )) + #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER )) + // Get rid of Intellisense-only warnings (Code Analysis) + // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910). + #ifdef __has_include + #if __has_include() + #include + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS) + #endif + #endif + + #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS + #endif + + #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996) + #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING + #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop )) + +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + + #define simdutf_really_inline inline __attribute__((always_inline)) + #define simdutf_never_inline inline __attribute__((noinline)) + + #define simdutf_unused __attribute__((unused)) + #define simdutf_warn_unused __attribute__((warn_unused_result)) + + #ifndef simdutf_likely + #define simdutf_likely(x) __builtin_expect(!!(x), 1) + #endif + #ifndef simdutf_unlikely + #define simdutf_unlikely(x) __builtin_expect(!!(x), 0) + #endif + + #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push") + // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary + #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \ + SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wall) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \ + SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable) + #define SIMDUTF_PRAGMA(P) _Pragma(#P) + #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING) + #if defined(SIMDUTF_CLANG_VISUAL_STUDIO) + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include) + #else + #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS + #endif + #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations) + #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow) + #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop") + + + +#endif // MSC_VER + +#if defined(SIMDUTF_VISUAL_STUDIO) + /** + * It does not matter here whether you are using + * the regular visual studio or clang under visual + * studio. + */ + #if SIMDUTF_USING_LIBRARY + #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport) + #else + #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport) + #endif +#else + #define SIMDUTF_DLLIMPORTEXPORT +#endif + +/// If EXPR is an error, returns it. +#define SIMDUTF_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } + + +#endif // SIMDUTF_COMMON_DEFS_H +/* end file include/simdutf/common_defs.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/encoding_types.h +/* begin file include/simdutf/encoding_types.h */ +#include + +namespace simdutf { + +enum encoding_type { + UTF8 = 1, // BOM 0xef 0xbb 0xbf + UTF16_LE = 2, // BOM 0xff 0xfe + UTF16_BE = 4, // BOM 0xfe 0xff + UTF32_LE = 8, // BOM 0xff 0xfe 0x00 0x00 + UTF32_BE = 16, // BOM 0x00 0x00 0xfe 0xff + + unspecified = 0 +}; + +enum endianness { + LITTLE, + BIG +}; + +std::string to_string(encoding_type bom); + +// Note that BOM for UTF8 is discouraged. +namespace BOM { + +/** + * Checks for a BOM. If not, returns unspecified + * @param input the string to process + * @param length the length of the string in words + * @return the corresponding encoding + */ + +encoding_type check_bom(const uint8_t* byte, size_t length); +encoding_type check_bom(const char* byte, size_t length); +/** + * Returns the size, in bytes, of the BOM for a given encoding type. + * Note that UTF8 BOM are discouraged. + * @param bom the encoding type + * @return the size in bytes of the corresponding BOM + */ +size_t bom_byte_size(encoding_type bom); + +} // BOM namespace +} // simdutf namespace +/* end file include/simdutf/encoding_types.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/error.h +/* begin file include/simdutf/error.h */ +#ifndef ERROR_H +#define ERROR_H +namespace simdutf { + +enum error_code { + SUCCESS = 0, + HEADER_BITS, // Any byte must have fewer than 5 header bits. + TOO_SHORT, // The leading byte must be followed by N-1 continuation bytes, where N is the UTF-8 character length + // This is also the error when the input is truncated. + TOO_LONG, // The leading byte must not be a continuation byte. + OVERLONG, // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters, + // and U+FFFF for four-byte characters. + TOO_LARGE, // The decoded character must be less than or equal to U+10FFFF OR less than or equal than U+7F for ASCII. + SURROGATE, // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR + // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) + OTHER // Not related to validation/transcoding. +}; + +struct result { + error_code error; + size_t count; // In case of error, indicates the position of the error. In case of success, indicates the number of words validated/written. + + simdutf_really_inline result(); + + simdutf_really_inline result(error_code, size_t); +}; + +} +#endif +/* end file include/simdutf/error.h */ + +SIMDUTF_PUSH_DISABLE_WARNINGS +SIMDUTF_DISABLE_UNDESIRED_WARNINGS + +// Public API +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/simdutf_version.h +/* begin file include/simdutf/simdutf_version.h */ +// /include/simdutf/simdutf_version.h automatically generated by release.py, +// do not change by hand +#ifndef SIMDUTF_SIMDUTF_VERSION_H +#define SIMDUTF_SIMDUTF_VERSION_H + +/** The version of simdutf being used (major.minor.revision) */ +#define SIMDUTF_VERSION 1.0.1 + +namespace simdutf { +enum { + /** + * The major version (MAJOR.minor.revision) of simdutf being used. + */ + SIMDUTF_VERSION_MAJOR = 1, + /** + * The minor version (major.MINOR.revision) of simdutf being used. + */ + SIMDUTF_VERSION_MINOR = 0, + /** + * The revision (major.minor.REVISION) of simdutf being used. + */ + SIMDUTF_VERSION_REVISION = 1 +}; +} // namespace simdutf + +#endif // SIMDUTF_SIMDUTF_VERSION_H +/* end file include/simdutf/simdutf_version.h */ +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/implementation.h +/* begin file include/simdutf/implementation.h */ +#ifndef SIMDUTF_IMPLEMENTATION_H +#define SIMDUTF_IMPLEMENTATION_H +#include +#if !defined(SIMDUTF_NO_THREADS) +#include +#endif +#include +#include +// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/internal/isadetection.h +/* begin file include/simdutf/internal/isadetection.h */ +/* From +https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h +Highly modified. + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, +Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute +(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, +Samy Bengio, Johnny Mariethoz) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories +America and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef SIMDutf_INTERNAL_ISADETECTION_H +#define SIMDutf_INTERNAL_ISADETECTION_H + +#include +#include +#if defined(_MSC_VER) +#include +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) +#include +#endif + +namespace simdutf { +namespace internal { + +enum instruction_set { + DEFAULT = 0x0, + NEON = 0x1, + AVX2 = 0x4, + SSE42 = 0x8, + PCLMULQDQ = 0x10, + BMI1 = 0x20, + BMI2 = 0x40, + ALTIVEC = 0x80, + AVX512F = 0x100, + AVX512DQ = 0x200, + AVX512IFMA = 0x400, + AVX512PF = 0x800, + AVX512ER = 0x1000, + AVX512CD = 0x2000, + AVX512BW = 0x4000, + AVX512VL = 0x8000, + AVX512VBMI2 = 0x10000 +}; + +#if defined(__PPC64__) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::ALTIVEC; +} + +#elif defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64 + +#if defined(__ARM_NEON) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::NEON; +} + +#else // ARM without NEON + +static inline uint32_t detect_supported_architectures() { + return instruction_set::DEFAULT; +} + +#endif + +#elif defined(__x86_64__) || defined(_M_AMD64) // x64 + + +namespace { +namespace cpuid_bit { + // Can be found on Intel ISA Reference for CPUID + + // EAX = 0x01 + constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit 1 of ECX for EAX=0x1 + constexpr uint32_t sse42 = uint32_t(1) << 20; ///< @private bit 20 of ECX for EAX=0x1 + + // EAX = 0x7f (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf) + // See: "Table 3-8. Information Returned by CPUID Instruction" + namespace ebx { + constexpr uint32_t bmi1 = uint32_t(1) << 3; + constexpr uint32_t avx2 = uint32_t(1) << 5; + constexpr uint32_t bmi2 = uint32_t(1) << 8; + constexpr uint32_t avx512f = uint32_t(1) << 16; + constexpr uint32_t avx512dq = uint32_t(1) << 17; + constexpr uint32_t avx512ifma = uint32_t(1) << 21; + constexpr uint32_t avx512cd = uint32_t(1) << 28; + constexpr uint32_t avx512bw = uint32_t(1) << 30; + constexpr uint32_t avx512vl = uint32_t(1) << 31; + } + + namespace ecx { + constexpr uint32_t avx512vbmi = uint32_t(1) << 1; + constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6; + constexpr uint32_t avx512vnni = uint32_t(1) << 11; + constexpr uint32_t avx512bitalg = uint32_t(1) << 12; + constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14; + } + namespace edx { + constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8; + } + } +} + + + +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx) { +#if defined(_MSC_VER) + int cpu_info[4]; + __cpuid(cpu_info, *eax); + *eax = cpu_info[0]; + *ebx = cpu_info[1]; + *ecx = cpu_info[2]; + *edx = cpu_info[3]; +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) + uint32_t level = *eax; + __get_cpuid(level, eax, ebx, ecx, edx); +#else + uint32_t a = *eax, b, c = *ecx, d; + asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); + *eax = a; + *ebx = b; + *ecx = c; + *edx = d; +#endif +} + +static inline uint32_t detect_supported_architectures() { + uint32_t eax; + uint32_t ebx = 0; + uint32_t ecx = 0; + uint32_t edx = 0; + uint32_t host_isa = 0x0; + + // EBX for EAX=0x1 + eax = 0x1; + cpuid(&eax, &ebx, &ecx, &edx); + + if (ecx & cpuid_bit::sse42) { + host_isa |= instruction_set::SSE42; + } + + if (ecx & cpuid_bit::pclmulqdq) { + host_isa |= instruction_set::PCLMULQDQ; + } + + // ECX for EAX=0x7 + eax = 0x7; + ecx = 0x0; // Sub-leaf = 0 + cpuid(&eax, &ebx, &ecx, &edx); + if (ebx & cpuid_bit::ebx::avx2) { + host_isa |= instruction_set::AVX2; + } + if (ebx & cpuid_bit::ebx::bmi1) { + host_isa |= instruction_set::BMI1; + } + if (ebx & cpuid_bit::ebx::bmi2) { + host_isa |= instruction_set::BMI2; + } + if (ebx & cpuid_bit::ebx::avx512f) { + host_isa |= instruction_set::AVX512F; + } + if (ebx & cpuid_bit::ebx::avx512bw) { + host_isa |= instruction_set::AVX512BW; + } + if (ebx & cpuid_bit::ebx::avx512cd) { + host_isa |= instruction_set::AVX512CD; + } + if (ebx & cpuid_bit::ebx::avx512dq) { + host_isa |= instruction_set::AVX512DQ; + } + if (ebx & cpuid_bit::ebx::avx512vl) { + host_isa |= instruction_set::AVX512VL; + } + if (ecx & cpuid_bit::ecx::avx512vbmi2) { + host_isa |= instruction_set::AVX512VBMI2; + } + return host_isa; +} +#else // fallback + + +static inline uint32_t detect_supported_architectures() { + return instruction_set::DEFAULT; +} + + +#endif // end SIMD extension detection code + +} // namespace internal +} // namespace simdutf + +#endif // SIMDutf_INTERNAL_ISADETECTION_H +/* end file include/simdutf/internal/isadetection.h */ + + +namespace simdutf { + +/** + * Autodetect the encoding of the input, a single encoding is recommended. + * E.g., the function might return simdutf::encoding_type::UTF8, + * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or + * simdutf::encoding_type::UTF32_LE. + * + * @param input the string to analyze. + * @param length the length of the string in bytes. + * @return the detected encoding type + */ +simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept; +simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept { + return autodetect_encoding(reinterpret_cast(input), length); +} + +/** + * Autodetect the possible encodings of the input in one pass. + * E.g., if the input might be UTF-16LE or UTF-8, this function returns + * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE). + * + * Overriden by each implementation. + * + * @param input the string to analyze. + * @param length the length of the string in bytes. + * @return the detected encoding type + */ +simdutf_warn_unused int detect_encodings(const char * input, size_t length) noexcept; +simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * input, size_t length) noexcept { + return detect_encodings(reinterpret_cast(input), length); +} + + +/** + * Validate the UTF-8 string. + * + * Overridden by each implementation. + * + * @param buf the UTF-8 string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid UTF-8. + */ +simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept; + +/** + * Validate the UTF-8 string and stop on error. + * + * Overridden by each implementation. + * + * @param buf the UTF-8 string to validate. + * @param len the length of the string in bytes. + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ +simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept; + +/** + * Validate the ASCII string. + * + * Overridden by each implementation. + * + * @param buf the ASCII string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid ASCII. + */ +simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept; + +/** + * Validate the ASCII string and stop on error. + * + * Overridden by each implementation. + * + * @param buf the ASCII string to validate. + * @param len the length of the string in bytes. + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ +simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept; + +/** + * Validate the UTF-16LE string. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return true if and only if the string is valid UTF-16LE. + */ +simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexcept; + +/** + * Validate the UTF-16BE string. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16BE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return true if and only if the string is valid UTF-16BE. + */ +simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept; + +/** + * Validate the UTF-16LE string and stop on error. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ +simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) noexcept; + +/** + * Validate the UTF-16BE string and stop on error. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16BE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ +simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept; + +/** + * Validate the UTF-32LE string. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-32LE string to validate. + * @param len the length of the string in number of 4-byte words (char32_t). + * @return true if and only if the string is valid UTF-32LE. + */ +simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept; + +/** + * Validate the UTF-32LE string and stop on error. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-32LE string to validate. + * @param len the length of the string in number of 4-byte words (char32_t). + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ +simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept; + +/** + * Convert possibly broken UTF-8 string into UTF-16LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + */ +simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept; + +/** + * Convert possibly broken UTF-8 string into UTF-16BE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + */ +simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; + +/** + * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + */ +simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; + +/** + * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + */ +simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; + +/** + * Convert possibly broken UTF-8 string into UTF-32LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return the number of written char32_t; 0 if the input was not valid UTF-8 string + */ +simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept; + +/** + * Convert possibly broken UTF-8 string into UTF-32LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + */ +simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept; + +/** + * Convert valid UTF-8 string into UTF-16LE string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t + */ +simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert valid UTF-8 string into UTF-16BE string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t + */ +simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert valid UTF-8 string into UTF-32LE string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return the number of written char32_t + */ +simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE + */ +simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept; + +/** + * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32LE format. + * + * This function is equivalent to count_utf8 + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return the number of char32_t words required to encode the UTF-8 string as UTF-32LE + */ +simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept; + +/** + * Convert possibly broken UTF-16LE string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ +simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert possibly broken UTF-16BE string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ +simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + */ +simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + */ +simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert valid UTF-16LE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert valid UTF-16BE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16BE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert possibly broken UTF-16LE string into UTF-32LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ +simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert possibly broken UTF-16BE string into UTF-32LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ +simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert possibly broken UTF-16LE string into UTF-32LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + */ +simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert possibly broken UTF-16BE string into UTF-32LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + */ +simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert valid UTF-16LE string into UTF-32LE string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert valid UTF-16BE string into UTF-32LE string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as UTF-8 + */ +simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept; + +/** + * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16BE string as UTF-8 + */ +simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept; + +/** + * Convert possibly broken UTF-32LE string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-32LE string + */ +simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert possibly broken UTF-32LE string into UTF-8 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + */ +simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert valid UTF-32LE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-32LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert possibly broken UTF-32LE string into UTF-16LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-32LE string + */ +simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert possibly broken UTF-32LE string into UTF-16BE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-32LE string + */ +simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert possibly broken UTF-32LE string into UTF-16LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + */ +simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert possibly broken UTF-32LE string into UTF-16BE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + */ +simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert valid UTF-32LE string into UTF-16LE string. + * + * This function assumes that the input string is valid UTF-32LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert valid UTF-32LE string into UTF-16BE string. + * + * This function assumes that the input string is valid UTF-32LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or + * from UTF-16BE to UTF-16LE. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to process + * @param length the length of the string in 2-byte words (char16_t) + * @param output the pointer to buffer that can hold the conversion result + */ +void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept; + +/** + * Compute the number of bytes that this UTF-32LE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @return the number of bytes required to encode the UTF-32LE string as UTF-8 + */ +simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept; + +/** + * Compute the number of two-byte words that this UTF-32LE string would require in UTF-16 format. + * + * This function does not validate the input. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @return the number of bytes required to encode the UTF-32LE string as UTF-16 + */ +simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept; + +/** + * Compute the number of bytes that this UTF-16LE string would require in UTF-32LE format. + * + * This function is equivalent to count_utf16le. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as UTF-32LE + */ +simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept; + +/** + * Compute the number of bytes that this UTF-16BE string would require in UTF-32LE format. + * + * This function is equivalent to count_utf16be. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16BE string as UTF-32LE + */ +simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept; + +/** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ +simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept; + +/** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-16BE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ +simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept; + +/** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return number of code points + */ +simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept; + +/** + * An implementation of simdutf for a particular CPU architecture. + * + * Also used to maintain the currently active implementation. The active implementation is + * automatically initialized on first use to the most advanced implementation supported by the host. + */ +class implementation { +public: + + /** + * The name of this implementation. + * + * const implementation *impl = simdutf::active_implementation; + * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" + */ + virtual const std::string &name() const { return _name; } + + /** + * The description of this implementation. + * + * const implementation *impl = simdutf::active_implementation; + * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" + */ + virtual const std::string &description() const { return _description; } + + /** + * The instruction sets this implementation is compiled against + * and the current CPU match. This function may poll the current CPU/system + * and should therefore not be called too often if performance is a concern. + * + * + * @return true if the implementation can be safely used on the current system (determined at runtime) + */ + bool supported_by_runtime_system() const; + + /** + * This function will try to detect the encoding + * @param input the string to identify + * @param length the length of the string in bytes. + * @return the encoding type detected + */ + virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept; + + /** + * This function will try to detect the possible encodings in one pass + * @param input the string to identify + * @param length the length of the string in bytes. + * @return the encoding type detected + */ + virtual int detect_encodings(const char * input, size_t length) const noexcept = 0; + + /** + * @private For internal implementation use + * + * The instruction sets this implementation is compiled against. + * + * @return a mask of all required `internal::instruction_set::` values + */ + virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }; + + + /** + * Validate the UTF-8 string. + * + * Overridden by each implementation. + * + * @param buf the UTF-8 string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid UTF-8. + */ + simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0; + + /** + * Validate the UTF-8 string and stop on errors. + * + * Overridden by each implementation. + * + * @param buf the UTF-8 string to validate. + * @param len the length of the string in bytes. + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ + simdutf_warn_unused virtual result validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0; + + /** + * Validate the ASCII string. + * + * Overridden by each implementation. + * + * @param buf the ASCII string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid ASCII. + */ + simdutf_warn_unused virtual bool validate_ascii(const char *buf, size_t len) const noexcept = 0; + + /** + * Validate the ASCII string and stop on error. + * + * Overridden by each implementation. + * + * @param buf the ASCII string to validate. + * @param len the length of the string in bytes. + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ + simdutf_warn_unused virtual result validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0; + + /** + * Validate the UTF-16LE string. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return true if and only if the string is valid UTF-16LE. + */ + simdutf_warn_unused virtual bool validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0; + + /** + * Validate the UTF-16BE string. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16BE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return true if and only if the string is valid UTF-16BE. + */ + simdutf_warn_unused virtual bool validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0; + + /** + * Validate the UTF-16LE string and stop on error. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ + simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept = 0; + + /** + * Validate the UTF-16BE string and stop on error. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16BE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ + simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0; + + /** + * Validate the UTF-32LE string. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-32LE string to validate. + * @param len the length of the string in number of 4-byte words (char32_t). + * @return true if and only if the string is valid UTF-32LE. + */ + simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0; + + /** + * Validate the UTF-32LE string and stop on error. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-32LE string to validate. + * @param len the length of the string in number of 4-byte words (char32_t). + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ + simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into UTF-16LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + */ + simdutf_warn_unused virtual size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into UTF-16BE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + */ + simdutf_warn_unused virtual size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ + simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ + simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into UTF-32LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + */ + simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into UTF-32LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + */ + simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0; + + /** + * Convert valid UTF-8 string into UTF-16LE string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t + */ + simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + +/** + * Convert valid UTF-8 string into UTF-16BE string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t + */ + simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + + /** + * Convert valid UTF-8 string into UTF-32LE string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char32_t + */ + simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + + /** + * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format. + * + * This function does not validate the input. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE + */ + simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0; + + /** + * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32LE format. + * + * This function is equivalent to count_utf8. + * + * This function does not validate the input. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return the number of char32_t words required to encode the UTF-8 string as UTF-32LE + */ + simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0; + + /** + * Convert possibly broken UTF-16LE string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ + simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16BE string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16BE string + */ + simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + */ + simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + */ + simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert valid UTF-16LE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert valid UTF-16BE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16BE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16LE string into UTF-32LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ + simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16BE string into UTF-32LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16BE string + */ + simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16LE string into UTF-32LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + */ + simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-16BE string into UTF-32LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + */ + simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + + /** + * Convert valid UTF-16LE string into UTF-32LE string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + + /** + * Convert valid UTF-16LE string into UTF-32BE string. + * + * This function assumes that the input string is valid UTF-16BE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; + + /** + * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as UTF-8 + */ + simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; + + /** + * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16BE string as UTF-8 + */ + simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; + + /** + * Convert possibly broken UTF-32LE string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-32LE string + */ + simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-32LE string into UTF-8 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + */ + simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert valid UTF-32LE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-32LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-32LE string into UTF-16LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-32LE string + */ + simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-32LE string into UTF-16BE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-32LE string + */ + simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-32LE string into UTF-16LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + */ + simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + + /** + * Convert possibly broken UTF-32LE string into UTF-16BE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + */ + simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + + /** + * Convert valid UTF-32LE string into UTF-16LE string. + * + * This function assumes that the input string is valid UTF-32LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + + /** + * Convert valid UTF-32LE string into UTF-16BE string. + * + * This function assumes that the input string is valid UTF-32LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + + /** + * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or + * from UTF-16BE to UTF-16LE. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to process + * @param length the length of the string in 2-byte words (char16_t) + * @param output the pointer to buffer that can hold the conversion result + */ + virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0; + + /** + * Compute the number of bytes that this UTF-32LE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @return the number of bytes required to encode the UTF-32LE string as UTF-8 + */ + simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; + + /** + * Compute the number of two-byte words that this UTF-32LE string would require in UTF-16 format. + * + * This function does not validate the input. + * + * @param input the UTF-32LE string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @return the number of bytes required to encode the UTF-32LE string as UTF-16 + */ + simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; + + /* + * Compute the number of bytes that this UTF-16LE string would require in UTF-32LE format. + * + * This function is equivalent to count_utf16le. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as UTF-32LE + */ + simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; + + /* + * Compute the number of bytes that this UTF-16BE string would require in UTF-32LE format. + * + * This function is equivalent to count_utf16be. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16BE string as UTF-32LE + */ + simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; + + /** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ + simdutf_warn_unused virtual size_t count_utf16le(const char16_t * input, size_t length) const noexcept = 0; + + /** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-16BE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16BE string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ + simdutf_warn_unused virtual size_t count_utf16be(const char16_t * input, size_t length) const noexcept = 0; + + + /** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return number of code points + */ + simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0; + + + +protected: + /** @private Construct an implementation with the given name and description. For subclasses. */ + simdutf_really_inline implementation( + std::string name, + std::string description, + uint32_t required_instruction_sets + ) : + _name(name), + _description(description), + _required_instruction_sets(required_instruction_sets) + { + } + virtual ~implementation()=default; + +private: + /** + * The name of this implementation. + */ + const std::string _name; + + /** + * The description of this implementation. + */ + const std::string _description; + + /** + * Instruction sets required for this implementation. + */ + const uint32_t _required_instruction_sets; +}; + +/** @private */ +namespace internal { + +/** + * The list of available implementations compiled into simdutf. + */ +class available_implementation_list { +public: + /** Get the list of available implementations compiled into simdutf */ + simdutf_really_inline available_implementation_list() {} + /** Number of implementations */ + size_t size() const noexcept; + /** STL const begin() iterator */ + const implementation * const *begin() const noexcept; + /** STL const end() iterator */ + const implementation * const *end() const noexcept; + + /** + * Get the implementation with the given name. + * + * Case sensitive. + * + * const implementation *impl = simdutf::available_implementations["westmere"]; + * if (!impl) { exit(1); } + * if (!imp->supported_by_runtime_system()) { exit(1); } + * simdutf::active_implementation = impl; + * + * @param name the implementation to find, e.g. "westmere", "haswell", "arm64" + * @return the implementation, or nullptr if the parse failed. + */ + const implementation * operator[](const std::string &name) const noexcept { + for (const implementation * impl : *this) { + if (impl->name() == name) { return impl; } + } + return nullptr; + } + + /** + * Detect the most advanced implementation supported by the current host. + * + * This is used to initialize the implementation on startup. + * + * const implementation *impl = simdutf::available_implementation::detect_best_supported(); + * simdutf::active_implementation = impl; + * + * @return the most advanced supported implementation for the current host, or an + * implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported + * implementation. Will never return nullptr. + */ + const implementation *detect_best_supported() const noexcept; +}; + +template +class atomic_ptr { +public: + atomic_ptr(T *_ptr) : ptr{_ptr} {} + +#if defined(SIMDUTF_NO_THREADS) + operator const T*() const { return ptr; } + const T& operator*() const { return *ptr; } + const T* operator->() const { return ptr; } + + operator T*() { return ptr; } + T& operator*() { return *ptr; } + T* operator->() { return ptr; } + atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; } + +#else + operator const T*() const { return ptr.load(); } + const T& operator*() const { return *ptr; } + const T* operator->() const { return ptr.load(); } + + operator T*() { return ptr.load(); } + T& operator*() { return *ptr; } + T* operator->() { return ptr.load(); } + atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; } + +#endif + +private: +#if defined(SIMDUTF_NO_THREADS) + T* ptr; +#else + std::atomic ptr; +#endif +}; + +} // namespace internal + +/** + * The list of available implementations compiled into simdutf. + */ +extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations; + +/** + * The active implementation. + * + * Automatically initialized on first use to the most advanced implementation supported by this hardware. + */ +extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr active_implementation; + +} // namespace simdutf + +#endif // SIMDUTF_IMPLEMENTATION_H +/* end file include/simdutf/implementation.h */ + + +// Implementation-internal files (must be included before the implementations themselves, to keep +// amalgamation working--otherwise, the first time a file is included, it might be put inside the +// #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't +// compile unless that implementation is turned on). + + +SIMDUTF_POP_DISABLE_WARNINGS + +#endif // SIMDUTF_H +/* end file include/simdutf.h */ diff --git a/node.gyp b/node.gyp index 448cb8a8c7cd49f..21182c56b91ac99 100644 --- a/node.gyp +++ b/node.gyp @@ -448,13 +448,15 @@ 'include_dirs': [ 'src', - '<(SHARED_INTERMEDIATE_DIR)' # for node_natives.h + '<(SHARED_INTERMEDIATE_DIR)', # for node_natives.h + 'deps/simdutf' ], 'dependencies': [ 'deps/base64/base64.gyp:base64', 'deps/googletest/googletest.gyp:gtest_prod', 'deps/histogram/histogram.gyp:histogram', 'deps/uvwasi/uvwasi.gyp:uvwasi', + 'deps/simdutf/simdutf.gyp:simdutf', ], 'sources': [ @@ -649,6 +651,7 @@ 'src/pipe_wrap.h', 'src/req_wrap.h', 'src/req_wrap-inl.h', + 'src/simdutf.h', 'src/spawn_sync.h', 'src/stream_base.h', 'src/stream_base-inl.h', diff --git a/tools/dep_updaters/README.md b/tools/dep_updaters/README.md index afae95302766e5f..21ce2900459a202 100644 --- a/tools/dep_updaters/README.md +++ b/tools/dep_updaters/README.md @@ -35,3 +35,28 @@ been created with the changes), do the following: example). [`c61870c`]: https://github.com/nodejs/node/commit/c61870c376e2f5b0dbaa939972c46745e21cdbdd + +## simdutf + +The `update-simdutf.sh` script takes the target version to update as its only +argument, downloads it from the [GitHub repo](https://github.com/simdutf/simdutf) +and uses it to replace the contents of `deps/uv/`. The contents are replaced +entirely except for the `*.gyp` and `*.gypi` build files, which are part of the +Node.js build definitions and are not present in the upstream repo. + +For example, in order to update to version `1.44.2`, the following command can +be run: + +```bash +./tools/dep_updaters/update-simdutf.sh 2.0.6 +``` + +Once the script has run (either manually, or by CI in which case a PR will have +been created with the changes), do the following: + +1. Check the [changelog](https://github.com/simdutf/simdutf/releases/tag/v2.0.3) for + things that might require changes in Node.js. +2. If necessary, update `common.gypi` and `simdutf.gyp` with build-related changes. +3. Check that Node.js compiles without errors and the tests pass. +4. Create a commit for the update and in the commit message include the + important/relevant items from the changelog. diff --git a/tools/dep_updaters/update-simdutf.sh b/tools/dep_updaters/update-simdutf.sh new file mode 100755 index 000000000000000..f6e28c73ad79b4d --- /dev/null +++ b/tools/dep_updaters/update-simdutf.sh @@ -0,0 +1,49 @@ +#!/bin/sh +set -e +# Shell script to update simdutf in the source tree to a specific version + +BASE_DIR=$(cd "$(dirname "$0")/../.." && pwd) +DEPS_DIR="$BASE_DIR/deps" +SIMDUTF_VERSION=$1 + +if [ "$#" -le 0 ]; then + echo "Error: please provide an simdutf version to update to" + echo " e.g. $0 2.0.3" + exit 1 +fi + +echo "Making temporary workspace..." + +WORKSPACE=$(mktemp -d 2> /dev/null || mktemp -d -t 'tmp') + +cleanup () { + EXIT_CODE=$? + [ -d "$WORKSPACE" ] && rm -rf "$WORKSPACE" + exit $EXIT_CODE +} + +trap cleanup INT TERM EXIT + +SIMDUTF_REF="v$SIMDUTF_VERSION" +SIMDUTF_ZIP="simdutf-$SIMDUTF_VERSION.zip" + +cd "$WORKSPACE" + +echo "Fetching simdutf source archive..." +curl -sL -o "$SIMDUTF_ZIP" "https://github.com/simdutf/simdutf/releases/download/$SIMDUTF_REF/singleheader.zip" +tar -xf "$SIMDUTF_ZIP" +rm "$SIMDUTF_ZIP" +rm ./*_demo.cpp + +echo "Replacing existing simdutf (except GYP build files)" +mv "$DEPS_DIR/simdutf/"*.gyp "$DEPS_DIR/simdutf/README.md" "$WORKSPACE/" +rm -rf "$DEPS_DIR/simdutf" +mv "$WORKSPACE" "$DEPS_DIR/simdutf" + +echo "All done!" +echo "" +echo "Please git add simdutf, commit the new version:" +echo "" +echo "$ git add -A deps/simdutf" +echo "$ git commit -m \"deps: update simdutf to $SIMDUTF_VERSION\"" +echo "" diff --git a/tools/license-builder.sh b/tools/license-builder.sh index 96b25d9781645ab..606ddcccd7fe056 100755 --- a/tools/license-builder.sh +++ b/tools/license-builder.sh @@ -77,6 +77,8 @@ licenseText="$(sed -e '/You should have received a copy of the CC0/,$d' -e 's/^\ addlicense "SipHash" "deps/v8/src/third_party/siphash" "$licenseText" licenseText="$(sed -e '/The data format used by the zlib library/,$d' -e 's/^\/\* *//' -e 's/^ *//' "${rootdir}/deps/zlib/zlib.h")" addlicense "zlib" "deps/zlib" "$licenseText" +addlicense "simdutf" "deps/simdutf" "$licenseText" +licenseText="$(curl -sL https://raw.githubusercontent.com/simdutf/simdutf/HEAD/LICENSE-MIT)" # npm licenseText="$(cat "${rootdir}/deps/npm/LICENSE")"