Skip to content

Commit

Permalink
Improve utf8_to_utf16 speed for common path (#892)
Browse files Browse the repository at this point in the history
* Improve utf8_to_utf16 speed for common path

Input to the UTF-8 to UTF-16 conversion consists mostly of single-byte code points (e.g. when parsing JSON bodies). This allows running the single-byte conversion in a tight loop that is only interrupted when multi-byte handling becomes necessary.

Measurements for a very long string showed a ~30% speed improvement.

* Use UtilCharInternal_t as the character type to avoid issues with the platform-dependent definition of char
  • Loading branch information
chris0x44 authored and BillyONeal committed Oct 26, 2018
1 parent f4c08f0 commit b3a7141
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 19 deletions.
59 changes: 40 additions & 19 deletions Release/src/utilities/asyncrt_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,19 +347,33 @@ const std::error_category & __cdecl linux_category()
#define H_SURROGATE_END 0xDBFF
#define SURROGATE_PAIR_START 0x10000

// Create a dedicated type for characters to avoid the issue
// of different platforms defaulting char to be either signed
// or unsigned.
// The UTF-8 helpers in this file detect single-byte code points
// with `> 0` comparisons on values of this type, so the type is
// pinned to signed char rather than left as plain char.
using UtilCharInternal_t = signed char;


inline size_t count_utf8_to_utf16(const std::string& s)
{
const size_t sSize = s.size();
const char* const sData = s.data();
auto sData = reinterpret_cast<const UtilCharInternal_t* const>(s.data());
size_t result{ sSize };

for (size_t index = 0; index < sSize;)
{
const char c{ sData[index++] };
if ((c & BIT8) == 0)
if( sData[index] > 0 )
{
continue;
// use fast inner loop to skip single byte code points (which are
// expected to be the most frequent)
while ((++index < sSize) && (sData[index] > 0))
;

if (index >= sSize) break;
}

// start special handling for multi-byte code points
const UtilCharInternal_t c{ sData[index++] };

if ((c & BIT7) == 0)
{
throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
Expand All @@ -371,7 +385,7 @@ inline size_t count_utf8_to_utf16(const std::string& s)
throw std::range_error("UTF-8 string is missing bytes in character");
}

const char c2{ sData[index++] };
const UtilCharInternal_t c2{ sData[index++] };
if ((c2 & 0xC0) != BIT8)
{
throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
Expand All @@ -387,8 +401,8 @@ inline size_t count_utf8_to_utf16(const std::string& s)
throw std::range_error("UTF-8 string is missing bytes in character");
}

const char c2{ sData[index++] };
const char c3{ sData[index++] };
const UtilCharInternal_t c2{ sData[index++] };
const UtilCharInternal_t c3{ sData[index++] };
if (((c2 | c3) & 0xC0) != BIT8)
{
throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
Expand All @@ -403,9 +417,9 @@ inline size_t count_utf8_to_utf16(const std::string& s)
throw std::range_error("UTF-8 string is missing bytes in character");
}

const char c2{ sData[index++] };
const char c3{ sData[index++] };
const char c4{ sData[index++] };
const UtilCharInternal_t c2{ sData[index++] };
const UtilCharInternal_t c3{ sData[index++] };
const UtilCharInternal_t c4{ sData[index++] };
if (((c2 | c3 | c4) & 0xC0) != BIT8)
{
throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
Expand All @@ -427,21 +441,21 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
{
// Save repeated heap allocations, use the length of resulting sequence.
const size_t srcSize = s.size();
const std::string::value_type* const srcData = &s[0];
auto srcData = reinterpret_cast<const UtilCharInternal_t* const>(s.data());
utf16string dest(count_utf8_to_utf16(s), L'\0');
utf16string::value_type* const destData = &dest[0];
size_t destIndex = 0;

for (size_t index = 0; index < srcSize; ++index)
{
std::string::value_type src = srcData[index];
UtilCharInternal_t src = srcData[index];
switch (src & 0xF0)
{
case 0xF0: // 4 byte character, 0x10000 to 0x10FFFF
{
const char c2{ srcData[++index] };
const char c3{ srcData[++index] };
const char c4{ srcData[++index] };
const UtilCharInternal_t c2{ srcData[++index] };
const UtilCharInternal_t c3{ srcData[++index] };
const UtilCharInternal_t c4{ srcData[++index] };
uint32_t codePoint = ((src & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
if (codePoint >= SURROGATE_PAIR_START)
{
Expand All @@ -464,20 +478,27 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
break;
case 0xE0: // 3 byte character, 0x800 to 0xFFFF
{
const char c2{ srcData[++index] };
const char c3{ srcData[++index] };
const UtilCharInternal_t c2{ srcData[++index] };
const UtilCharInternal_t c3{ srcData[++index] };
destData[destIndex++] = static_cast<utf16string::value_type>(((src & LOW_4BITS) << 12) | ((c2 & LOW_6BITS) << 6) | (c3 & LOW_6BITS));
}
break;
case 0xD0: // 2 byte character, 0x80 to 0x7FF
case 0xC0:
{
const char c2{ srcData[++index] };
const UtilCharInternal_t c2{ srcData[++index] };
destData[destIndex++] = static_cast<utf16string::value_type>(((src & LOW_5BITS) << 6) | (c2 & LOW_6BITS));
}
break;
default: // single byte character, 0x0 to 0x7F
destData[destIndex++] = static_cast<utf16string::value_type>(src);
// try to use a fast inner loop for following single byte characters,
// since they are quite probable
do
{
destData[destIndex++] = static_cast<utf16string::value_type>(srcData[index++]);
} while (index < srcSize && srcData[index] > 0);
// adjust index since it will be incremented by the for loop
--index;
}
}
return dest;
Expand Down
25 changes: 25 additions & 0 deletions Release/tests/functional/utils/strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,31 @@ TEST(utf8_to_utf16)
#else
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
#endif


// 1 byte character followed by two 4 byte characters
input.clear();
input.push_back( 51u); // 00110011
// U+100000
input.push_back(244u); // 11110100
input.push_back(128u); // 10000000
input.push_back(128u); // 10000000
input.push_back(128u); // 10000000
// U+10FFFF
input.push_back(244u); // 11110100
input.push_back(143u); // 10001111
input.push_back(191u); // 10111111
input.push_back(191u); // 10111111
result = utility::conversions::utf8_to_utf16(input);
#if defined(__GLIBCXX__)
VERIFY_ARE_EQUAL(51, result[0]);
VERIFY_ARE_EQUAL(56256, result[1]);
VERIFY_ARE_EQUAL(56320, result[2]);
VERIFY_ARE_EQUAL(56319, result[3]);
VERIFY_ARE_EQUAL(57343, result[4]);
#else
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
#endif
}

TEST(utf16_to_utf8_errors)
Expand Down

0 comments on commit b3a7141

Please sign in to comment.