Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
renatoGarcia committed Jun 18, 2024
1 parent e7ef9b2 commit b72bf11
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 138 deletions.
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ IceCream-Cpp is a little (single header) library to help with the print debuggin
* [line_wrap_width](#line_wrap_width)
* [include_context](#include_context)
* [context_delimiter](#context_delimiter)
* [Character Encoding](#character-encoding)
* [Printing logic](#printing-logic)
* [C strings](#c-strings)
* [Pointer like types](#pointer-like-types)
Expand Down Expand Up @@ -539,6 +540,32 @@ The string separating the context text from the variables values. Default value
auto context_delimiter(std::string const& value) -> IcecreamAPI&;
```

### Character Encoding

Character encoding in C++ is complicated. Neither `std::string` nor `char*` strings carry
any requirement or information about which encoding they are using, while a string
literal, as in:

```C++
auto* const str = "foo";
```

will have a well-defined, but implementation-defined, encoding. Each system and compiler
can potentially use a different encoding.

The `std::wstring` and `wchar_t*` strings are even a little more complicated. In addition
to all the considerations above, the bit size of their [code
unit](https://en.cppreference.com/w/cpp/language/charset#Code_unit_and_literal_encoding)
is implementation-defined. A `wchar_t` has 32 bits on Linux but 16 bits on Windows, for
example.

The new `char8_t`, `char16_t` and `char32_t` types are better specified: they must be
Unicode encoded, each with its respective code unit width.

On the other end, to print `std::cout`

On IceCream-Cpp

### Printing logic

When printing a type `T`, the precedence is to use an overloaded function
Expand Down
212 changes: 74 additions & 138 deletions icecream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
#include <string>
#include <tuple>
#include <type_traits>
#include <uchar.h>
#include <utility>
#include <valarray>
#include <vector>
Expand Down Expand Up @@ -553,176 +552,113 @@ namespace icecream{ namespace detail

// -------------------------------------------------- Char encoding

#if defined(__cpp_char8_t)
// A public domain branchless UTF-8 decoder by Christopher Wellons:
// https://github.com/skeeto/branchless-utf8
/* Decode the next character, c, from buf, reporting errors in e.
 *
 * Since this is a branchless decoder, four bytes will be read from the
 * buffer regardless of the actual length of the next character. This
 * means the buffer _must_ have at least three bytes of zero padding
 * following the end of the data stream.
 *
 * Errors are reported in e, which will be non-zero if the parsed
 * character was somehow invalid: invalid byte sequence, non-canonical
 * encoding, or a surrogate half.
 *
 * The function returns a pointer to the next character. When an error
 * occurs, this pointer will be a guess that depends on the particular
 * error, but it will always advance at least one byte.
 *
 * Parameters:
 *   buf  First code unit of the sequence to decode; buf[0] .. buf[3] are
 *        always read.
 *   c    Output: receives the decoded value.
 *   e    Output: receives zero on success, non-zero on a decoding error.
 */
inline auto utf8_decode(char8_t const* buf, char32_t* c, int* e) -> char8_t const*
{
// Sequence length, indexed by the top five bits of the first byte
// (s[0] >> 3). A 0 entry marks a byte that cannot start a sequence.
static const char lengths[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
};

// All tables below are indexed by the sequence length (0..4).
// masks:  payload bits kept from the first byte.
// mins:   smallest value accepted for that length; anything below it is
//         flagged as non-canonical. For length 0 the first byte contributes
//         no bits (mask 0x00), so the assembled value stays below 1 << 18
//         and the 4194304 entry always raises the error flag.
// shiftc: right shift that drops the bits loaded from bytes beyond the
//         actual sequence length.
// shifte: right shift that drops the tail-byte checks belonging to bytes
//         beyond the actual sequence length.
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
static const int shiftc[] = {0, 18, 12, 6, 0};
static const int shifte[] = {0, 6, 4, 2, 0};

char8_t const* s = buf;
int len = lengths[s[0] >> 3];

/* Compute the pointer to the next character early so that the next
* iteration can start working on the next character. Neither Clang
* nor GCC figure out this reordering on their own.
*/
// `!len` is 1 only when len == 0, so an invalid first byte still advances
// the cursor by one.
char8_t const* next = s + len + !len;

/* Assume a four-byte character and load four bytes. Unused bits are
* shifted out.
*/
*c = (uint32_t)(s[0] & masks[len]) << 18;
*c |= (uint32_t)(s[1] & 0x3f) << 12;
*c |= (uint32_t)(s[2] & 0x3f) << 6;
*c |= (uint32_t)(s[3] & 0x3f) << 0;
*c >>= shiftc[len];

/* Accumulate the various error conditions. */
*e = (*c < mins[len]) << 6; // non-canonical encoding
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
*e |= (*c > 0x10FFFF) << 8; // out of range?
// Gather the top two bits of each tail byte into the low six bits of *e.
*e |= (s[1] & 0xc0) >> 2;
*e |= (s[2] & 0xc0) >> 4;
*e |= (s[3] ) >> 6;
// 0x2a is 0b101010: the XOR clears each two-bit pair that equals 0b10.
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];

return next;
}


// Decode a UTF-16 string into UTF-32 code points.
//
// Well-formed surrogate pairs are combined into a single code point. Any
// unpaired surrogate (a high surrogate not followed by a low one, or a lone
// low surrogate) is replaced by U+FFFD REPLACEMENT CHARACTER and decoding
// continues with the next code unit.
inline auto to_utf32(std::u16string const& input) -> std::u32string
{
    auto const surrogate_mask = char32_t{0xF800};   // selects any surrogate
    auto const half_mask = char32_t{0xFC00};        // selects high vs. low half
    auto const high_base = char32_t{0xD800};
    auto const low_base = char32_t{0xDC00};
    auto const replacement = char32_t{0xFFFD};

    auto result = std::u32string{};
    result.reserve(input.size());  // never more code points than code units

    auto it = input.begin();
    auto const end = input.end();
    while (it != end)
    {
        auto const unit = static_cast<char32_t>(*it);

        if ((unit & surrogate_mask) != high_base)
        {
            // Plain BMP code unit, it is its own code point.
            result.push_back(unit);
            ++it;
            continue;
        }

        auto const it_next = it + 1;
        auto const has_low_half =
            (unit & half_mask) == high_base
            && it_next != end
            && (static_cast<char32_t>(*it_next) & half_mask) == low_base;

        if (has_low_half)
        {
            auto const low = static_cast<char32_t>(*it_next);
            auto const codepoint = static_cast<char32_t>(
                0x10000u + ((unit - high_base) << 10) + (low - low_base)
            );
            result.push_back(codepoint);
            it += 2;
        }
        else
        {
            // Encoding error, print the REPLACEMENT CHARACTER
            result.push_back(replacement);
            ++it;
        }
    }

    return result;
}
#endif

// Overload of the cxrtomb dispatch family for UTF-16 code units: forwards
// the arguments unchanged to c16rtomb and hands back its result.
inline auto cxrtomb(char* s, char16_t c, std::mbstate_t* ps) -> std::size_t
{
    std::size_t const written = c16rtomb(s, c, ps);
    return written;
}

// Overload of the cxrtomb dispatch family for UTF-32 code units: forwards
// the arguments unchanged to c32rtomb and hands back its result.
inline auto cxrtomb(char* s, char32_t c, std::mbstate_t* ps) -> std::size_t
{
    std::size_t const written = c32rtomb(s, c, ps);
    return written;
}

// Encode a UTF-32 string as UTF-8.
//
// Every valid Unicode scalar value is written as its 1 to 4 byte UTF-8
// sequence. Values that are not scalar values (the surrogate range
// U+D800..U+DFFF, or anything above U+10FFFF) are an encoding error and are
// written as U+FFFD REPLACEMENT CHARACTER, whose UTF-8 form is EF BF BD.
inline auto to_utf8_string(std::u32string const& input) -> std::string
{
    auto result = std::string{};
    result.reserve(input.size());  // lower bound, exact for ASCII input

    for (auto const code : input)
    {
        auto const is_surrogate = (code >= 0xD800 && code <= 0xDFFF);

        if (code < 0x80)
        {
            result.push_back(static_cast<char>(code));                          // 0xxxxxxx
        }
        else if (code < 0x800)  // 00000yyy yyxxxxxx
        {
            result.push_back(static_cast<char>(0xC0 | (code >> 6)));            // 110yyyyy
            result.push_back(static_cast<char>(0x80 | (code & 0x3F)));          // 10xxxxxx
        }
        else if (is_surrogate || code > 0x10FFFF)
        {
            // Encoding error, print the REPLACEMENT CHARACTER (U+FFFD)
            result.append("\xEF\xBF\xBD");
        }
        else if (code < 0x10000)  // zzzzyyyy yyxxxxxx
        {
            result.push_back(static_cast<char>(0xE0 | (code >> 12)));           // 1110zzzz
            result.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3F)));   // 10yyyyyy
            result.push_back(static_cast<char>(0x80 | (code & 0x3F)));          // 10xxxxxx
        }
        else  // 000uuuuu zzzzyyyy yyxxxxxx
        {
            result.push_back(static_cast<char>(0xF0 | (code >> 18)));           // 11110uuu
            result.push_back(static_cast<char>(0x80 | ((code >> 12) & 0x3F)));  // 10uuzzzz
            result.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3F)));   // 10yyyyyy
            result.push_back(static_cast<char>(0x80 | (code & 0x3F)));          // 10xxxxxx
        }
    }

    return result;
}

// Narrow strings are passed through untouched: the result is a plain copy
// of the input.
inline auto to_narrow_multibyte(std::string const& str) -> std::string
{
    auto copy = str;
    return copy;
}

// Convert a wide string to a narrow multibyte string using std::wcrtomb,
// i.e. according to the currently installed LC_CTYPE locale.
//
// Only the bytes actually produced for each wide character are appended.
// A wide character that cannot be represented in the current locale is
// written as '?' and the conversion state is reset, so the remaining
// characters are still converted.
inline auto to_narrow_multibyte(std::wstring const& str) -> std::string
{
    auto result = std::string{};
    result.reserve(str.size());  // lower bound, exact for single byte output

    auto state = std::mbstate_t{};
    for (auto const wc : str)
    {
        // MB_LEN_MAX is the largest multibyte character of any locale.
        char mb[MB_LEN_MAX] = {};
        auto const written = std::wcrtomb(mb, wc, &state);

        if (written == static_cast<std::size_t>(-1))
        {
            // wcrtomb leaves the state unspecified after a failure.
            state = std::mbstate_t{};
            result.push_back('?');
        }
        else
        {
            result.append(mb, written);
        }
    }

    return result;
}

// UTF-32 input: the narrow result is whatever to_utf8_string produces for it.
inline auto to_narrow_multibyte(std::u32string const& str) -> std::string
{
    auto utf8 = to_utf8_string(str);
    return utf8;
}

// UTF-16 input: first widen to UTF-32 with to_utf32, then reuse the
// to_narrow_multibyte overload that accepts the UTF-32 string.
inline auto to_narrow_multibyte(std::u16string const& str) -> std::string
{
    auto const code_points = to_utf32(str);
    return to_narrow_multibyte(code_points);
}

#if defined(__cpp_char8_t)
// UTF-8 input: copy the code units one to one into a narrow string.
//
// The whole [begin, end) range is copied rather than constructing from a
// C-string pointer, so a string holding embedded NUL code units is not
// truncated at the first of them.
inline auto to_narrow_multibyte(std::u8string const& str) -> std::string
{
    return std::string(str.begin(), str.end());
}
#endif

Expand Down
9 changes: 9 additions & 0 deletions tests/test_c++11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,15 @@ TEST_CASE("std_string")
REQUIRE(str == "ic| v0: \"u16str \xce\xb1\"\n");
}

{
auto str = std::string{};
icecream::ic.output(str);

auto v0 = std::u16string {u"u16str \U0001D11E"};
IC(v0);
REQUIRE(str == "ic| v0: \"u16str \xf0\x9d\x84\x9e\"\n");
}

{
auto str = std::string{};
icecream::ic.output(str);
Expand Down

0 comments on commit b72bf11

Please sign in to comment.