Skip to content

Commit b72bf11

Browse files
committed
.
1 parent e7ef9b2 commit b72bf11

File tree

3 files changed

+110
-138
lines changed

3 files changed

+110
-138
lines changed

README.md

+27
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ IceCream-Cpp is a little (single header) library to help with the print debuggin
2323
* [line_wrap_width](#line_wrap_width)
2424
* [include_context](#include_context)
2525
* [context_delimiter](#context_delimiter)
26+
* [Character Encoding](#character-encoding)
2627
* [Printing logic](#printing-logic)
2728
* [C strings](#c-strings)
2829
* [Pointer like types](#pointer-like-types)
@@ -539,6 +540,32 @@ The string separating the context text from the variables values. Default value
539540
auto context_delimiter(std::string const& value) -> IcecreamAPI&;
540541
```
541542

543+
### Character Encoding
544+
545+
Character encoding in C++ is complicated. `std::string` and `char*` strings don't
546+
have any requirements or information about what encoding they are using, while a string
547+
literal, like in:
548+
549+
```C++
550+
auto* const str = "foo";
551+
```
552+
553+
will have a well defined, but implementation-defined encoding. Any system and compiler can
554+
potentially use a distinct encoding.
555+
556+
The `std::wstring` and `wchar_t*` are even a little more complicated. Besides all the
557+
above considerations, the bit size of their [code
558+
unit](https://en.cppreference.com/w/cpp/language/charset#Code_unit_and_literal_encoding)
559+
is implementation-defined. A `wchar_t` has 32 bits on Linux but 16 bits on Windows, for
560+
example.
561+
562+
The new `char8_t`, `char16_t` and `char32_t` are better specified: they must be Unicode
563+
encoded with their respective code unit width.
564+
565+
On the other end, to print `std::cout`
566+
567+
On IceCream-Cpp
568+
542569
### Printing logic
543570

544571
When printing a type `T`, the precedence is to use an overloaded function

icecream.hpp

+74-138
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@
4848
#include <string>
4949
#include <tuple>
5050
#include <type_traits>
51-
#include <uchar.h>
5251
#include <utility>
5352
#include <valarray>
5453
#include <vector>
@@ -553,176 +552,113 @@ namespace icecream{ namespace detail
553552

554553
// -------------------------------------------------- Char encoding
555554

556-
#if defined(__cpp_char8_t)
557-
// A public domain branchless UTF-8 decoder by Christopher Wellons:
558-
// https://github.com/skeeto/branchless-utf8
559-
/* Decode the next character, c, from s, reporting errors in e.
560-
*
561-
* Since this is a branchless decoder, four bytes will be read from the
562-
* buffer regardless of the actual length of the next character. This
563-
* means the buffer _must_ have at least three bytes of zero padding
564-
* following the end of the data stream.
565-
*
566-
* Errors are reported in e, which will be non-zero if the parsed
567-
* character was somehow invalid: invalid byte sequence, non-canonical
568-
* encoding, or a surrogate half.
569-
*
570-
* The function returns a pointer to the next character. When an error
571-
* occurs, this pointer will be a guess that depends on the particular
572-
* error, but it will always advance at least one byte.
573-
*/
574-
inline auto utf8_decode(char8_t const* buf, char32_t* c, int* e) -> char8_t const*
575-
{
576-
static const char lengths[] = {
577-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
578-
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
579-
};
580-
581-
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
582-
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
583-
static const int shiftc[] = {0, 18, 12, 6, 0};
584-
static const int shifte[] = {0, 6, 4, 2, 0};
585-
586-
char8_t const* s = buf;
587-
int len = lengths[s[0] >> 3];
588-
589-
/* Compute the pointer to the next character early so that the next
590-
* iteration can start working on the next character. Neither Clang
591-
* nor GCC figure out this reordering on their own.
592-
*/
593-
char8_t const* next = s + len + !len;
594-
595-
/* Assume a four-byte character and load four bytes. Unused bits are
596-
* shifted out.
597-
*/
598-
*c = (uint32_t)(s[0] & masks[len]) << 18;
599-
*c |= (uint32_t)(s[1] & 0x3f) << 12;
600-
*c |= (uint32_t)(s[2] & 0x3f) << 6;
601-
*c |= (uint32_t)(s[3] & 0x3f) << 0;
602-
*c >>= shiftc[len];
603-
604-
/* Accumulate the various error conditions. */
605-
*e = (*c < mins[len]) << 6; // non-canonical encoding
606-
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
607-
*e |= (*c > 0x10FFFF) << 8; // out of range?
608-
*e |= (s[1] & 0xc0) >> 2;
609-
*e |= (s[2] & 0xc0) >> 4;
610-
*e |= (s[3] ) >> 6;
611-
*e ^= 0x2a; // top two bits of each tail byte correct?
612-
*e >>= shifte[len];
613-
614-
return next;
615-
}
616-
617-
618-
inline auto to_utf32(std::u8string const& s) -> std::u32string
555+
// Convert a UTF-16 encoded string to UTF-32.
//
// Code units outside the surrogate range are copied unchanged. A high
// surrogate immediately followed by a low surrogate is combined into a single
// code point. Any other surrogate (unpaired high, or stray low) is replaced by
// U+FFFD REPLACEMENT CHARACTER and decoding resumes at the next code unit.
inline auto to_utf32(std::u16string const& input) -> std::u32string
{
    auto result = std::u32string{};

    // Each output code point consumes at least one input code unit, so the
    // input length is an upper bound and avoids reallocation while appending.
    result.reserve(input.size());

    auto it = input.begin();
    while (it != input.end())
    {
        auto const it_next = it + 1;
        if ((*it - 0xD800u) >= 2048u) // is not surrogate
        {
            result.push_back(*it);
            ++it;
        }
        else if (
            (*it & 0xFFFFFC00u) == 0xD800u // is high surrogate
            && it_next != input.end()
            && (*it_next & 0xFFFFFC00u) == 0xDC00u // is low surrogate
        ){
            auto const high = static_cast<char32_t>(*it);
            auto const low = static_cast<char32_t>(*it_next);

            // 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000: removes both
            // surrogate biases and adds the supplementary plane offset at once.
            auto const codepoint =
                static_cast<char32_t>((high << 10) + low - 0x35FDC00u);
            result.push_back(codepoint);
            it += 2;
        }
        else
        {
            // Encoding error, print the REPLACEMENT CHARACTER
            result.push_back(0xFFFD);
            ++it;
        }
    }

    return result;
}
662-
#endif
663-
664-
inline auto cxrtomb(char* s, char16_t c, std::mbstate_t* ps) -> std::size_t
665-
{
666-
return c16rtomb(s, c, ps);
667-
}
668-
669-
inline auto cxrtomb(char* s, char32_t c, std::mbstate_t* ps) -> std::size_t
670-
{
671-
return c32rtomb(s, c, ps);
672-
}
673589

674-
inline auto cxrtomb(char* s, wchar_t c, std::mbstate_t* ps) -> std::size_t
590+
// Encode a UTF-32 string as UTF-8.
//
// Each valid Unicode scalar value is emitted as its 1 to 4 byte UTF-8
// sequence. Values that are not Unicode scalar values, i.e. surrogate code
// points (U+D800..U+DFFF) and anything above U+10FFFF, are replaced by
// U+FFFD REPLACEMENT CHARACTER, so the output is always well-formed UTF-8.
inline auto to_utf8_string(std::u32string const& input) -> std::string
{
    auto result = std::string{};

    // At least one byte is produced per code point.
    result.reserve(input.size());

    for (auto const code : input)
    {
        auto const is_surrogate = (code >= 0xD800 && code <= 0xDFFF);

        if (is_surrogate || code > 0x10FFFF)
        {
            // Encoding error, print the REPLACEMENT CHARACTER (U+FFFD),
            // whose UTF-8 encoding is EF BF BD.
            result.append("\xEF\xBF\xBD");
        }
        else if (code < 0x80)
        {
            result.push_back(static_cast<char>(code)); // 0xxxxxxx
        }
        else if (code < 0x800) // 00000yyy yyxxxxxx
        {
            result.push_back(static_cast<char>(0xC0 | (code >> 6))); // 110yyyyy
            result.push_back(static_cast<char>(0x80 | (code & 0x3F))); // 10xxxxxx
        }
        else if (code < 0x10000) // zzzzyyyy yyxxxxxx
        {
            result.push_back(static_cast<char>(0xE0 | (code >> 12))); // 1110zzzz
            result.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3F))); // 10yyyyyy
            result.push_back(static_cast<char>(0x80 | (code & 0x3F))); // 10xxxxxx
        }
        else // 000uuuuu zzzzyyyy yyxxxxxx
        {
            result.push_back(static_cast<char>(0xF0 | (code >> 18))); // 11110uuu
            result.push_back(static_cast<char>(0x80 | ((code >> 12) & 0x3F))); // 10uuzzzz
            result.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3F))); // 10yyyyyy
            result.push_back(static_cast<char>(0x80 | (code & 0x3F))); // 10xxxxxx
        }
    }

    return result;
}
627+
628+
// Overload for narrow strings: no conversion is performed, the argument is
// returned as a copy.
inline auto to_narrow_multibyte(std::string const& str) -> std::string
629+
{
630+
return str;
631+
}
700632

633+
// Convert a wide string to a narrow multibyte string using `std::wcrtomb`,
// i.e. according to the LC_CTYPE category of the current C locale.
//
// Only the bytes actually produced for each wide character are appended. A
// wide character that cannot be represented in the current locale is replaced
// by '?' and the conversion state is reset before continuing.
inline auto to_narrow_multibyte(std::wstring const& str) -> std::string
{
    auto result = std::string{};

    auto state = std::mbstate_t{};

    // Scratch buffer big enough for any single character in the current locale.
    auto mb = std::string(MB_CUR_MAX, '\0');

    for (auto const wc : str)
    {
        auto const written = std::wcrtomb(&mb[0], wc, &state);
        if (written == static_cast<std::size_t>(-1))
        {
            // Unrepresentable character: the state is unspecified after an
            // error, so start again from the initial conversion state.
            result.push_back('?');
            state = std::mbstate_t{};
        }
        else
        {
            // Append only the bytes written, never the buffer padding.
            result.append(mb.data(), written);
        }
    }

    return result;
}
720647

648+
// Overload for UTF-32 strings: the result is whatever `to_utf8_string`
// returns for `str`.
inline auto to_narrow_multibyte(std::u32string const& str) -> std::string
649+
{
650+
return to_utf8_string(str);
651+
}
652+
653+
// Overload for UTF-16 strings: the input is first converted with `to_utf32`
// and the resulting `std::u32string` is handed to `to_narrow_multibyte` again.
inline auto to_narrow_multibyte(std::u16string const& str) -> std::string
654+
{
655+
return to_narrow_multibyte(to_utf32(str));
656+
}
657+
721658
#if defined(__cpp_char8_t)
// Overload for `std::u8string`: the code units are copied byte for byte into a
// `std::string`, without any transcoding.
//
// The length is passed explicitly so that embedded NUL code units do not
// truncate the result.
inline auto to_narrow_multibyte(std::u8string const& str) -> std::string
{
    return std::string(reinterpret_cast<char const*>(str.data()), str.size());
}
#endif
728664

tests/test_c++11.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,15 @@ TEST_CASE("std_string")
797797
REQUIRE(str == "ic| v0: \"u16str \xce\xb1\"\n");
798798
}
799799

800+
{
801+
auto str = std::string{};
802+
icecream::ic.output(str);
803+
804+
auto v0 = std::u16string {u"u16str \U0001D11E"};
805+
IC(v0);
806+
REQUIRE(str == "ic| v0: \"u16str \xf0\x9d\x84\x9e\"\n");
807+
}
808+
800809
{
801810
auto str = std::string{};
802811
icecream::ic.output(str);

0 commit comments

Comments
 (0)