.

renatoGarcia · renatoGarcia · commit b72bf1163007 · 2024-06-18T03:39:52.000-03:00
diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@ IceCream-Cpp is a little (single header) library to help with the print debuggin
      * [line_wrap_width](#line_wrap_width)
      * [include_context](#include_context)
      * [context_delimiter](#context_delimiter)
+  * [Character Encoding](#character-encoding)
   * [Printing logic](#printing-logic)
      * [C strings](#c-strings)
      * [Pointer like types](#pointer-like-types)
@@ -539,6 +540,32 @@ The string separating the context text from the variables values. Default value
     auto context_delimiter(std::string const& value) -> IcecreamAPI&;
     ```
 
+### Character Encoding
+
+Character encoding in C++ is complicated. `std::string` and `char*` strings don't carry
+any requirement or information about which encoding they are using, while a string
+literal, like in:
+
+```C++
+auto* const str = "foo";
+```
+
+will have a well-defined, but implementation-defined, encoding. Each system and compiler
+can potentially use a distinct encoding.
+
+`std::wstring` and `wchar_t*` strings are even a little more complicated. Besides all the
+above considerations, the bit size of their [code
+unit](https://en.cppreference.com/w/cpp/language/charset#Code_unit_and_literal_encoding)
+is implementation-defined. A `wchar_t` has 32 bits on Linux but 16 bits on Windows, for
+example.
+
+The new `char8_t`, `char16_t` and `char32_t` types are better specified: they must be
+Unicode encoded, with their respective code unit width.
+
+On the other hand, to print `std::cout`
+
+On IceCream-Cpp
+
 ### Printing logic
 
 When printing a type `T`, the precedence is use an overloaded function
diff --git a/icecream.hpp b/icecream.hpp
@@ -48,7 +48,6 @@
 #include <string>
 #include <tuple>
 #include <type_traits>
-#include <uchar.h>
 #include <utility>
 #include <valarray>
 #include <vector>
@@ -553,176 +552,113 @@ namespace icecream{ namespace detail
 
     // -------------------------------------------------- Char encoding
 
-#if defined(__cpp_char8_t)
-    // A public domain branchless UTF-8 decoder by Christopher Wellons:
-    // https://github.com/skeeto/branchless-utf8
-    /* Decode the next character, c, from s, reporting errors in e.
-     *
-     * Since this is a branchless decoder, four bytes will be read from the
-     * buffer regardless of the actual length of the next character. This
-     * means the buffer _must_ have at least three bytes of zero padding
-     * following the end of the data stream.
-     *
-     * Errors are reported in e, which will be non-zero if the parsed
-     * character was somehow invalid: invalid byte sequence, non-canonical
-     * encoding, or a surrogate half.
-     *
-     * The function returns a pointer to the next character. When an error
-     * occurs, this pointer will be a guess that depends on the particular
-     * error, but it will always advance at least one byte.
-     */
-    inline auto utf8_decode(char8_t const* buf, char32_t* c, int* e) -> char8_t const*
-    {
-        static const char lengths[] = {
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-            0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
-        };
-
-        static const int masks[]  = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
-        static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
-        static const int shiftc[] = {0, 18, 12, 6, 0};
-        static const int shifte[] = {0, 6, 4, 2, 0};
-
-        char8_t const* s = buf;
-        int len = lengths[s[0] >> 3];
-
-        /* Compute the pointer to the next character early so that the next
-         * iteration can start working on the next character. Neither Clang
-         * nor GCC figure out this reordering on their own.
-         */
-        char8_t const* next = s + len + !len;
-
-        /* Assume a four-byte character and load four bytes. Unused bits are
-         * shifted out.
-         */
-        *c  = (uint32_t)(s[0] & masks[len]) << 18;
-        *c |= (uint32_t)(s[1] & 0x3f) << 12;
-        *c |= (uint32_t)(s[2] & 0x3f) <<  6;
-        *c |= (uint32_t)(s[3] & 0x3f) <<  0;
-        *c >>= shiftc[len];
-
-        /* Accumulate the various error conditions. */
-        *e  = (*c < mins[len]) << 6; // non-canonical encoding
-        *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
-        *e |= (*c > 0x10FFFF) << 8;  // out of range?
-        *e |= (s[1] & 0xc0) >> 2;
-        *e |= (s[2] & 0xc0) >> 4;
-        *e |= (s[3]       ) >> 6;
-        *e ^= 0x2a; // top two bits of each tail byte correct?
-        *e >>= shifte[len];
-
-        return next;
-    }
-
-
-    inline auto to_utf32(std::u8string const& s) -> std::u32string
+    inline auto to_utf32(std::u16string const& input) -> std::u32string  // decodes UTF-16 code units into UTF-32 code points
     {
         auto result = std::u32string{};
-        auto const block_size = size_t{4};
-        auto const* current = s.data();
 
-        if (s.size() >= block_size)
+        auto it = input.begin();
+        while (it != input.end())
         {
-            auto const* const end = current + s.size() - block_size + 1;
-            while (current < end)
+            auto const it_next = it + 1;  // may equal end(); compared against end() before being dereferenced
+            if ((*it - 0xD800u) >= 2048u)  // is not surrogate: outside [0xD800, 0xDFFF], lower values wrap around
             {
-                auto c = char32_t{};
-                auto error = int{0};
-                current = utf8_decode(current, &c, &error);
-                if (error)
-                    return U"<IceCreamCpp error decoding unicode string>";
-                result.push_back(c);
+                result.push_back(*it);  // the code unit is the code point itself
+                ++it;
             }
-        }
-
-        if (auto num_chars_left = s.data() + s.size() - current)
-        {
-            char8_t buf[2 * block_size] = {};
-
-            for (int i = 0; i < num_chars_left; ++i)
-            {
-                buf[i] = current[i];
+            else if (
+                (*it & 0xFFFFFC00u) == 0xD800u  // is high surrogate, [0xD800, 0xDBFF]
+                && it_next != input.end()
+                && (*it_next & 0xFFFFFC00u) == 0xDC00u  // is low surrogate, [0xDC00, 0xDFFF]
+            ){
+                auto const high = uint32_t{*it};
+                auto const low = uint32_t{*it_next};
+                auto const codepoint = char32_t{(high << 10) + low - 0x35FDC00u};  // 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000
+                result.push_back(codepoint);
+                it += 2;  // a surrogate pair consumes two code units
             }
-
-            current = buf;
-            auto const* const end = buf + num_chars_left;
-            while (current < end)
+            else
             {
-                auto c = char32_t{};
-                auto error = int{0};
-                current = utf8_decode(current, &c, &error);
-                if (error)
-                    return U"IceCreamCpp error decoding unicode string";
-                result.push_back(c);
+                // Encoding error (unpaired surrogate), print the REPLACEMENT CHARACTER U+FFFD
+                result.push_back(0xFFFD);
+                ++it;
             }
         }
 
         return result;
     }
-#endif
-
-    inline auto cxrtomb(char* s, char16_t c, std::mbstate_t* ps) -> std::size_t
-    {
-        return c16rtomb(s, c, ps);
-    }
-
-    inline auto cxrtomb(char* s, char32_t c, std::mbstate_t* ps) -> std::size_t
-    {
-        return c32rtomb(s, c, ps);
-    }
 
-    inline auto cxrtomb(char* s, wchar_t c, std::mbstate_t* ps) -> std::size_t
+    inline auto to_utf8_string(std::u32string const& input) -> std::string  // encodes UTF-32 code points as UTF-8 bytes
     {
-        return wcrtomb(s, c, ps);
-    }
-
-    inline auto to_narrow_multibyte(std::string const& s) -> std::string
-    {
-        return s;
-    }
-
-    template <typename T>
-    auto to_narrow_multibyte(std::basic_string<T> const& str) -> std::string
-    {
-        auto const prev_loc = std::string{std::setlocale(LC_CTYPE, nullptr)};
+        auto result = std::string{};
 
-        for (auto loc : {"", "C.UTF-8"})
+        for (auto const code : input)
         {
-            if (std::string{std::setlocale(LC_CTYPE, nullptr)} != "C")
+            if (code < 0x80)
             {
-                break;
+                result.push_back(code);  // 0xxxxxxx
             }
-            else
+            else if (code < 0x800)  // 00000yyy yyxxxxxx
+            {
+                result.push_back(0xC0 | (code >> 6));    // 110yyyyy
+                result.push_back(0x80 | (code & 0x3F));  // 10xxxxxx
+            }
+            else if (code < 0xD800 || (code > 0xDFFF && code < 0x10000))  // zzzzyyyy yyxxxxxx, surrogates excluded
             {
-                std::setlocale(LC_CTYPE, loc);
+                result.push_back(0xE0 | (code >> 12));          // 1110zzzz
+                result.push_back(0x80 | ((code >> 6) & 0x3F));  // 10yyyyyy
+                result.push_back(0x80 | (code & 0x3F));         // 10xxxxxx
+            }
+            else if (code >= 0x10000 && code <= 0x10FFFF)  // 000uuuuu zzzzyyyy yyxxxxxx, capped at the last Unicode code point
+            {
+                result.push_back(0xF0 | (code >> 18));           // 11110uuu
+                result.push_back(0x80 | ((code >> 12) & 0x3F));  // 10uuzzzz
+                result.push_back(0x80 | ((code >> 6)  & 0x3F));  // 10yyyyyy
+                result.push_back(0x80 | (code & 0x3F));          // 10xxxxxx
+            }
+            else  // Encoding error (surrogate or above U+10FFFF), print the REPLACEMENT CHARACTER
+            {
+                result.push_back(0xEF);
+                result.push_back(0xBF);
+                result.push_back(0xBD);  // U+FFFD is EF BF BD in UTF-8; EF BF BF would be the noncharacter U+FFFF
             }
         }
+        return result;
+    }
+
+    inline auto to_narrow_multibyte(std::string const& str) -> std::string
+    {
+        return str;  // already a narrow string: returned as a copy, no transcoding is applied
+    }
 
+    inline auto to_narrow_multibyte(std::wstring const& str) -> std::string  // wide -> multibyte, using the current C locale (LC_CTYPE)
+    {
         auto result = std::string{};
-        auto state = std::mbstate_t();
-        for (auto const c : str)
+
+        auto state = std::mbstate_t{};
+        for (auto const wc : str)
         {
-            char out[MB_LEN_MAX]{};
-            auto const rc = cxrtomb(out, c, &state);
-            if (rc == static_cast<std::size_t>(-1))
-            {
-                auto sstr = std::ostringstream{};
-                sstr << "IceCreamCpp error decoding string errno " << std::strerror(errno);
-                std::setlocale(LC_CTYPE, prev_loc.c_str());
-                return sstr.str();
-            }
-            result.append(out, rc);
+            auto mb = std::string(MB_CUR_MAX, '\0');  // scratch buffer large enough for any single character
+            auto const len = std::wcrtomb(&mb[0], wc, &state);  // bytes written, or size_t(-1) when wc is not representable
+            if (len != static_cast<std::size_t>(-1)) result.append(mb, 0, len); else { result.push_back('?'); state = std::mbstate_t{}; }  // append only the written bytes; on error substitute '?' and reset the now unspecified state
         }
 
-        std::setlocale(LC_CTYPE, prev_loc.c_str());
         return result;
     }
 
+    inline auto to_narrow_multibyte(std::u32string const& str) -> std::string
+    {
+        return to_utf8_string(str);  // UTF-32 input is emitted as UTF-8, independently of the locale
+    }
+
+    inline auto to_narrow_multibyte(std::u16string const& str) -> std::string
+    {
+        return to_narrow_multibyte(to_utf32(str));  // decode UTF-16 to UTF-32, then reuse the u32string overload
+    }
+
 #if defined(__cpp_char8_t)
     inline auto to_narrow_multibyte(std::u8string const& str) -> std::string
     {
-        // c8rtomb is missing from many implementations
-        return to_narrow_multibyte(to_utf32(str));
+        return std::string(str.begin(), str.end());  // copy every UTF-8 code unit; a bare char const* would stop at the first embedded NUL
     }
 #endif
 
diff --git a/tests/test_c++11.cpp b/tests/test_c++11.cpp
@@ -797,6 +797,15 @@ TEST_CASE("std_string")
         REQUIRE(str == "ic| v0: \"u16str \xce\xb1\"\n");
     }
 
+    {
+        auto str = std::string{};
+        icecream::ic.output(str);
+
+        auto v0 = std::u16string {u"u16str \U0001D11E"};
+        IC(v0);
+        REQUIRE(str == "ic| v0: \"u16str \xf0\x9d\x84\x9e\"\n");
+    }
+
     {
         auto str = std::string{};
         icecream::ic.output(str);