|
48 | 48 | #include <string>
|
49 | 49 | #include <tuple>
|
50 | 50 | #include <type_traits>
|
51 |
| -#include <uchar.h> |
52 | 51 | #include <utility>
|
53 | 52 | #include <valarray>
|
54 | 53 | #include <vector>
|
@@ -553,176 +552,113 @@ namespace icecream{ namespace detail
|
553 | 552 |
|
554 | 553 | // -------------------------------------------------- Char encoding
|
555 | 554 |
|
556 |
| -#if defined(__cpp_char8_t) |
557 |
| - // A public domain branchless UTF-8 decoder by Christopher Wellons: |
558 |
| - // https://github.com/skeeto/branchless-utf8 |
559 |
| - /* Decode the next character, c, from s, reporting errors in e. |
560 |
| - * |
561 |
| - * Since this is a branchless decoder, four bytes will be read from the |
562 |
| - * buffer regardless of the actual length of the next character. This |
563 |
| - * means the buffer _must_ have at least three bytes of zero padding |
564 |
| - * following the end of the data stream. |
565 |
| - * |
566 |
| - * Errors are reported in e, which will be non-zero if the parsed |
567 |
| - * character was somehow invalid: invalid byte sequence, non-canonical |
568 |
| - * encoding, or a surrogate half. |
569 |
| - * |
570 |
| - * The function returns a pointer to the next character. When an error |
571 |
| - * occurs, this pointer will be a guess that depends on the particular |
572 |
| - * error, but it will always advance at least one byte. |
573 |
| - */ |
574 |
| - inline auto utf8_decode(char8_t const* buf, char32_t* c, int* e) -> char8_t const* |
575 |
| - { |
576 |
| - static const char lengths[] = { |
577 |
| - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
578 |
| - 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0 |
579 |
| - }; |
580 |
| - |
581 |
| - static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; |
582 |
| - static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536}; |
583 |
| - static const int shiftc[] = {0, 18, 12, 6, 0}; |
584 |
| - static const int shifte[] = {0, 6, 4, 2, 0}; |
585 |
| - |
586 |
| - char8_t const* s = buf; |
587 |
| - int len = lengths[s[0] >> 3]; |
588 |
| - |
589 |
| - /* Compute the pointer to the next character early so that the next |
590 |
| - * iteration can start working on the next character. Neither Clang |
591 |
| - * nor GCC figure out this reordering on their own. |
592 |
| - */ |
593 |
| - char8_t const* next = s + len + !len; |
594 |
| - |
595 |
| - /* Assume a four-byte character and load four bytes. Unused bits are |
596 |
| - * shifted out. |
597 |
| - */ |
598 |
| - *c = (uint32_t)(s[0] & masks[len]) << 18; |
599 |
| - *c |= (uint32_t)(s[1] & 0x3f) << 12; |
600 |
| - *c |= (uint32_t)(s[2] & 0x3f) << 6; |
601 |
| - *c |= (uint32_t)(s[3] & 0x3f) << 0; |
602 |
| - *c >>= shiftc[len]; |
603 |
| - |
604 |
| - /* Accumulate the various error conditions. */ |
605 |
| - *e = (*c < mins[len]) << 6; // non-canonical encoding |
606 |
| - *e |= ((*c >> 11) == 0x1b) << 7; // surrogate half? |
607 |
| - *e |= (*c > 0x10FFFF) << 8; // out of range? |
608 |
| - *e |= (s[1] & 0xc0) >> 2; |
609 |
| - *e |= (s[2] & 0xc0) >> 4; |
610 |
| - *e |= (s[3] ) >> 6; |
611 |
| - *e ^= 0x2a; // top two bits of each tail byte correct? |
612 |
| - *e >>= shifte[len]; |
613 |
| - |
614 |
| - return next; |
615 |
| - } |
616 |
| - |
617 |
| - |
// Decodes a UTF-16 string into a string of UTF-32 code points.
//
// A well-formed surrogate pair is combined into a single supplementary-plane
// code point. An unpaired surrogate (a high surrogate not followed by a low
// one, or a stray low surrogate) is replaced by U+FFFD REPLACEMENT CHARACTER
// and decoding resumes at the following code unit, so this function never
// reports failure.
inline auto to_utf32(std::u16string const& input) -> std::u32string
{
    auto result = std::u32string{};
    // Every code unit yields at most one code point, so this is an upper bound
    // that avoids reallocations while appending.
    result.reserve(input.size());

    auto it = input.begin();
    while (it != input.end())
    {
        auto const unit = static_cast<uint32_t>(*it);
        auto const it_next = it + 1;

        // Unsigned wrap-around makes this a single range test: the result is
        // below 2048 only when `unit` is inside [0xD800, 0xDFFF].
        if ((unit - 0xD800u) >= 2048u) // is not surrogate
        {
            result.push_back(static_cast<char32_t>(unit));
            ++it;
        }
        else if (
            (unit & 0xFFFFFC00u) == 0xD800u // is high surrogate
            && it_next != input.end()
            && (static_cast<uint32_t>(*it_next) & 0xFFFFFC00u) == 0xDC00u // is low surrogate
        ){
            auto const low = static_cast<uint32_t>(*it_next);
            // 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000: subtracting it
            // removes both surrogate tags and adds the 0x10000 plane offset.
            result.push_back(static_cast<char32_t>((unit << 10) + low - 0x35FDC00u));
            it += 2;
        }
        else
        {
            // Encoding error, print the REPLACEMENT CHARACTER
            result.push_back(char32_t{0xFFFD});
            ++it;
        }
    }

    return result;
}
|
662 |
| -#endif |
663 |
| - |
664 |
| - inline auto cxrtomb(char* s, char16_t c, std::mbstate_t* ps) -> std::size_t |
665 |
| - { |
666 |
| - return c16rtomb(s, c, ps); |
667 |
| - } |
668 |
| - |
669 |
| - inline auto cxrtomb(char* s, char32_t c, std::mbstate_t* ps) -> std::size_t |
670 |
| - { |
671 |
| - return c32rtomb(s, c, ps); |
672 |
| - } |
673 | 589 |
|
// Encodes a string of UTF-32 code points as a UTF-8 byte string.
//
// Values that are not Unicode scalar values (the surrogate range
// U+D800..U+DFFF and anything above U+10FFFF) have no well-formed UTF-8
// encoding; each one is emitted as U+FFFD REPLACEMENT CHARACTER, i.e. the
// bytes EF BF BD.
inline auto to_utf8_string(std::u32string const& input) -> std::string
{
    auto result = std::string{};
    // At least one byte is produced per code point.
    result.reserve(input.size());

    // Appends the low 8 bits of `bits` as one byte of the output.
    auto const put = [&result](char32_t bits)
    {
        result.push_back(static_cast<char>(static_cast<unsigned char>(bits)));
    };

    for (auto const code : input)
    {
        auto const is_surrogate = (code >= 0xD800 && code <= 0xDFFF);

        if (is_surrogate || code > 0x10FFFF)
        {
            // Encoding error, print the REPLACEMENT CHARACTER (U+FFFD)
            put(0xEF);
            put(0xBF);
            put(0xBD);
        }
        else if (code < 0x80)
        {
            put(code);                          // 0xxxxxxx
        }
        else if (code < 0x800)                  // 00000yyy yyxxxxxx
        {
            put(0xC0 | (code >> 6));            // 110yyyyy
            put(0x80 | (code & 0x3F));          // 10xxxxxx
        }
        else if (code < 0x10000)                // zzzzyyyy yyxxxxxx
        {
            put(0xE0 | (code >> 12));           // 1110zzzz
            put(0x80 | ((code >> 6) & 0x3F));   // 10yyyyyy
            put(0x80 | (code & 0x3F));          // 10xxxxxx
        }
        else                                    // 000uuuuu zzzzyyyy yyxxxxxx
        {
            put(0xF0 | (code >> 18));           // 11110uuu
            put(0x80 | ((code >> 12) & 0x3F));  // 10uuzzzz
            put(0x80 | ((code >> 6) & 0x3F));   // 10yyyyyy
            put(0x80 | (code & 0x3F));          // 10xxxxxx
        }
    }

    return result;
}
| 627 | + |
// Narrow strings need no conversion: the input is handed back as a copy.
inline auto to_narrow_multibyte(std::string const& str) -> std::string
{
    auto copy = std::string(str);
    return copy;
}
700 | 632 |
|
// Converts a wide string to the narrow multibyte encoding selected by the
// current C locale (LC_CTYPE), one wide character at a time.
//
// Only the bytes actually produced by std::wcrtomb are appended. A wide
// character that cannot be represented in the current locale is emitted as
// '?' and the conversion state is reset so the following characters are
// still converted.
inline auto to_narrow_multibyte(std::wstring const& str) -> std::string
{
    auto result = std::string{};

    auto state = std::mbstate_t{};
    // Scratch space reused for every character; MB_CUR_MAX is the largest
    // number of bytes a single character can need in the current locale.
    auto buffer = std::string(MB_CUR_MAX, '\0');
    for (auto const wc : str)
    {
        auto const written = std::wcrtomb(&buffer[0], wc, &state);
        if (written == static_cast<std::size_t>(-1))
        {
            // Encoding error: the state is unspecified after a failure, so
            // start over from the initial state.
            result.push_back('?');
            state = std::mbstate_t{};
        }
        else
        {
            result.append(buffer, 0, written);
        }
    }

    return result;
}
|
720 | 647 |
|
| 648 | + inline auto to_narrow_multibyte(std::u32string const& str) -> std::string |
| 649 | + { |
| 650 | + return to_utf8_string(str); |
| 651 | + } |
| 652 | + |
| 653 | + inline auto to_narrow_multibyte(std::u16string const& str) -> std::string |
| 654 | + { |
| 655 | + return to_narrow_multibyte(to_utf32(str)); |
| 656 | + } |
| 657 | + |
#if defined(__cpp_char8_t)
// char8_t code units are copied one-to-one into a narrow string; no
// transcoding is performed. The whole range [begin, end) is copied, so NUL
// code units embedded in the input do not truncate the result.
inline auto to_narrow_multibyte(std::u8string const& str) -> std::string
{
    return std::string(str.begin(), str.end());
}
#endif
|
728 | 664 |
|
|
0 commit comments