diff --git a/stl/CMakeLists.txt b/stl/CMakeLists.txt index 751c6ad78a..12fa2b9255 100644 --- a/stl/CMakeLists.txt +++ b/stl/CMakeLists.txt @@ -249,6 +249,7 @@ endforeach() # Objs that exist in both libcpmt[d][01].lib and msvcprt[d].lib. set(IMPLIB_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/filesystem.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/format.cpp ${CMAKE_CURRENT_LIST_DIR}/src/locale0_implib.cpp ${CMAKE_CURRENT_LIST_DIR}/src/nothrow.cpp ${CMAKE_CURRENT_LIST_DIR}/src/sharedmutex.cpp diff --git a/stl/inc/format b/stl/inc/format index 939581aff9..d09850c03f 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -53,6 +53,7 @@ #include #include #include +#include #include #pragma pack(push, _CRT_PACKING) @@ -62,6 +63,8 @@ _STL_DISABLE_CLANG_WARNINGS #pragma push_macro("new") #undef new +extern "C" _NODISCARD __std_win_error __stdcall __std_get_cvt(__std_code_page _Codepage, _Cvtvec* _Pcvt) noexcept; + _STD_BEGIN class format_error : public runtime_error { @@ -444,95 +447,232 @@ _NODISCARD constexpr bool _Is_execution_charset_utf8() { #pragma warning(pop) } -inline constexpr bool _Is_execution_charset_utf8_v = _Is_execution_charset_utf8(); +inline constexpr char16_t _Width_estimate_low_intervals[] = { // Per N4885 [format.string.std]/11 + 0x1100u, 0x1160u, 0x2329u, 0x232Bu, 0x2E80u, 0x303Fu, 0x3040u, 0xA4D0u, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u, 0xFE10u, + 0xFE1Au, 0xFE30u, 0xFE70u, 0xFF00u, 0xFF61u, 0xFFE0u, 0xFFE7u}; -_NODISCARD constexpr int _Utf8_code_units_in_next_character( - const char* const _First, const char* const _Last) noexcept { - // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, _Last), - // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. - const auto _Ch = static_cast(*_First); - if (_Ch < 0b1000'0000u) { - return 1; +inline constexpr char32_t _Width_estimate_high_intervals[] = { // Per N4885 [format.string.std]/11 + 0x1F300u, 0x1F650u, 0x1F900u, 0x1FA00u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu}; + +template +_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept { + // Computes the width estimation for Unicode characters from N4885 [format.string.std]/11 + int _Result = 1; + for (const auto& _Bound : _Bounds) { + if (_Ch < _Bound) { + return _Result; + } + _Result ^= 0b11u; // Flip between 1 and 2 on each iteration } - const auto _Len = static_cast(_Last - _First); + return 1; +} - if (_Ch < 0b1110'0000u) { - // check for non-lead byte or partial 2-byte encoded character - return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1; +template +class _Fmt_codec; + +template +class _Fmt_codec_base {}; + +template <> +class _Fmt_codec_base { +protected: + _Cvtvec _Cvt; + + _NODISCARD int _Double_byte_encoding_code_units_in_next_character( + const char* const _First, const char* const _Last) const { + // Returns a count of the number of code units that compose the first encoded character in [_First, _Last), + // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. + wchar_t _Wide; + mbstate_t _St{}; + const auto _Len = static_cast(_Last - _First); + const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt); + if (_Result > 0) { + return _Result; + } else if (_Result < 0) { // invalid or incomplete encoded character + return -1; + } else { // next code unit is '\0' + return 1; + } } - if (_Ch < 0b1111'0000u) { - // check for partial 3-byte encoded character - return (_Len >= 3) ? 3 : -1; + _Fmt_codec_base() { +#ifndef _FORMAT_CODEPAGE +#define _FORMAT_CODEPAGE __std_code_page::_Acp +#endif // _FORMAT_CODEPAGE + [[maybe_unused]] const __std_win_error _Result = __std_get_cvt(_FORMAT_CODEPAGE, &_Cvt); + _STL_INTERNAL_CHECK(_Result == __std_win_error::_Success); +#undef _FORMAT_CODEPAGE } +}; - // check for partial 4-byte encoded character - return (_Len >= 4) ? 4 : -1; -} +template +class _Fmt_codec : private _Fmt_codec_base<_Statically_Utf8> { +private: + _NODISCARD static constexpr int _Utf8_code_units_in_next_character( + const char* const _First, const char* const _Last) noexcept { + // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, + // _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead + // byte. + const auto _Ch = static_cast(*_First); + if (_Ch < 0b1000'0000u) { + return 1; + } -_NODISCARD inline int _Double_byte_encoding_code_units_in_next_character( - const char* const _First, const char* const _Last, const _Cvtvec& _Cvt) { - // Returns a count of the number of code units that compose the first encoded character in [_First, _Last), - // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. - wchar_t _Wide; - mbstate_t _St{}; - const auto _Len = static_cast(_Last - _First); - const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt); - if (_Result > 0) { - return _Result; - } else if (_Result < 0) { // invalid or incomplete encoded character - return -1; - } else { // next code unit is '\0' - return 1; - } -} + const auto _Len = static_cast(_Last - _First); -_NODISCARD inline int _Code_units_in_next_character(const char* _First, const char* _Last, const _Cvtvec& _Cvt) { - // Returns a count of the number of code units that compose the first encoded character in - // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or - // *_First is not a valid lead byte. - _STL_INTERNAL_CHECK(_First < _Last); + if (_Ch < 0b1110'0000u) { + // check for non-lead byte or partial 2-byte encoded character + return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1; + } - if constexpr (_Is_execution_charset_utf8_v) { - return _Utf8_code_units_in_next_character(_First, _Last); - } else { - switch (_Cvt._Mbcurmax) { + if (_Ch < 0b1111'0000u) { + // check for partial 3-byte encoded character + return (_Len >= 3) ? 3 : -1; + } + + // check for partial 4-byte encoded character + return (_Len >= 4) ? 4 : -1; + } + + _NODISCARD static int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept { + // Return an estimate for the width of the character composed of _Units code units, + // whose first code unit is denoted by _Ptr. + auto _Ch = static_cast(*_Ptr); + switch (_Units) { default: - _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page"); - [[fallthrough]]; case 1: - return 1; // all characters have only one code unit - case 2: - return _Double_byte_encoding_code_units_in_next_character(_First, _Last, _Cvt); + return 1; + case 3: + _Ch &= 0b1111u; + break; + case 4: + _Ch &= 0b111u; + break; + } + + for (int _Idx = 1; _Idx < _Units; ++_Idx) { + _Ch = _Ch << 6 | (_Ptr[_Idx] & 0b11'1111u); + } - case 4: // Assume UTF-8 (as does _Mbrtowc) + if (_Units == 3) { + return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch); + } + + return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); + } + +public: + _NODISCARD int _Units_in_next_character(const char* const _First, const char* const _Last) const noexcept { + // Returns a count of the number of code units that compose the first encoded character in + // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or + // *_First is not a valid lead byte. + _STL_INTERNAL_CHECK(_First < _Last); + + if constexpr (_Statically_Utf8) { return _Utf8_code_units_in_next_character(_First, _Last); + } else { + switch (this->_Cvt._Mbcurmax) { + default: + _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page"); + [[fallthrough]]; + case 1: + return 1; // all characters have only one code unit + + case 2: + return this->_Double_byte_encoding_code_units_in_next_character(_First, _Last); + + case 4: // Assume UTF-8 (as does _Mbrtowc) + return _Utf8_code_units_in_next_character(_First, _Last); + } } } -} -_NODISCARD inline int _Code_units_in_next_character(const wchar_t* _First, const wchar_t* _Last, const _Cvtvec&) { - // Returns a count of the number of code units that compose the first encoded character in - // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or - // *_First is an unpaired surrogate. - _STL_INTERNAL_CHECK(_First < _Last); + _NODISCARD const char* _Find_encoded(const char* _First, const char* const _Last, const char _Val) const { + // Returns the first occurrence of _Val as an encoded character (and not, for example, as a + // continuation byte) in [_First, _Last). + if constexpr (_Statically_Utf8) { + return _Find_unchecked(_First, _Last, _Val); + } else { + if (this->_Cvt._Mbcurmax == 1 || this->_Cvt._Mbcurmax == 4) { + // As above and in _Mbrtowc, assume 4-byte encodings are UTF-8 + return _Find_unchecked(_First, _Last, _Val); + } + + while (_First != _Last && *_First != _Val) { + const int _Units = _Units_in_next_character(_First, _Last); + if (_Units < 0) { + _THROW(format_error("Invalid encoded character in format string.")); + } + _First += _Units; + } - if (*_First < 0xD800u || *_First >= 0xE000u) { - return 1; + return _First; + } } - if (*_First >= 0xDC00u) { // unpaired low surrogate - return -1; + _NODISCARD int _Estimate_width(const char* const _Ptr, const int _Units) const { + // Return an estimate for the width of the character composed of _Units code units, + // whose first code unit is denoted by _Ptr. + if constexpr (_Statically_Utf8) { + return _Estimate_utf8_character_width(_Ptr, _Units); + } else { + if (this->_Cvt._Mbcurmax != 4) { + // not a Unicode encoding; estimate width == number of code units + return _Units; + } + + // assume UTF-8 + return _Estimate_utf8_character_width(_Ptr, _Units); + } } +}; + +template +class _Fmt_codec { +public: + _NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* const _Last) const noexcept { + // Returns a count of the number of code units that compose the first encoded character in + // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or + // *_First is an unpaired surrogate. + _STL_INTERNAL_CHECK(_First < _Last); + + if (*_First < 0xD800u || *_First >= 0xE000u) { + return 1; + } - if (++_First == _Last || *_First < 0xDC00u || *_First >= 0xE000u) { // unpaired high surrogate - return -1; + if (*_First >= 0xDC00u) { // unpaired low surrogate + return -1; + } + + if (++_First == _Last || *_First < 0xDC00u || *_First >= 0xE000u) { // unpaired high surrogate + return -1; + } + + return 2; // surrogate pair } - return 2; // surrogate pair -} + _NODISCARD const wchar_t* _Find_encoded( + const wchar_t* const _First, const wchar_t* const _Last, const wchar_t _Val) const { + return _Find_unchecked(_First, _Last, _Val); + } + + _NODISCARD int _Estimate_width(const wchar_t* const _Ptr, const int _Units) const { + // Return an estimate for the width of the character composed of _Units code units, + // whose first code unit is denoted by _Ptr. + auto _Ch = static_cast(*_Ptr); + if (_Units == 1) { + return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch); + } + + // surrogate pair + _Ch = (_Ch - 0xD800u) << 10; + _Ch += static_cast(_Ptr[1]) - 0xDC00u; + _Ch += 0x10000u; + return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); + } +}; template _Callbacks_type> _NODISCARD const _CharT* _Parse_align(const _CharT* _Begin, const _CharT* _End, _Callbacks_type&& _Callbacks) { @@ -540,7 +680,7 @@ _NODISCARD const _CharT* _Parse_align(const _CharT* _Begin, const _CharT* _End, _STL_INTERNAL_CHECK(_Begin != _End && *_Begin != '}'); auto _Parsed_align = _Fmt_align::_None; - const int _Units = _Code_units_in_next_character(_Begin, _End, _Getcvt()); + const int _Units = _Fmt_codec<_CharT>{}._Units_in_next_character(_Begin, _End); if (_Units < 0) { // invalid fill character encoding _THROW(format_error("Invalid format string.")); } @@ -800,44 +940,19 @@ _NODISCARD constexpr const _CharT* _Parse_replacement_field( return _Begin + 1; } -template -_NODISCARD const _CharT* _Find_encoded( - const _CharT* _First, const _CharT* _Last, const _CharT _Val, const _Cvtvec& _Cvt) { - // Returns the first occurrence of _Val as an encoded character (and not, for example, as a - // continuation byte) in [_First, _Last). - if constexpr (_Is_execution_charset_utf8_v) { - return _Find_unchecked(_First, _Last, _Val); - } else { - if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) { - // As above and in _Mbrtowc, assume 4-byte encodings are UTF-8 - return _Find_unchecked(_First, _Last, _Val); - } - - while (_First != _Last && *_First != _Val) { - const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt); - if (_Units < 0) { - _THROW(format_error("Invalid encoded character in format string.")); - } - _First += _Units; - } - - return _First; - } -} - template _HandlerT> void _Parse_format_string(basic_string_view<_CharT> _Format_str, _HandlerT&& _Handler) { - auto _Begin = _Format_str.data(); - auto _End = _Begin + _Format_str.size(); - const _Cvtvec& _Cvt = _Getcvt(); + auto _Begin = _Format_str.data(); + auto _End = _Begin + _Format_str.size(); + const _Fmt_codec<_CharT> _Codec; while (_Begin != _End) { const _CharT* _OpeningCurl = _Begin; if (*_Begin != '{') { - _OpeningCurl = _Find_encoded(_Begin, _End, _CharT{'{'}, _Cvt); + _OpeningCurl = _Codec._Find_encoded(_Begin, _End, _CharT{'{'}); for (;;) { - const _CharT* _ClosingCurl = _Find_encoded(_Begin, _OpeningCurl, _CharT{'}'}, _Cvt); + const _CharT* _ClosingCurl = _Codec._Find_encoded(_Begin, _OpeningCurl, _CharT{'}'}); // In this case there are neither closing nor opening curls in [_Begin, _OpeningCurl) // Write the whole thing out. @@ -2214,95 +2329,15 @@ _NODISCARD _OutputIt _Fmt_write( return _Fmt_write(_STD move(_Out), basic_string_view<_CharT>{_Value}, _Specs, _Locale); } -inline constexpr char16_t _Width_estimate_low_intervals[] = { // Per N4885 [format.string.std]/11 - 0x1100u, 0x1160u, 0x2329u, 0x232Bu, 0x2E80u, 0x303Fu, 0x3040u, 0xA4D0u, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u, 0xFE10u, - 0xFE1Au, 0xFE30u, 0xFE70u, 0xFF00u, 0xFF61u, 0xFFE0u, 0xFFE7u}; - -inline constexpr char32_t _Width_estimate_high_intervals[] = { // Per N4885 [format.string.std]/11 - 0x1F300u, 0x1F650u, 0x1F900u, 0x1FA00u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu}; - -template -_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept { - // Computes the width estimation for Unicode characters from N4885 [format.string.std]/11 - int _Result = 1; - for (const auto& _Bound : _Bounds) { - if (_Ch < _Bound) { - return _Result; - } - _Result ^= 1; - } - - return 1; -} - -_NODISCARD inline int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept { - // Return an estimate for the width of the character composed of _Units code units, - // whose first code unit is denoted by _Ptr. - auto _Ch = static_cast(*_Ptr); - switch (_Units) { - default: - case 1: - case 2: - return 1; - case 3: - _Ch &= 0b1111u; - break; - case 4: - _Ch &= 0b111u; - break; - } - - for (int _Idx = 1; _Idx < _Units; ++_Idx) { - _Ch = _Ch << 6 | (_Ptr[_Idx] & 0b11'1111u); - } - - if (_Units == 3) { - return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch); - } - - return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); -} - -_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) { - // Return an estimate for the width of the character composed of _Units code units, - // whose first code unit is denoted by _Ptr. - if constexpr (_Is_execution_charset_utf8_v) { - return _Estimate_utf8_character_width(_Ptr, _Units); - } else { - if (_Cvt._Mbcurmax != 4) { - // not a Unicode encoding; estimate width == number of code units - return _Units; - } - - // assume UTF-8 - return _Estimate_utf8_character_width(_Ptr, _Units); - } -} - -_NODISCARD inline int _Estimate_character_width(const wchar_t* _Ptr, const int _Units, const _Cvtvec&) { - // Return an estimate for the width of the character composed of _Units code units, - // whose first code unit is denoted by _Ptr. - auto _Ch = static_cast(*_Ptr); - if (_Units == 1) { - return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch); - } - - // surrogate pair - _Ch = (_Ch - 0xD8000u) << 10; - _Ch += static_cast(_Ptr[1]) - 0xDC00u; - _Ch += 0x10000u; - return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); -} - template _NODISCARD const _CharT* _Measure_string_prefix(const basic_string_view<_CharT> _Value, int& _Width) { // Returns a pointer past-the-end of the largest prefix of _Value that fits in _Width, or all // of _Value if _Width is negative. Updates _Width to the estimated width of that prefix. - const int _Max_width = _Width; - auto _Pos = _Value.data(); - const auto _Last = _Pos + _Value.size(); - int _Estimated_width = 0; // the estimated width of [_Value.data(), _Pos) - const _Cvtvec& _Cvt = _Getcvt(); + const int _Max_width = _Width; + auto _Pos = _Value.data(); + const auto _Last = _Pos + _Value.size(); + int _Estimated_width = 0; // the estimated width of [_Value.data(), _Pos) + const _Fmt_codec<_CharT> _Codec; constexpr auto _Max_int = (numeric_limits::max)(); while (_Pos != _Last) { @@ -2312,8 +2347,8 @@ _NODISCARD const _CharT* _Measure_string_prefix(const basic_string_view<_CharT> } // TRANSITION, extended grapheme clustering - const int _Units = _Code_units_in_next_character(_Pos, _Last, _Cvt); - const int _Character_width = _Estimate_character_width(_Pos, _Units, _Cvt); + const int _Units = _Codec._Units_in_next_character(_Pos, _Last); + const int _Character_width = _Codec._Estimate_width(_Pos, _Units); if (_Max_int - _Character_width < _Estimated_width) { // avoid overflow // Either _Max_width isn't set, or adding this character will exceed it. diff --git a/stl/inc/xfilesystem_abi.h b/stl/inc/xfilesystem_abi.h index 052af6cc93..f0b456bd69 100644 --- a/stl/inc/xfilesystem_abi.h +++ b/stl/inc/xfilesystem_abi.h @@ -205,7 +205,7 @@ _BITMASK_OPS(__std_fs_file_flags) enum class __std_fs_file_handle : intptr_t { _Invalid = -1 }; -enum class __std_code_page : unsigned int { _Utf8 = 65001 }; +enum class __std_code_page : unsigned int { _Acp = 0, _Utf8 = 65001 }; struct __std_fs_convert_result { int _Len; diff --git a/stl/msbuild/stl_base/stl.files.settings.targets b/stl/msbuild/stl_base/stl.files.settings.targets index 05ff1f5ddb..15aa0fbc46 100644 --- a/stl/msbuild/stl_base/stl.files.settings.targets +++ b/stl/msbuild/stl_base/stl.files.settings.targets @@ -161,6 +161,7 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception (controlled by IncludeInLink and IncludeInImportLib). --> (_Codepage), _Flags, &_Info) == 0) { + // NB: the only documented failure mode for GetCPInfoExW is ERROR_INVALID_PARAMETER, + // so in practice it should never fail for CP_ACP. + return __std_win_error{GetLastError()}; + } + + _Pcvt->_Page = _Info.CodePage; + _Pcvt->_Mbcurmax = _Info.MaxCharSize; + + for (int _Idx = 0; _Idx < MAX_LEADBYTES; _Idx += 2) { + if (_Info.LeadByte[_Idx] == 0 && _Info.LeadByte[_Idx + 1] == 0) { + break; + } + + for (unsigned char _First = _Info.LeadByte[_Idx], _Last = _Info.LeadByte[_Idx + 1]; _First != _Last; ++_First) { + _Pcvt->_Isleadbyte[_First >> 3] |= 1u << (_First & 0b111u); + } + } + + return __std_win_error::_Success; +} diff --git a/tests/std/include/test_format_support.hpp b/tests/std/include/test_format_support.hpp index 89dd541e85..77cc36eb85 100644 --- a/tests/std/include/test_format_support.hpp +++ b/tests/std/include/test_format_support.hpp @@ -117,9 +117,10 @@ void test_parse_helper(const CharT* (*func)(const CharT*, const CharT*, callback callback_type&& callbacks = {}) { try { auto end = func(view.data(), view.data() + view.size(), std::move(callbacks)); - if (expected_end_position != std::basic_string_view::npos) { - assert(end == view.data() + expected_end_position); + if (expected_end_position == std::basic_string_view::npos) { + expected_end_position = view.size(); } + assert(end == view.data() + expected_end_position); assert(!err_expected); } catch (const std::format_error&) { assert(err_expected); diff --git a/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp b/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp index ab37ef41f5..abee200168 100644 --- a/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp @@ -975,31 +975,6 @@ void test_size() { test_size_helper(8, STR("{:8}"), STR("scully")); } -void test_multibyte_format_strings() { -#ifndef MSVC_INTERNAL_TESTING // TRANSITION, the Windows version on Contest VMs doesn't always understand ".UTF-8" - { - assert(setlocale(LC_ALL, ".UTF-8") != nullptr); - // Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL) - assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32"); - - assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); - assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); - assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv); - } - - { - assert(setlocale(LC_ALL, ".UTF-8") != nullptr); - try { - (void) format("{:\x9f\x8f\x88<10}"sv, 42); // Bad fill character encoding: missing lead byte before \x9f - assert(false); - } catch (const format_error&) { - } - } -#endif // MSVC_INTERNAL_TESTING - - assert(setlocale(LC_ALL, "C") != nullptr); -} - // The libfmt_ tests are derived from tests in // libfmt, Copyright (c) 2012 - present, Victor Zverovich // See NOTICE.txt for more information. @@ -1318,8 +1293,6 @@ void test() { test_size(); test_size(); - test_multibyte_format_strings(); - libfmt_formatter_test_escape(); libfmt_formatter_test_escape(); diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp index beb4a499c6..081eecd7ce 100644 --- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#define _FORMAT_CODEPAGE (__std_code_page{932}) + #include #include #include @@ -11,55 +13,76 @@ using namespace std; void test_multibyte_format_strings() { - { - assert(setlocale(LC_ALL, ".932") != nullptr); - const auto s = - "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576) - assert(format(s) == s); - - assert(format("{:.2}", s) == "\x93\xfa"sv); - assert(format("{:4.2}", s) == "\x93\xfa "sv); + const auto s = "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576) + assert(format(s) == s); - assert(format("{:<4.2}", s) == "\x93\xfa "sv); - assert(format("{:^4.2}", s) == " \x93\xfa "sv); - assert(format("{:>4.2}", s) == " \x93\xfa"sv); + assert(format("{:.2}", s) == "\x93\xfa"sv); + assert(format("{:4.2}", s) == "\x93\xfa "sv); - assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv); - assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv); - assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv); + assert(format("{:<4.2}", s) == "\x93\xfa "sv); + assert(format("{:^4.2}", s) == " \x93\xfa "sv); + assert(format("{:>4.2}", s) == " \x93\xfa"sv); - assert(format("{:.3}", s) == "\x93\xfa"sv); - assert(format("{:4.3}", s) == "\x93\xfa "sv); + assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv); + assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv); + assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv); - assert(format("{:<4.3}", s) == "\x93\xfa "sv); - assert(format("{:^4.3}", s) == " \x93\xfa "sv); - assert(format("{:>4.3}", s) == " \x93\xfa"sv); + assert(format("{:.3}", s) == "\x93\xfa"sv); + assert(format("{:4.3}", s) == "\x93\xfa "sv); - assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv); - assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv); - assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv); - } + assert(format("{:<4.3}", s) == "\x93\xfa "sv); + assert(format("{:^4.3}", s) == " \x93\xfa "sv); + assert(format("{:>4.3}", s) == " \x93\xfa"sv); - assert(setlocale(LC_ALL, "C") != nullptr); + assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv); + assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv); + assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv); } void test_parse_align() { - auto parse_align_fn = _Parse_align>; - - { - assert(setlocale(LC_ALL, ".932") != nullptr); - test_parse_helper(parse_align_fn, "\x93\xfaX"sv, false, 3, - {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\x96\x7b"sv}); - test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3, - {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv}); - } + const auto parse_align_fn = _Parse_align>; - assert(setlocale(LC_ALL, "C") != nullptr); + test_parse_helper(parse_align_fn, "\x93\xfaX"sv, false, 3, + {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\x96\x7b"sv}); + test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3, + {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv}); +} + +void test_width_estimation() { + // Format strings of known width with a trailing delimiter using a precision large enough to + // include all but the delimiter to validate the width estimation code. + struct test_case { + const char* str; + int width; + }; + constexpr test_case test_cases[] = { + {"\x58", 1}, + {"x\x58", 2}, + + // Pick "short" and "long" codepoints (\x20 and \x96\x7b), then form all permutations of + // 3-codepoint prefixes with the same fixed delimiter as above. This gives us coverage of + // all adjacent pairings (short/short, short/long, long/short, long/long). + {"\x20\x20\x20\x58", 4}, + {"\x20\x20\x96\x7b\x58", 5}, + {"\x20\x96\x7b\x20\x58", 5}, + {"\x96\x7b\x20\x20\x58", 5}, + {"\x20\x96\x7b\x96\x7b\x58", 6}, + {"\x96\x7b\x20\x96\x7b\x58", 6}, + {"\x96\x7b\x96\x7b\x20\x58", 6}, + {"\x96\x7b\x96\x7b\x96\x7b\x58", 7}, + }; + + for (const auto& test : test_cases) { + string_view sv{test.str}; + sv = sv.substr(0, sv.size() - 1); + assert(format("{:.{}}", test.str, test.width - 1) == sv); + } } int main() { test_multibyte_format_strings(); test_parse_align(); + test_width_estimation(); } diff --git a/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp b/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp index ddf204c0db..39fe7bd217 100644 --- a/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp @@ -36,7 +36,7 @@ bool test_parse_align() { // \x343E (which is from CJK unified ideographs extension A) and similar characters to parse as // an alignment specifier. auto s4 = L"*\x343E"sv; - test_parse_helper(parse_align_fn, s4, false, view_typ::npos, {.expected_fill = L"*"sv}); + test_parse_helper(parse_align_fn, s4, false, 0, {.expected_fill = L"*"sv}); // test multi-code-unit fill characters { @@ -47,22 +47,6 @@ bool test_parse_align() { test_parse_helper(parse_align_fn, L"\U0001F3C8^X"sv, false, 3, {.expected_alignment = _Fmt_align::_Center, .expected_fill = L"\U0001F3C8"sv}); } - } else { - // test multibyte fill characters -#ifndef MSVC_INTERNAL_TESTING // TRANSITION, the Windows version on Contest VMs doesn't always understand ".UTF-8" - { - assert(setlocale(LC_ALL, ".UTF-8") != nullptr); - // "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL - test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88X"sv, false, 5, - {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv}); - test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5, - {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv}); - } -#endif // MSVC_INTERNAL_TESTING - - assert(setlocale(LC_ALL, "C") != nullptr); } return true; diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp index 5b98879423..08a70afaac 100644 --- a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp @@ -12,12 +12,12 @@ using namespace std; void test_multibyte_format_strings() { { - // Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL) - assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32"); + // Filling with footballs ("\U0001f3c8" is U+1F3C8 AMERICAN FOOTBALL) + assert(format("{:\U0001f3c8>4}"sv, 42) == "\U0001f3c8\U0001f3c8\x34\x32"); - assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); - assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); - assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv); + assert(format("{:\U0001f3c8<4.2}", "1") == "\x31\U0001f3c8\U0001f3c8\U0001f3c8"sv); + assert(format("{:\U0001f3c8^4.2}", "1") == "\U0001f3c8\x31\U0001f3c8\U0001f3c8"sv); + assert(format("{:\U0001f3c8>4.2}", "1") == "\U0001f3c8\U0001f3c8\U0001f3c8\x31"sv); } { @@ -33,19 +33,115 @@ void test_parse_align() { auto parse_align_fn = _Parse_align>; { - // "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL - test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88X"sv, false, 5, - {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv}); - test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5, - {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv}); + test_parse_helper(parse_align_fn, "\U0001f3c8X"sv, false, 5, + {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\U0001f3c8"sv}); + test_parse_helper(parse_align_fn, "\U0001f3c8^X"sv, false, 5, + {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\U0001f3c8"sv}); + } +} + +template +void test_width_estimation() { + // Format strings of known width with a trailing delimiter using a precision large enough to + // include all but the delimiter to validate the width estimation code. + struct test_case { + const CharT* str; + int width; + }; + constexpr test_case test_cases[] = { + {TYPED_LITERAL(CharT, "\x58"), 1}, + {TYPED_LITERAL(CharT, "x\x58"), 2}, + + // test the boundaries of the intervals defined in n4885 [format.string.std]/11 + {TYPED_LITERAL(CharT, "\u10ff\x58"), 2}, + {TYPED_LITERAL(CharT, "\u1100\x58"), 3}, + {TYPED_LITERAL(CharT, "\u115f\x58"), 3}, + {TYPED_LITERAL(CharT, "\u1160\x58"), 2}, + {TYPED_LITERAL(CharT, "\u2328\x58"), 2}, + {TYPED_LITERAL(CharT, "\u2329\x58"), 3}, + {TYPED_LITERAL(CharT, "\u232a\x58"), 3}, + {TYPED_LITERAL(CharT, "\u232b\x58"), 2}, + {TYPED_LITERAL(CharT, "\u2e7f\x58"), 2}, + {TYPED_LITERAL(CharT, "\u2e80\x58"), 3}, + {TYPED_LITERAL(CharT, "\u303e\x58"), 3}, + {TYPED_LITERAL(CharT, "\u303f\x58"), 2}, + {TYPED_LITERAL(CharT, "\u3040\x58"), 3}, + {TYPED_LITERAL(CharT, "\ua4cf\x58"), 3}, + {TYPED_LITERAL(CharT, "\ua4d0\x58"), 2}, + {TYPED_LITERAL(CharT, "\uabff\x58"), 2}, + {TYPED_LITERAL(CharT, "\uac00\x58"), 3}, + {TYPED_LITERAL(CharT, "\ud7a3\x58"), 3}, + {TYPED_LITERAL(CharT, "\ud7a4\x58"), 2}, + {TYPED_LITERAL(CharT, "\ud7ff\x58"), 2}, + + // skip over the surrogate pair range (\ud800-\udfff) + + {TYPED_LITERAL(CharT, "\ue000\x58"), 2}, + {TYPED_LITERAL(CharT, "\uf8ff\x58"), 2}, + {TYPED_LITERAL(CharT, "\uf900\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufaff\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufb00\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufe0f\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufe10\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufe19\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufe1a\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufe2f\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufe30\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufe6f\x58"), 3}, + {TYPED_LITERAL(CharT, "\ufe70\x58"), 2}, + {TYPED_LITERAL(CharT, "\ufeff\x58"), 2}, + {TYPED_LITERAL(CharT, "\uff00\x58"), 3}, + {TYPED_LITERAL(CharT, "\uff60\x58"), 3}, + {TYPED_LITERAL(CharT, "\uff61\x58"), 2}, + {TYPED_LITERAL(CharT, "\uffdf\x58"), 2}, + {TYPED_LITERAL(CharT, "\uffe0\x58"), 3}, + {TYPED_LITERAL(CharT, "\uffe6\x58"), 3}, + {TYPED_LITERAL(CharT, "\uffe7\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001f2ff\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001f300\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0001f64f\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0001f650\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001f8ff\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001f900\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0001f9ff\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0001fa00\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0001ffff\x58"), 2}, + {TYPED_LITERAL(CharT, "\U00020000\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0002fffd\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0002fffe\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0002ffff\x58"), 2}, + {TYPED_LITERAL(CharT, "\U00030000\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0003fffd\x58"), 3}, + {TYPED_LITERAL(CharT, "\U0003fffe\x58"), 2}, + {TYPED_LITERAL(CharT, "\U0010ffff\x58"), 2}, + + // Pick "short" and "long" codepoints (\u2000 and \ufe40), then form all permutations of + // 3-codepoint prefixes with the same fixed delimiter as above. This gives us coverage of + // all adjacent pairings (short/short, short/long, long/short, long/long). + {TYPED_LITERAL(CharT, "\u2000\u2000\u2000\x58"), 4}, + {TYPED_LITERAL(CharT, "\u2000\u2000\ufe40\x58"), 5}, + {TYPED_LITERAL(CharT, "\u2000\ufe40\u2000\x58"), 5}, + {TYPED_LITERAL(CharT, "\ufe40\u2000\u2000\x58"), 5}, + {TYPED_LITERAL(CharT, "\u2000\ufe40\ufe40\x58"), 6}, + {TYPED_LITERAL(CharT, "\ufe40\u2000\ufe40\x58"), 6}, + {TYPED_LITERAL(CharT, "\ufe40\ufe40\u2000\x58"), 6}, + {TYPED_LITERAL(CharT, "\ufe40\ufe40\ufe40\x58"), 7}, + }; + + for (const auto& test : test_cases) { + basic_string_view sv{test.str}; + sv = sv.substr(0, sv.size() - 1); + assert(format(TYPED_LITERAL(CharT, "{:.{}}"), test.str, test.width - 1) == sv); } } void run_tests() { test_multibyte_format_strings(); test_parse_align(); + test_width_estimation(); + test_width_estimation(); } int main() {