From 0fec1662f14ed4954966b6296a0317a69d51a5f3 Mon Sep 17 00:00:00 2001 From: statementreply Date: Sun, 11 Apr 2021 16:56:17 +0800 Subject: [PATCH 1/8] Assume format strings are always UTF-8 when encoding charset is UTF-8 --- stl/inc/format | 152 ++++++++++++++++++++++++++++++------------------- 1 file changed, 95 insertions(+), 57 deletions(-) diff --git a/stl/inc/format b/stl/inc/format index fe47c696b0..bf10804b62 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -417,55 +417,79 @@ _NODISCARD constexpr const _CharT* _Parse_arg_id( throw format_error("Invalid format string."); } +inline constexpr bool _Execution_charset_is_utf8 = [] { +#pragma warning(push) +#pragma warning(disable : 4309) // 'initializing' : truncation of constant value +#pragma warning(disable : 4566) // character represented by universal-character-name '\u4E00' cannot be represented in + // the current code page +#pragma warning(disable : 6201) // Index '2' is out of valid index range '0' to '1' for possibly stack allocated buffer + // '_Test_char' +#pragma warning(disable : 6239) // ( && ) always evaluates to the result of . + // Did you intend to use the bitwise-and operator? + constexpr char _Test_char[] = "\u4e00"; + return sizeof(_Test_char) == 4 && _Test_char[0] == '\xe4' && _Test_char[1] == '\xb8' && _Test_char[2] == '\x80'; +#pragma warning(pop) +}(); + +_NODISCARD inline int _Utf8_code_units_in_next_character(const char* const _First, const char* const _Last) noexcept { + // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, _Last), + // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. + const auto _Ch = static_cast(*_First); + if (_Ch < 0b1000'0000u) { + return 1; + } + + const auto _Len = static_cast(_Last - _First); + + if (_Ch < 0b1110'0000u) { + // check for non-lead byte or partial 2-byte encoded character + return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1; + } + + if (_Ch < 0b1111'0000u) { + // check for partial 3-byte encoded character + return (_Len >= 3) ? 3 : -1; + } + + // check for partial 4-byte encoded character + return (_Len >= 4) ? 4 : -1; +} + _NODISCARD inline int _Code_units_in_next_character(const char* _First, const char* _Last, const _Cvtvec& _Cvt) { // Returns a count of the number of code units that compose the first encoded character in // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or // *_First is not a valid lead byte. _STL_INTERNAL_CHECK(_First < _Last); - switch (_Cvt._Mbcurmax) { - default: - _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page"); - [[fallthrough]]; - case 1: - return 1; // all characters have only one code unit - - case 2: - { - wchar_t _Wide; - mbstate_t _St{}; - const auto _Len = static_cast(_Last - _First); - const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt); - if (_Result > 0) { - return _Result; - } else if (_Result < 0) { // invalid or incomplete encoded character - return -1; - } else { // next code unit is '\0' - return 1; - } - } - - case 4: // Assume UTF-8 (as does _Mbrtowc) - { - const auto _Ch = static_cast(*_First); - if (_Ch < 0b1000'0000u) { - return 1; - } - - const auto _Len = static_cast(_Last - _First); - - if (_Ch < 0b1110'0000u) { - // check for non-lead byte or partial 2-byte encoded character - return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1; + if constexpr (_Execution_charset_is_utf8) { + return _Utf8_code_units_in_next_character(_First, _Last); + } else { + switch (_Cvt._Mbcurmax) { + default: + _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page"); + [[fallthrough]]; + case 1: + return 1; // all characters have only one code unit + + case 2: + { + wchar_t _Wide; + mbstate_t _St{}; + const auto _Len = static_cast(_Last - _First); + const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt); + if (_Result > 0) { + return _Result; + } else if (_Result < 0) { // invalid or incomplete encoded character + return -1; + } else { // next code unit is '\0' + return 1; + } } - if (_Ch < 0b1111'0000u) { - // check for partial 3-byte encoded character - return (_Len >= 3) ? 3 : -1; + case 4: // Assume UTF-8 (as does _Mbrtowc) + { + return _Utf8_code_units_in_next_character(_First, _Last); } - - // check for partial 4-byte encoded character - return (_Len >= 4) ? 4 : -1; } } } @@ -751,20 +775,24 @@ template const _CharT* _Find_encoded(const _CharT* _First, const _CharT* _Last, const _CharT _Val, const _Cvtvec& _Cvt) { // Returns the first occurrence of _Val as an encoded character (and not, for example, as a // continuation byte) in [_First, _Last). - if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) { - // As above and in _Mbrtowc, assume 4-byte encodings are UTF-8 + if constexpr (_Execution_charset_is_utf8) { return _Find_unchecked(_First, _Last, _Val); - } + } else { + if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) { + // As above and in _Mbrtowc, assume 4-byte encodings are UTF-8 + return _Find_unchecked(_First, _Last, _Val); + } - while (_First != _Last && *_First != _Val) { - const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt); - if (_Units < 0) { - throw format_error("Invalid encoded character in format string."); + while (_First != _Last && *_First != _Val) { + const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt); + if (_Units < 0) { + throw format_error("Invalid encoded character in format string."); + } + _First += _Units; } - _First += _Units; - } - return _First; + return _First; + } } template _HandlerT> @@ -2163,15 +2191,9 @@ _NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept { return 1; } -_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) { +_NODISCARD inline int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept { // Return an estimate for the width of the character composed of _Units code units, // whose first code unit is denoted by _Ptr. - if (_Cvt._Mbcurmax != 4) { - // not a Unicode encoding; estimate width == number of code units - return _Units; - } - - // assume UTF-8 auto _Ch = static_cast(*_Ptr); switch (_Units) { default: @@ -2197,6 +2219,22 @@ _NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Uni return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch); } +_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) { + // Return an estimate for the width of the character composed of _Units code units, + // whose first code unit is denoted by _Ptr. + if constexpr (_Execution_charset_is_utf8) { + return _Estimate_utf8_character_width(_Ptr, _Units); + } else { + if (_Cvt._Mbcurmax != 4) { + // not a Unicode encoding; estimate width == number of code units + return _Units; + } + + // assume UTF-8 + return _Estimate_utf8_character_width(_Ptr, _Units); + } +} + _NODISCARD inline int _Estimate_character_width(const wchar_t* _Ptr, const int _Units, const _Cvtvec&) { // Return an estimate for the width of the character composed of _Units code units, // whose first code unit is denoted by _Ptr. From 97a1ee08515af6bec5cf549ccb89bfc631872f8e Mon Sep 17 00:00:00 2001 From: statementreply Date: Sun, 11 Apr 2021 17:03:36 +0800 Subject: [PATCH 2/8] Add `/utf-8` tests --- tests/std/include/test_format_support.hpp | 126 ++++++++++++++++++ .../P0645R10_text_formatting_utf8/env.lst | 6 + .../P0645R10_text_formatting_utf8/test.cpp | 64 +++++++++ 3 files changed, 196 insertions(+) create mode 100644 tests/std/include/test_format_support.hpp create mode 100644 tests/std/tests/P0645R10_text_formatting_utf8/env.lst create mode 100644 tests/std/tests/P0645R10_text_formatting_utf8/test.cpp diff --git a/tests/std/include/test_format_support.hpp b/tests/std/include/test_format_support.hpp new file mode 100644 index 0000000000..ca569f23d3 --- /dev/null +++ b/tests/std/include/test_format_support.hpp @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include +#include +#include +#include + +// copied from the string_view tests +template +struct choose_literal; // not defined + +template <> +struct choose_literal { + static constexpr const char* choose(const char* s, const wchar_t*) { + return s; + } +}; + +template <> +struct choose_literal { + static constexpr const wchar_t* choose(const char*, const wchar_t* s) { + return s; + } +}; + +#define TYPED_LITERAL(CharT, Literal) (choose_literal::choose(Literal, L##Literal)) + +template +struct noop_testing_callbacks { + constexpr void _On_align(std::_Align) {} + constexpr void _On_fill(std::basic_string_view) {} + constexpr void _On_width(unsigned int) {} + constexpr void _On_dynamic_width(std::size_t) {} + constexpr void _On_dynamic_width(std::_Auto_id_tag) {} + constexpr void _On_precision(unsigned int) {} + constexpr void _On_dynamic_precision(std::size_t) {} + constexpr void _On_dynamic_precision(std::_Auto_id_tag) {} + constexpr void _On_sign(std::_Sign) {} + constexpr void _On_hash() {} + constexpr void _On_zero() {} + constexpr void _On_localized() {} + constexpr void _On_type(CharT) {} +}; + +template +struct testing_callbacks { + std::_Align expected_alignment = std::_Align::_None; + std::_Sign expected_sign = std::_Sign::_None; + std::basic_string_view expected_fill; + int expected_width = -1; + std::size_t expected_dynamic_width = static_cast(-1); + bool expected_auto_dynamic_width = false; + int expected_precision = -1; + std::size_t expected_dynamic_precision = static_cast(-1); + bool expected_auto_dynamic_precision = false; + bool expected_hash = false; + bool expected_zero = false; + bool expected_localized = false; + CharT expected_type = '\0'; + + constexpr void _On_align(std::_Align aln) { + assert(aln == expected_alignment); + } + constexpr void _On_fill(std::basic_string_view str_view) { + assert(str_view == expected_fill); + } + constexpr void _On_width(int width) { + assert(width == expected_width); + } + constexpr void _On_dynamic_width(std::size_t id) { + assert(id == expected_dynamic_width); + } + constexpr void _On_dynamic_width(std::_Auto_id_tag) { + assert(expected_auto_dynamic_width); + } + constexpr void _On_precision(int pre) { + assert(pre == expected_precision); + } + constexpr void _On_dynamic_precision(std::size_t id) { + assert(id == expected_dynamic_precision); + } + constexpr void _On_dynamic_precision(std::_Auto_id_tag) { + assert(expected_auto_dynamic_precision); + } + constexpr void _On_sign(std::_Sign sgn) { + assert(sgn == expected_sign); + } + constexpr void _On_hash() { + assert(expected_hash); + } + constexpr void _On_zero() { + assert(expected_zero); + } + constexpr void _On_localized() { + assert(expected_localized); + } + constexpr void _On_type(CharT type) { + assert(type == expected_type); + } +}; +template +testing_callbacks(std::_Align, std::basic_string_view) -> testing_callbacks; + +struct testing_arg_id_callbacks { + constexpr void _On_auto_id() {} + constexpr void _On_manual_id(std::size_t) {} +}; + +template +void test_parse_helper(const CharT* (*func)(const CharT*, const CharT*, callback_type&&), + std::basic_string_view view, bool err_expected = false, + typename std::basic_string_view::size_type expected_end_position = std::basic_string_view::npos, + callback_type&& callbacks = {}) { + try { + auto end = func(view.data(), view.data() + view.size(), std::move(callbacks)); + if (expected_end_position != std::basic_string_view::npos) { + assert(end == view.data() + expected_end_position); + } + assert(!err_expected); + } catch (const std::format_error&) { + assert(err_expected); + } +} diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/env.lst b/tests/std/tests/P0645R10_text_formatting_utf8/env.lst new file mode 100644 index 0000000000..42da0946d2 --- /dev/null +++ b/tests/std/tests/P0645R10_text_formatting_utf8/env.lst @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +RUNALL_INCLUDE ..\concepts_matrix.lst +RUNALL_CROSSLIST +PM_CL="/utf-8" diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp new file mode 100644 index 0000000000..ef45b409eb --- /dev/null +++ b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include + +#include "test_format_support.hpp" + +using namespace std; + +void test_multibyte_format_strings() { + { + // Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL) + assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32"); + + assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); + assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv); + assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv); + } + + { + try { + (void) format("{:\x9f\x8f\x88<10}"sv, 42); // Bad fill character encoding: missing lead byte before \x9f + assert(false); + } catch (const format_error&) { + } + } +} + +void test_parse_align() { + auto parse_align_fn = _Parse_align>; + + { + // "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL + test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88X"sv, false, 5, + {.expected_alignment = _Align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv}); + test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5, + {.expected_alignment = _Align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv}); + } +} + +void run_tests() { + test_multibyte_format_strings(); + test_parse_align(); +} + +int main() { + run_tests(); + + setlocale(LC_ALL, ".1252"); + run_tests(); + + setlocale(LC_ALL, ".932"); + run_tests(); + +#ifndef MSVC_INTERNAL_TESTING // TRANSITION, Windows on Contest VMs understand ".UTF-8" codepage + setlocale(LC_ALL, ".UTF-8"); + run_tests(); +#endif +} From 761932afb529e7ae938bb84e03547d0bb9eb9a12 Mon Sep 17 00:00:00 2001 From: statementreply Date: Sun, 11 Apr 2021 17:07:19 +0800 Subject: [PATCH 3/8] Run Shift-JIS tests with `/execution-charset:.932` --- .../test.cpp | 29 ---- .../env.lst | 27 ++++ .../test.cpp | 65 +++++++++ .../P0645R10_text_formatting_parsing/test.cpp | 129 +----------------- tests/std/tests/concepts_matrix.lst | 2 + 5 files changed, 96 insertions(+), 156 deletions(-) create mode 100644 tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst create mode 100644 tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp diff --git a/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp b/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp index dc86a49658..21fda060b4 100644 --- a/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp @@ -973,35 +973,6 @@ void test_size() { } void test_multibyte_format_strings() { - { - setlocale(LC_ALL, ".932"); - const auto s = - "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576) - assert(format(s) == s); - - assert(format("{:.2}", s) == "\x93\xfa"sv); - assert(format("{:4.2}", s) == "\x93\xfa "sv); - - assert(format("{:<4.2}", s) == "\x93\xfa "sv); - assert(format("{:^4.2}", s) == " \x93\xfa "sv); - assert(format("{:>4.2}", s) == " \x93\xfa"sv); - - assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv); - assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv); - assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv); - - assert(format("{:.3}", s) == "\x93\xfa"sv); - assert(format("{:4.3}", s) == "\x93\xfa "sv); - - assert(format("{:<4.3}", s) == "\x93\xfa "sv); - assert(format("{:^4.3}", s) == " \x93\xfa "sv); - assert(format("{:>4.3}", s) == " \x93\xfa"sv); - - assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv); - assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv); - assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv); - } - #ifndef MSVC_INTERNAL_TESTING // TRANSITION, Windows on Contest VMs understand ".UTF-8" codepage { setlocale(LC_ALL, ".UTF-8"); diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst new file mode 100644 index 0000000000..9aa5f2a5cd --- /dev/null +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/env.lst @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# This is `concepts_matrix.lst` with `/execution-charset:.932` added. +# clang is excluded since it doesn't support non-UTF-8 execution charsets. + +RUNALL_INCLUDE ..\prefix.lst +RUNALL_CROSSLIST +PM_CL="/w14640 /Zc:threadSafeInit- /EHsc /std:c++latest /execution-charset:.932" +RUNALL_CROSSLIST +PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:noexceptTypes-" +PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" +PM_CL="/MD /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:char8_t- /Zc:preprocessor" +PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /Zc:wchar_t-" +PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" +PM_CL="/MDd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /fp:except /Zc:preprocessor" +PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=0 /permissive-" +PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /analyze:only /analyze:autolog-" +PM_CL="/MT /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" +PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=0 /permissive- /fp:strict" +PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=1 /permissive-" +PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive" +PM_CL="/MTd /D_ITERATOR_DEBUG_LEVEL=2 /permissive- /analyze:only /analyze:autolog-" +PM_CL="/permissive- /Za /MD" +PM_CL="/permissive- /Za /MDd" +# PM_CL="/permissive- /BE /c /MD" +# PM_CL="/permissive- /BE /c /MTd" diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp new file mode 100644 index 0000000000..a24c1a6e02 --- /dev/null +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include + +#include "test_format_support.hpp" + +using namespace std; + +void test_multibyte_format_strings() { + { + setlocale(LC_ALL, ".932"); + const auto s = + "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576) + assert(format(s) == s); + + assert(format("{:.2}", s) == "\x93\xfa"sv); + assert(format("{:4.2}", s) == "\x93\xfa "sv); + + assert(format("{:<4.2}", s) == "\x93\xfa "sv); + assert(format("{:^4.2}", s) == " \x93\xfa "sv); + assert(format("{:>4.2}", s) == " \x93\xfa"sv); + + assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv); + assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv); + assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv); + + assert(format("{:.3}", s) == "\x93\xfa"sv); + assert(format("{:4.3}", s) == "\x93\xfa "sv); + + assert(format("{:<4.3}", s) == "\x93\xfa "sv); + assert(format("{:^4.3}", s) == " \x93\xfa "sv); + assert(format("{:>4.3}", s) == " \x93\xfa"sv); + + assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv); + assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv); + assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv); + } + + setlocale(LC_ALL, nullptr); +} + +void test_parse_align() { + auto parse_align_fn = _Parse_align>; + + { + setlocale(LC_ALL, ".932"); + test_parse_helper(parse_align_fn, "\x93\xfaX"sv, false, 3, + {.expected_alignment = _Align::_Right, .expected_fill = "\x96\x7b"sv}); + test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3, + {.expected_alignment = _Align::_Center, .expected_fill = "\x92\x6e"sv}); + } + + setlocale(LC_ALL, nullptr); +} + +int main() { + test_multibyte_format_strings(); + test_parse_align(); +} diff --git a/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp b/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp index 9a7f62083d..966f04a1a8 100644 --- a/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp @@ -8,124 +8,9 @@ #include #include -using namespace std; - -// copied from the string_view tests -template -struct choose_literal; // not defined - -template <> -struct choose_literal { - static constexpr const char* choose(const char* s, const wchar_t*) { - return s; - } -}; - -template <> -struct choose_literal { - static constexpr const wchar_t* choose(const char*, const wchar_t* s) { - return s; - } -}; - -#define TYPED_LITERAL(CharT, Literal) (choose_literal::choose(Literal, L##Literal)) +#include "test_format_support.hpp" -template -struct noop_testing_callbacks { - constexpr void _On_align(_Align) {} - constexpr void _On_fill(basic_string_view) {} - constexpr void _On_width(unsigned int) {} - constexpr void _On_dynamic_width(size_t) {} - constexpr void _On_dynamic_width(_Auto_id_tag) {} - constexpr void _On_precision(unsigned int) {} - constexpr void _On_dynamic_precision(size_t) {} - constexpr void _On_dynamic_precision(_Auto_id_tag) {} - constexpr void _On_sign(_Sign) {} - constexpr void _On_hash() {} - constexpr void _On_zero() {} - constexpr void _On_localized() {} - constexpr void _On_type(CharT) {} -}; - -template -struct testing_callbacks { - _Align expected_alignment = _Align::_None; - _Sign expected_sign = _Sign::_None; - basic_string_view expected_fill; - int expected_width = -1; - size_t expected_dynamic_width = static_cast(-1); - bool expected_auto_dynamic_width = false; - int expected_precision = -1; - size_t expected_dynamic_precision = static_cast(-1); - bool expected_auto_dynamic_precision = false; - bool expected_hash = false; - bool expected_zero = false; - bool expected_localized = false; - CharT expected_type = '\0'; - - constexpr void _On_align(_Align aln) { - assert(aln == expected_alignment); - } - constexpr void _On_fill(basic_string_view str_view) { - assert(str_view == expected_fill); - } - constexpr void _On_width(int width) { - assert(width == expected_width); - } - constexpr void _On_dynamic_width(size_t id) { - assert(id == expected_dynamic_width); - } - constexpr void _On_dynamic_width(_Auto_id_tag) { - assert(expected_auto_dynamic_width); - } - constexpr void _On_precision(int pre) { - assert(pre == expected_precision); - } - constexpr void _On_dynamic_precision(size_t id) { - assert(id == expected_dynamic_precision); - } - constexpr void _On_dynamic_precision(_Auto_id_tag) { - assert(expected_auto_dynamic_precision); - } - constexpr void _On_sign(_Sign sgn) { - assert(sgn == expected_sign); - } - constexpr void _On_hash() { - assert(expected_hash); - } - constexpr void _On_zero() { - assert(expected_zero); - } - constexpr void _On_localized() { - assert(expected_localized); - } - constexpr void _On_type(CharT type) { - assert(type == expected_type); - } -}; -template -testing_callbacks(_Align, basic_string_view) -> testing_callbacks; - -struct testing_arg_id_callbacks { - constexpr void _On_auto_id() {} - constexpr void _On_manual_id(size_t) {} -}; - -template -void test_parse_helper(const CharT* (*func)(const CharT*, const CharT*, callback_type&&), basic_string_view view, - bool err_expected = false, - typename basic_string_view::size_type expected_end_position = basic_string_view::npos, - callback_type&& callbacks = {}) { - try { - auto end = func(view.data(), view.data() + view.size(), move(callbacks)); - if (expected_end_position != basic_string_view::npos) { - assert(end == view.data() + expected_end_position); - } - assert(!err_expected); - } catch (const format_error&) { - assert(err_expected); - } -} +using namespace std; template bool test_parse_align() { @@ -162,16 +47,6 @@ bool test_parse_align() { } } else { // test multibyte fill characters - { - setlocale(LC_ALL, ".932"); - test_parse_helper(parse_align_fn, "\x93\xfaX"sv, false, 3, - {.expected_alignment = _Align::_Right, .expected_fill = "\x96\x7b"sv}); - test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3, - {.expected_alignment = _Align::_Center, .expected_fill = "\x92\x6e"sv}); - } - #ifndef MSVC_INTERNAL_TESTING // TRANSITION, Windows on Contest VMs understand ".UTF-8" codepage { setlocale(LC_ALL, ".UTF-8"); diff --git a/tests/std/tests/concepts_matrix.lst b/tests/std/tests/concepts_matrix.lst index 5810e10f46..d35c3015ad 100644 --- a/tests/std/tests/concepts_matrix.lst +++ b/tests/std/tests/concepts_matrix.lst @@ -1,6 +1,8 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# When updating this file, also update tests\P0645R10_text_formatting_legacy_text_encoding\env.lst to match + RUNALL_INCLUDE .\prefix.lst RUNALL_CROSSLIST PM_CL="/w14640 /Zc:threadSafeInit- /EHsc /std:c++latest" From 903ba79196cee08b8a69f7ec690a2b11e5bc01b1 Mon Sep 17 00:00:00 2001 From: statementreply Date: Tue, 13 Apr 2021 00:58:52 +0800 Subject: [PATCH 4/8] Apply code review feedback --- stl/inc/format | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/stl/inc/format b/stl/inc/format index bf10804b62..f9dffd211d 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -417,7 +417,7 @@ _NODISCARD constexpr const _CharT* _Parse_arg_id( throw format_error("Invalid format string."); } -inline constexpr bool _Execution_charset_is_utf8 = [] { +_NODISCARD constexpr bool _Is_execution_charset_utf8() { #pragma warning(push) #pragma warning(disable : 4309) // 'initializing' : truncation of constant value #pragma warning(disable : 4566) // character represented by universal-character-name '\u4E00' cannot be represented in @@ -429,9 +429,12 @@ inline constexpr bool _Execution_charset_is_utf8 = [] { constexpr char _Test_char[] = "\u4e00"; return sizeof(_Test_char) == 4 && _Test_char[0] == '\xe4' && _Test_char[1] == '\xb8' && _Test_char[2] == '\x80'; #pragma warning(pop) -}(); +} + +inline constexpr bool _Is_execution_charset_utf8_v = _Is_execution_charset_utf8(); -_NODISCARD inline int _Utf8_code_units_in_next_character(const char* const _First, const char* const _Last) noexcept { +_NODISCARD inline constexpr int _Utf8_code_units_in_next_character( + const char* const _First, const char* const _Last) noexcept { // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, _Last), // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. const auto _Ch = static_cast(*_First); @@ -455,13 +458,30 @@ _NODISCARD inline int _Utf8_code_units_in_next_character(const char* const _Firs return (_Len >= 4) ? 4 : -1; } +_NODISCARD inline int _Double_byte_encoding_code_units_in_next_character( + const char* const _First, const char* const _Last, const _Cvtvec& _Cvt) { + // Returns a count of the number of code units that compose the first encoded character in [_First, _Last), + // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. + wchar_t _Wide; + mbstate_t _St{}; + const auto _Len = static_cast(_Last - _First); + const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt); + if (_Result > 0) { + return _Result; + } else if (_Result < 0) { // invalid or incomplete encoded character + return -1; + } else { // next code unit is '\0' + return 1; + } +} + _NODISCARD inline int _Code_units_in_next_character(const char* _First, const char* _Last, const _Cvtvec& _Cvt) { // Returns a count of the number of code units that compose the first encoded character in // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or // *_First is not a valid lead byte. _STL_INTERNAL_CHECK(_First < _Last); - if constexpr (_Execution_charset_is_utf8) { + if constexpr (_Is_execution_charset_utf8_v) { return _Utf8_code_units_in_next_character(_First, _Last); } else { switch (_Cvt._Mbcurmax) { @@ -472,24 +492,10 @@ _NODISCARD inline int _Code_units_in_next_character(const char* _First, const ch return 1; // all characters have only one code unit case 2: - { - wchar_t _Wide; - mbstate_t _St{}; - const auto _Len = static_cast(_Last - _First); - const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt); - if (_Result > 0) { - return _Result; - } else if (_Result < 0) { // invalid or incomplete encoded character - return -1; - } else { // next code unit is '\0' - return 1; - } - } + return _Double_byte_encoding_code_units_in_next_character(_First, _Last, _Cvt); case 4: // Assume UTF-8 (as does _Mbrtowc) - { - return _Utf8_code_units_in_next_character(_First, _Last); - } + return _Utf8_code_units_in_next_character(_First, _Last); } } } @@ -775,7 +781,7 @@ template const _CharT* _Find_encoded(const _CharT* _First, const _CharT* _Last, const _CharT _Val, const _Cvtvec& _Cvt) { // Returns the first occurrence of _Val as an encoded character (and not, for example, as a // continuation byte) in [_First, _Last). - if constexpr (_Execution_charset_is_utf8) { + if constexpr (_Is_execution_charset_utf8_v) { return _Find_unchecked(_First, _Last, _Val); } else { if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) { @@ -2222,7 +2228,7 @@ _NODISCARD inline int _Estimate_utf8_character_width(const char* const _Ptr, con _NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) { // Return an estimate for the width of the character composed of _Units code units, // whose first code unit is denoted by _Ptr. - if constexpr (_Execution_charset_is_utf8) { + if constexpr (_Is_execution_charset_utf8_v) { return _Estimate_utf8_character_width(_Ptr, _Units); } else { if (_Cvt._Mbcurmax != 4) { From 8f20290f327ce4c6f5453595ad7f50d15a76c042 Mon Sep 17 00:00:00 2001 From: statementreply Date: Tue, 13 Apr 2021 01:06:36 +0800 Subject: [PATCH 5/8] constexpr function implies inline --- stl/inc/format | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/format b/stl/inc/format index f9dffd211d..8f576a9792 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -433,7 +433,7 @@ _NODISCARD constexpr bool _Is_execution_charset_utf8() { inline constexpr bool _Is_execution_charset_utf8_v = _Is_execution_charset_utf8(); -_NODISCARD inline constexpr int _Utf8_code_units_in_next_character( +_NODISCARD constexpr int _Utf8_code_units_in_next_character( const char* const _First, const char* const _Last) noexcept { // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, _Last), // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte. From b5ddb4d81380b39448c741ecc95709f745b3531e Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 12 Apr 2021 18:22:40 -0700 Subject: [PATCH 6/8] Code review feedback. --- tests/std/include/test_format_support.hpp | 1 + tests/std/test.lst | 2 ++ .../P0645R10_text_formatting_legacy_text_encoding/test.cpp | 2 +- tests/std/tests/P0645R10_text_formatting_utf8/test.cpp | 2 +- 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/std/include/test_format_support.hpp b/tests/std/include/test_format_support.hpp index ca569f23d3..d3bf9b92ac 100644 --- a/tests/std/include/test_format_support.hpp +++ b/tests/std/include/test_format_support.hpp @@ -7,6 +7,7 @@ #include #include #include +#include // copied from the string_view tests template diff --git a/tests/std/test.lst b/tests/std/test.lst index 6db71e0154..cba3615d38 100644 --- a/tests/std/test.lst +++ b/tests/std/test.lst @@ -263,8 +263,10 @@ tests\P0645R10_text_formatting_args tests\P0645R10_text_formatting_custom_formatting tests\P0645R10_text_formatting_death tests\P0645R10_text_formatting_formatting +tests\P0645R10_text_formatting_legacy_text_encoding tests\P0645R10_text_formatting_parse_contexts tests\P0645R10_text_formatting_parsing +tests\P0645R10_text_formatting_utf8 tests\P0660R10_jthread_and_cv_any tests\P0660R10_stop_token tests\P0660R10_stop_token_death diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp index a24c1a6e02..36a1c51130 100644 --- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include +#include #include -#include #include #include "test_format_support.hpp" diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp index ef45b409eb..c4cc5c6ebe 100644 --- a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include +#include #include -#include #include #include "test_format_support.hpp" From 33a901fcb25ad6b38b5f7013bfef938ddb1dcd5f Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 12 Apr 2021 18:24:21 -0700 Subject: [PATCH 7/8] Mitigate merge conflicts: Use _THROW. --- stl/inc/format | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/format b/stl/inc/format index 8f576a9792..8b4eda68de 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -792,7 +792,7 @@ const _CharT* _Find_encoded(const _CharT* _First, const _CharT* _Last, const _Ch while (_First != _Last && *_First != _Val) { const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt); if (_Units < 0) { - throw format_error("Invalid encoded character in format string."); + _THROW(format_error("Invalid encoded character in format string.")); } _First += _Units; } From 926444b88606fcb9ab25a5cdc095bf683a7f755a Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 12 Apr 2021 18:26:46 -0700 Subject: [PATCH 8/8] Mitigate merge conflicts: Check setlocale(), reset to "C". --- .../test.cpp | 8 ++++---- tests/std/tests/P0645R10_text_formatting_utf8/test.cpp | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp index 36a1c51130..af57f610fc 100644 --- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp @@ -12,7 +12,7 @@ using namespace std; void test_multibyte_format_strings() { { - setlocale(LC_ALL, ".932"); + assert(setlocale(LC_ALL, ".932") != nullptr); const auto s = "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576) assert(format(s) == s); @@ -40,14 +40,14 @@ void test_multibyte_format_strings() { assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv); } - setlocale(LC_ALL, nullptr); + assert(setlocale(LC_ALL, "C") != nullptr); } void test_parse_align() { auto parse_align_fn = _Parse_align>; { - setlocale(LC_ALL, ".932"); + assert(setlocale(LC_ALL, ".932") != nullptr); test_parse_helper(parse_align_fn, "\x93\xfaX"sv, false, 3, @@ -56,7 +56,7 @@ void test_parse_align() { {.expected_alignment = _Align::_Center, .expected_fill = "\x92\x6e"sv}); } - setlocale(LC_ALL, nullptr); + assert(setlocale(LC_ALL, "C") != nullptr); } int main() { diff --git a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp index c4cc5c6ebe..0cf1ca3d9c 100644 --- a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp +++ b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp @@ -51,14 +51,14 @@ void run_tests() { int main() { run_tests(); - setlocale(LC_ALL, ".1252"); + assert(setlocale(LC_ALL, ".1252") != nullptr); run_tests(); - setlocale(LC_ALL, ".932"); + assert(setlocale(LC_ALL, ".932") != nullptr); run_tests(); #ifndef MSVC_INTERNAL_TESTING // TRANSITION, Windows on Contest VMs understand ".UTF-8" codepage - setlocale(LC_ALL, ".UTF-8"); + assert(setlocale(LC_ALL, ".UTF-8") != nullptr); run_tests(); #endif }