<format>: Assume UTF-8 format strings when execution charset is UTF-8 (…

…#1824) * Assume format strings are always UTF-8 when encoding charset is UTF-8 * Add `/utf-8` tests * Run Shift-JIS tests with `/execution-charset:.932` * Apply code review feedback * constexpr function implies inline * Code review feedback. * Mitigate merge conflicts: Use _THROW. * Mitigate merge conflicts: Check setlocale(), reset to "C". Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
microsoft · Apr 13, 2021 · b81d9eb · b81d9eb
1 parent de11c9a
commit b81d9eb
Show file tree

Hide file tree

Showing 10 changed files with 397 additions and 214 deletions.
diff --git a/stl/inc/format b/stl/inc/format
@@ -417,55 +417,85 @@ _NODISCARD constexpr const _CharT* _Parse_arg_id(
     throw format_error("Invalid format string.");
 }
 
+_NODISCARD constexpr bool _Is_execution_charset_utf8() {
+#pragma warning(push)
+#pragma warning(disable : 4309) // 'initializing' : truncation of constant value
+#pragma warning(disable : 4566) // character represented by universal-character-name '\u4E00' cannot be represented in
+                                // the current code page
+#pragma warning(disable : 6201) // Index '2' is out of valid index range '0' to '1' for possibly stack allocated buffer
+                                // '_Test_char'
+#pragma warning(disable : 6239) // (<non-zero constant> && <expression>) always evaluates to the result of <expression>.
+                                // Did you intend to use the bitwise-and operator?
+    constexpr char _Test_char[] = "\u4e00"; // probe: UTF-8 encodes U+4E00 as bytes E4 B8 80, which the return checks
+    return sizeof(_Test_char) == 4 && _Test_char[0] == '\xe4' && _Test_char[1] == '\xb8' && _Test_char[2] == '\x80';
+#pragma warning(pop)
+}
+
+inline constexpr bool _Is_execution_charset_utf8_v = _Is_execution_charset_utf8(); // compile-time constant, usable in if constexpr
+
+_NODISCARD constexpr int _Utf8_code_units_in_next_character(
+    const char* const _First, const char* const _Last) noexcept {
+    // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, _Last),
+    // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is a continuation byte (0x80 to 0xBF). Only *_First is examined; the code units after it are counted but never validated.
+    const auto _Ch = static_cast<unsigned char>(*_First);
+    if (_Ch < 0b1000'0000u) {
+        return 1;
+    }
+
+    const auto _Len = static_cast<size_t>(_Last - _First);
+
+    if (_Ch < 0b1110'0000u) {
+        // check for non-lead byte or partial 2-byte encoded character
+        return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1;
+    }
+
+    if (_Ch < 0b1111'0000u) {
+        // check for partial 3-byte encoded character
+        return (_Len >= 3) ? 3 : -1;
+    }
+
+    // check for partial 4-byte encoded character (every lead byte from 0xF0 up, including 0xF8 to 0xFF, reaches this point)
+    return (_Len >= 4) ? 4 : -1;
+}
+
+_NODISCARD inline int _Double_byte_encoding_code_units_in_next_character(
+    const char* const _First, const char* const _Last, const _Cvtvec& _Cvt) {
+    // Returns a count of the number of code units that compose the first encoded character in [_First, _Last),
+    // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte.
+    wchar_t _Wide; // receives the converted character; its value is not read afterwards
+    mbstate_t _St{}; // fresh conversion state on every call
+    const auto _Len   = static_cast<size_t>(_Last - _First);
+    const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt);
+    if (_Result > 0) {
+        return _Result;
+    } else if (_Result < 0) { // invalid or incomplete encoded character
+        return -1;
+    } else { // next code unit is '\0'
+        return 1;
+    }
+}
+
 _NODISCARD inline int _Code_units_in_next_character(const char* _First, const char* _Last, const _Cvtvec& _Cvt) {
     // Returns a count of the number of code units that compose the first encoded character in
     // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
     // *_First is not a valid lead byte.
     _STL_INTERNAL_CHECK(_First < _Last);
 
-    switch (_Cvt._Mbcurmax) {
-    default:
-        _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page");
-        [[fallthrough]];
-    case 1:
-        return 1; // all characters have only one code unit
-
-    case 2:
-        {
-            wchar_t _Wide;
-            mbstate_t _St{};
-            const auto _Len   = static_cast<size_t>(_Last - _First);
-            const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt);
-            if (_Result > 0) {
-                return _Result;
-            } else if (_Result < 0) { // invalid or incomplete encoded character
-                return -1;
-            } else { // next code unit is '\0'
-                return 1;
-            }
-        }
-
-    case 4: // Assume UTF-8 (as does _Mbrtowc)
-        {
-            const auto _Ch = static_cast<unsigned char>(*_First);
-            if (_Ch < 0b1000'0000u) {
-                return 1;
-            }
-
-            const auto _Len = static_cast<size_t>(_Last - _First);
+    if constexpr (_Is_execution_charset_utf8_v) {
+        return _Utf8_code_units_in_next_character(_First, _Last);
+    } else {
+        switch (_Cvt._Mbcurmax) {
+        default:
+            _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page");
+            [[fallthrough]];
+        case 1:
+            return 1; // all characters have only one code unit
 
-            if (_Ch < 0b1110'0000u) {
-                // check for non-lead byte or partial 2-byte encoded character
-                return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1;
-            }
+        case 2:
+            return _Double_byte_encoding_code_units_in_next_character(_First, _Last, _Cvt);
 
-            if (_Ch < 0b1111'0000u) {
-                // check for partial 3-byte encoded character
-                return (_Len >= 3) ? 3 : -1;
-            }
-
-            // check for partial 4-byte encoded character
-            return (_Len >= 4) ? 4 : -1;
+        case 4: // Assume UTF-8 (as does _Mbrtowc)
+            return _Utf8_code_units_in_next_character(_First, _Last);
         }
     }
 }
@@ -751,20 +781,24 @@ template <class _CharT>
 const _CharT* _Find_encoded(const _CharT* _First, const _CharT* _Last, const _CharT _Val, const _Cvtvec& _Cvt) {
     // Returns the first occurrence of _Val as an encoded character (and not, for example, as a
     // continuation byte) in [_First, _Last).
-    if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) {
-        // As above and in _Mbrtowc, assume 4-byte encodings are UTF-8
+    if constexpr (_Is_execution_charset_utf8_v) {
         return _Find_unchecked(_First, _Last, _Val);
-    }
+    } else {
+        if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) {
+            // As above and in _Mbrtowc, assume 4-byte encodings are UTF-8
+            return _Find_unchecked(_First, _Last, _Val);
+        }
 
-    while (_First != _Last && *_First != _Val) {
-        const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt);
-        if (_Units < 0) {
-            throw format_error("Invalid encoded character in format string.");
+        while (_First != _Last && *_First != _Val) {
+            const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt);
+            if (_Units < 0) {
+                _THROW(format_error("Invalid encoded character in format string."));
+            }
+            _First += _Units;
         }
-        _First += _Units;
-    }
 
-    return _First;
+        return _First;
+    }
 }
 
 template <class _CharT, _Parse_replacement_field_callbacks<_CharT> _HandlerT>
@@ -2163,15 +2197,9 @@ _NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
     return 1;
 }
 
-_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) {
+_NODISCARD inline int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept {
     // Return an estimate for the width of the character composed of _Units code units,
     // whose first code unit is denoted by _Ptr.
-    if (_Cvt._Mbcurmax != 4) {
-        // not a Unicode encoding; estimate width == number of code units
-        return _Units;
-    }
-
-    // assume UTF-8
     auto _Ch = static_cast<char32_t>(*_Ptr);
     switch (_Units) {
     default:
@@ -2197,6 +2225,22 @@ _NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Uni
     return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
 }
 
+_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) {
+    // Return an estimate for the width of the character composed of _Units code units,
+    // whose first code unit is denoted by _Ptr. _Cvt is consulted only when the execution charset is not UTF-8.
+    if constexpr (_Is_execution_charset_utf8_v) {
+        return _Estimate_utf8_character_width(_Ptr, _Units);
+    } else {
+        if (_Cvt._Mbcurmax != 4) {
+            // not a Unicode encoding; estimate width == number of code units
+            return _Units;
+        }
+
+        // assume UTF-8
+        return _Estimate_utf8_character_width(_Ptr, _Units);
+    }
+}
+
 _NODISCARD inline int _Estimate_character_width(const wchar_t* _Ptr, const int _Units, const _Cvtvec&) {
     // Return an estimate for the width of the character composed of _Units code units,
     // whose first code unit is denoted by _Ptr.

diff --git a/tests/std/include/test_format_support.hpp b/tests/std/include/test_format_support.hpp
@@ -0,0 +1,127 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <format>
+#include <string_view>
+#include <utility>
+
+// copied from the string_view tests: selects the narrow or the wide spelling of a literal according to CharT
+template <typename CharT>
+struct choose_literal; // not defined
+
+template <>
+struct choose_literal<char> {
+    static constexpr const char* choose(const char* s, const wchar_t*) {
+        return s;
+    }
+};
+
+template <>
+struct choose_literal<wchar_t> {
+    static constexpr const wchar_t* choose(const char*, const wchar_t* s) {
+        return s;
+    }
+};
+
+#define TYPED_LITERAL(CharT, Literal) (choose_literal<CharT>::choose(Literal, L##Literal))
+
+template <typename CharT>
+struct noop_testing_callbacks { // callback set in which every _On_* handler has an empty body
+    constexpr void _On_align(std::_Align) {}
+    constexpr void _On_fill(std::basic_string_view<CharT>) {}
+    constexpr void _On_width(unsigned int) {}
+    constexpr void _On_dynamic_width(std::size_t) {}
+    constexpr void _On_dynamic_width(std::_Auto_id_tag) {}
+    constexpr void _On_precision(unsigned int) {}
+    constexpr void _On_dynamic_precision(std::size_t) {}
+    constexpr void _On_dynamic_precision(std::_Auto_id_tag) {}
+    constexpr void _On_sign(std::_Sign) {}
+    constexpr void _On_hash() {}
+    constexpr void _On_zero() {}
+    constexpr void _On_localized() {}
+    constexpr void _On_type(CharT) {}
+};
+
+template <typename CharT>
+struct testing_callbacks { // each _On_* handler asserts that the reported value matches the corresponding expected_* member
+    std::_Align expected_alignment = std::_Align::_None;
+    std::_Sign expected_sign       = std::_Sign::_None;
+    std::basic_string_view<CharT> expected_fill;
+    int expected_width                     = -1;
+    std::size_t expected_dynamic_width     = static_cast<std::size_t>(-1);
+    bool expected_auto_dynamic_width       = false;
+    int expected_precision                 = -1;
+    std::size_t expected_dynamic_precision = static_cast<std::size_t>(-1);
+    bool expected_auto_dynamic_precision   = false;
+    bool expected_hash                     = false;
+    bool expected_zero                     = false;
+    bool expected_localized                = false;
+    CharT expected_type                    = '\0';
+
+    constexpr void _On_align(std::_Align aln) {
+        assert(aln == expected_alignment);
+    }
+    constexpr void _On_fill(std::basic_string_view<CharT> str_view) {
+        assert(str_view == expected_fill);
+    }
+    constexpr void _On_width(int width) {
+        assert(width == expected_width);
+    }
+    constexpr void _On_dynamic_width(std::size_t id) {
+        assert(id == expected_dynamic_width);
+    }
+    constexpr void _On_dynamic_width(std::_Auto_id_tag) {
+        assert(expected_auto_dynamic_width);
+    }
+    constexpr void _On_precision(int pre) {
+        assert(pre == expected_precision);
+    }
+    constexpr void _On_dynamic_precision(std::size_t id) {
+        assert(id == expected_dynamic_precision);
+    }
+    constexpr void _On_dynamic_precision(std::_Auto_id_tag) {
+        assert(expected_auto_dynamic_precision);
+    }
+    constexpr void _On_sign(std::_Sign sgn) {
+        assert(sgn == expected_sign);
+    }
+    constexpr void _On_hash() {
+        assert(expected_hash);
+    }
+    constexpr void _On_zero() {
+        assert(expected_zero);
+    }
+    constexpr void _On_localized() {
+        assert(expected_localized);
+    }
+    constexpr void _On_type(CharT type) {
+        assert(type == expected_type);
+    }
+};
+template <typename CharT>
+testing_callbacks(std::_Align, std::basic_string_view<CharT>) -> testing_callbacks<CharT>; // deduction guide: CharT comes from the string_view argument
+
+struct testing_arg_id_callbacks { // argument-id callback set whose handlers have empty bodies
+    constexpr void _On_auto_id() {}
+    constexpr void _On_manual_id(std::size_t) {}
+};
+
+template <typename CharT, typename callback_type>
+void test_parse_helper(const CharT* (*func)(const CharT*, const CharT*, callback_type&&),
+    std::basic_string_view<CharT> view, bool err_expected = false,
+    typename std::basic_string_view<CharT>::size_type expected_end_position = std::basic_string_view<CharT>::npos,
+    callback_type&& callbacks                                               = {}) { // runs func over view, then asserts the throw or no-throw outcome
+    try {
+        auto end = func(view.data(), view.data() + view.size(), std::move(callbacks));
+        if (expected_end_position != std::basic_string_view<CharT>::npos) { // npos disables the end-position check
+            assert(end == view.data() + expected_end_position);
+        }
+        assert(!err_expected); // reached only when func returned without throwing
+    } catch (const std::format_error&) {
+        assert(err_expected); // a format_error is acceptable only when the caller anticipated it
+    }
+}
diff --git a/tests/std/test.lst b/tests/std/test.lst
@@ -263,8 +263,10 @@ tests\P0645R10_text_formatting_args
 tests\P0645R10_text_formatting_custom_formatting
 tests\P0645R10_text_formatting_death
 tests\P0645R10_text_formatting_formatting
+tests\P0645R10_text_formatting_legacy_text_encoding
 tests\P0645R10_text_formatting_parse_contexts
 tests\P0645R10_text_formatting_parsing
+tests\P0645R10_text_formatting_utf8
 tests\P0660R10_jthread_and_cv_any
 tests\P0660R10_stop_token
 tests\P0660R10_stop_token_death

diff --git a/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp b/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp
@@ -973,35 +973,6 @@ void test_size() {
 }
 
 void test_multibyte_format_strings() {
-    {
-        setlocale(LC_ALL, ".932");
-        const auto s =
-            "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576)
-        assert(format(s) == s);
-
-        assert(format("{:.2}", s) == "\x93\xfa"sv);
-        assert(format("{:4.2}", s) == "\x93\xfa  "sv);
-
-        assert(format("{:<4.2}", s) == "\x93\xfa  "sv);
-        assert(format("{:^4.2}", s) == " \x93\xfa "sv);
-        assert(format("{:>4.2}", s) == "  \x93\xfa"sv);
-
-        assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv);
-        assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv);
-        assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv);
-
-        assert(format("{:.3}", s) == "\x93\xfa"sv);
-        assert(format("{:4.3}", s) == "\x93\xfa  "sv);
-
-        assert(format("{:<4.3}", s) == "\x93\xfa  "sv);
-        assert(format("{:^4.3}", s) == " \x93\xfa "sv);
-        assert(format("{:>4.3}", s) == "  \x93\xfa"sv);
-
-        assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv);
-        assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv);
-        assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv);
-    }
-
 #ifndef MSVC_INTERNAL_TESTING // TRANSITION, Windows on Contest VMs understand ".UTF-8" codepage
     {
         setlocale(LC_ALL, ".UTF-8");