Skip to content

Commit

Permalink
<format>: Assume UTF-8 format strings when execution charset is UTF-8 (
Browse files Browse the repository at this point in the history
…#1824)

* Assume format strings are always UTF-8 when encoding charset is UTF-8

* Add `/utf-8` tests

* Run Shift-JIS tests with `/execution-charset:.932`

* Apply code review feedback

* constexpr function implies inline

* Code review feedback.

* Mitigate merge conflicts: Use _THROW.

* Mitigate merge conflicts: Check setlocale(), reset to "C".

Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
  • Loading branch information
statementreply and StephanTLavavej authored Apr 13, 2021
1 parent de11c9a commit b81d9eb
Show file tree
Hide file tree
Showing 10 changed files with 397 additions and 214 deletions.
160 changes: 102 additions & 58 deletions stl/inc/format
Original file line number Diff line number Diff line change
Expand Up @@ -417,55 +417,85 @@ _NODISCARD constexpr const _CharT* _Parse_arg_id(
throw format_error("Invalid format string.");
}

_NODISCARD constexpr bool _Is_execution_charset_utf8() {
    // Compile-time probe: encodes U+4E00 as a narrow string literal and reports whether the compiler produced
    // exactly the three code units E4 B8 80 followed by the null terminator. The result is used as the indicator
    // that the execution character set is UTF-8.
#pragma warning(push)
#pragma warning(disable : 4309) // 'initializing' : truncation of constant value
#pragma warning(disable : 4566) // character represented by universal-character-name '\u4E00' cannot be represented in
                                // the current code page
#pragma warning(disable : 6201) // Index '2' is out of valid index range '0' to '1' for possibly stack allocated buffer
                                // '_Probe'
#pragma warning(disable : 6239) // (<non-zero constant> && <expression>) always evaluates to the result of <expression>.
                                // Did you intend to use the bitwise-and operator?
    constexpr char _Probe[] = "\u4e00";
    constexpr bool _Has_three_code_units = sizeof(_Probe) == 4; // three code units plus the null terminator

    // The size test is evaluated first, so a shorter array is never indexed.
    return _Has_three_code_units && static_cast<unsigned char>(_Probe[0]) == 0xE4
        && static_cast<unsigned char>(_Probe[1]) == 0xB8 && static_cast<unsigned char>(_Probe[2]) == 0x80;
#pragma warning(pop)
}

// Result of _Is_execution_charset_utf8() stored as a constant so that it can be used as an `if constexpr` condition.
inline constexpr bool _Is_execution_charset_utf8_v = _Is_execution_charset_utf8();

_NODISCARD constexpr int _Utf8_code_units_in_next_character(
    const char* const _First, const char* const _Last) noexcept {
    // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, _Last),
    // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte.
    //
    // Only the bit pattern of the lead byte and the number of available code units are examined;
    // continuation bytes are not inspected.
    // Precondition: _First < _Last, because *_First is read unconditionally.
    const auto _Ch = static_cast<unsigned char>(*_First);
    if (_Ch < 0b1000'0000u) {
        return 1; // 0xxxxxxx: single code unit
    }

    const auto _Len = static_cast<size_t>(_Last - _First);

    if (_Ch < 0b1110'0000u) {
        // check for non-lead byte (10xxxxxx) or partial 2-byte encoded character (110xxxxx)
        return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1;
    }

    if (_Ch < 0b1111'0000u) {
        // check for partial 3-byte encoded character (1110xxxx)
        return (_Len >= 3) ? 3 : -1;
    }

    if (_Ch < 0b1111'1000u) {
        // check for partial 4-byte encoded character (11110xxx)
        return (_Len >= 4) ? 4 : -1;
    }

    // 11111xxx never begins a UTF-8 sequence, so report it as an invalid lead byte
    // instead of treating it as the start of a 4-byte character.
    return -1;
}

_NODISCARD inline int _Double_byte_encoding_code_units_in_next_character(
    const char* const _First, const char* const _Last, const _Cvtvec& _Cvt) {
    // Asks _Mbrtowc to convert the character at the front of [_First, _Last) using a fresh conversion state, and
    // reports how many code units it occupies. Yields -1 when the conversion reports an invalid or incomplete
    // encoded character, and 1 when the next code unit is '\0' (conversion result of zero).
    mbstate_t _State{};
    wchar_t _Decoded;
    const auto _Available = static_cast<size_t>(_Last - _First);
    const int _Converted  = _Mbrtowc(&_Decoded, _First, _Available, &_State, &_Cvt);

    if (_Converted < 0) { // invalid or incomplete encoded character
        return -1;
    }

    // a zero result means the next code unit is '\0', which still occupies one code unit
    return _Converted == 0 ? 1 : _Converted;
}

_NODISCARD inline int _Code_units_in_next_character(const char* _First, const char* _Last, const _Cvtvec& _Cvt) {
// Returns a count of the number of code units that compose the first encoded character in
// [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
// *_First is not a valid lead byte.
_STL_INTERNAL_CHECK(_First < _Last);

switch (_Cvt._Mbcurmax) {
default:
_STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page");
[[fallthrough]];
case 1:
return 1; // all characters have only one code unit

case 2:
{
wchar_t _Wide;
mbstate_t _St{};
const auto _Len = static_cast<size_t>(_Last - _First);
const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt);
if (_Result > 0) {
return _Result;
} else if (_Result < 0) { // invalid or incomplete encoded character
return -1;
} else { // next code unit is '\0'
return 1;
}
}

case 4: // Assume UTF-8 (as does _Mbrtowc)
{
const auto _Ch = static_cast<unsigned char>(*_First);
if (_Ch < 0b1000'0000u) {
return 1;
}

const auto _Len = static_cast<size_t>(_Last - _First);
if constexpr (_Is_execution_charset_utf8_v) {
return _Utf8_code_units_in_next_character(_First, _Last);
} else {
switch (_Cvt._Mbcurmax) {
default:
_STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page");
[[fallthrough]];
case 1:
return 1; // all characters have only one code unit

if (_Ch < 0b1110'0000u) {
// check for non-lead byte or partial 2-byte encoded character
return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1;
}
case 2:
return _Double_byte_encoding_code_units_in_next_character(_First, _Last, _Cvt);

if (_Ch < 0b1111'0000u) {
// check for partial 3-byte encoded character
return (_Len >= 3) ? 3 : -1;
}

// check for partial 4-byte encoded character
return (_Len >= 4) ? 4 : -1;
case 4: // Assume UTF-8 (as does _Mbrtowc)
return _Utf8_code_units_in_next_character(_First, _Last);
}
}
}
Expand Down Expand Up @@ -751,20 +781,24 @@ template <class _CharT>
const _CharT* _Find_encoded(const _CharT* _First, const _CharT* _Last, const _CharT _Val, const _Cvtvec& _Cvt) {
// Returns the first occurrence of _Val as an encoded character (and not, for example, as a
// continuation byte) in [_First, _Last).
if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) {
// As above and in _Mbrtowc, assume 4-byte encodings are UTF-8
if constexpr (_Is_execution_charset_utf8_v) {
return _Find_unchecked(_First, _Last, _Val);
}
} else {
if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) {
// As above and in _Mbrtowc, assume 4-byte encodings are UTF-8
return _Find_unchecked(_First, _Last, _Val);
}

while (_First != _Last && *_First != _Val) {
const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt);
if (_Units < 0) {
throw format_error("Invalid encoded character in format string.");
while (_First != _Last && *_First != _Val) {
const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt);
if (_Units < 0) {
_THROW(format_error("Invalid encoded character in format string."));
}
_First += _Units;
}
_First += _Units;
}

return _First;
return _First;
}
}

template <class _CharT, _Parse_replacement_field_callbacks<_CharT> _HandlerT>
Expand Down Expand Up @@ -2163,15 +2197,9 @@ _NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
return 1;
}

_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) {
_NODISCARD inline int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept {
// Return an estimate for the width of the character composed of _Units code units,
// whose first code unit is denoted by _Ptr.
if (_Cvt._Mbcurmax != 4) {
// not a Unicode encoding; estimate width == number of code units
return _Units;
}

// assume UTF-8
auto _Ch = static_cast<char32_t>(*_Ptr);
switch (_Units) {
default:
Expand All @@ -2197,6 +2225,22 @@ _NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Uni
return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
}

_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) {
    // Estimates the display width of the character that starts at _Ptr and spans _Units code units.
    // When the execution charset is not UTF-8, the locale's _Mbcurmax decides: anything other than 4 is treated
    // as a non-Unicode encoding whose width estimate is simply the number of code units.
    if constexpr (!_Is_execution_charset_utf8_v) {
        if (_Cvt._Mbcurmax != 4) {
            // not a Unicode encoding; estimate width == number of code units
            return _Units;
        }
    }

    // either the execution charset is UTF-8, or a 4-byte encoding is assumed to be UTF-8
    return _Estimate_utf8_character_width(_Ptr, _Units);
}

_NODISCARD inline int _Estimate_character_width(const wchar_t* _Ptr, const int _Units, const _Cvtvec&) {
// Return an estimate for the width of the character composed of _Units code units,
// whose first code unit is denoted by _Ptr.
Expand Down
127 changes: 127 additions & 0 deletions tests/std/include/test_format_support.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#pragma once

#include <cassert>
#include <cstddef>
#include <format>
#include <string_view>
#include <utility>

// copied from the string_view tests
// Selects between the narrow and wide spelling of the same literal based on CharT.
template <typename CharT>
struct choose_literal; // primary template is declared only; just the specializations below are usable

// CharT == char: hand back the narrow spelling and ignore the wide one.
template <>
struct choose_literal<char> {
    static constexpr const char* choose(const char* narrow, const wchar_t* /* wide */) {
        return narrow;
    }
};

// CharT == wchar_t: hand back the wide spelling and ignore the narrow one.
template <>
struct choose_literal<wchar_t> {
    static constexpr const wchar_t* choose(const char* /* narrow */, const wchar_t* wide) {
        return wide;
    }
};

// Expands Literal into both its narrow and L-prefixed forms and picks the one whose character type is CharT.
#define TYPED_LITERAL(CharT, Literal) (choose_literal<CharT>::choose(Literal, L##Literal))

// Callback set in which every handler accepts its argument and does nothing.
template <typename CharT>
struct noop_testing_callbacks {
    // fill and alignment
    constexpr void _On_fill(std::basic_string_view<CharT>) {}
    constexpr void _On_align(std::_Align) {}

    // sign, alternate form, zero padding, locale flag
    constexpr void _On_sign(std::_Sign) {}
    constexpr void _On_hash() {}
    constexpr void _On_zero() {}
    constexpr void _On_localized() {}

    // width: literal value, explicit argument index, or automatic argument index
    constexpr void _On_width(unsigned int) {}
    constexpr void _On_dynamic_width(std::size_t) {}
    constexpr void _On_dynamic_width(std::_Auto_id_tag) {}

    // precision: literal value, explicit argument index, or automatic argument index
    constexpr void _On_precision(unsigned int) {}
    constexpr void _On_dynamic_precision(std::size_t) {}
    constexpr void _On_dynamic_precision(std::_Auto_id_tag) {}

    // presentation type
    constexpr void _On_type(CharT) {}
};

// Callback set that asserts each reported value against the matching `expected_*` member.
// Flag-style handlers (hash, zero, localized, automatic dynamic width/precision) assert that the
// corresponding boolean expectation is set.
template <typename CharT>
struct testing_callbacks {
    // Expectations. The declaration order is left untouched because this type is an aggregate and
    // positional initialization depends on it.
    std::_Align expected_alignment = std::_Align::_None;
    std::_Sign expected_sign = std::_Sign::_None;
    std::basic_string_view<CharT> expected_fill;
    int expected_width = -1;
    std::size_t expected_dynamic_width = static_cast<std::size_t>(-1);
    bool expected_auto_dynamic_width = false;
    int expected_precision = -1;
    std::size_t expected_dynamic_precision = static_cast<std::size_t>(-1);
    bool expected_auto_dynamic_precision = false;
    bool expected_hash = false;
    bool expected_zero = false;
    bool expected_localized = false;
    CharT expected_type = '\0';

    // fill and alignment
    constexpr void _On_fill(std::basic_string_view<CharT> actual_fill) {
        assert(actual_fill == expected_fill);
    }
    constexpr void _On_align(std::_Align actual_alignment) {
        assert(actual_alignment == expected_alignment);
    }

    // sign and flags
    constexpr void _On_sign(std::_Sign actual_sign) {
        assert(actual_sign == expected_sign);
    }
    constexpr void _On_hash() {
        assert(expected_hash);
    }
    constexpr void _On_zero() {
        assert(expected_zero);
    }
    constexpr void _On_localized() {
        assert(expected_localized);
    }

    // width
    constexpr void _On_width(int actual_width) {
        assert(actual_width == expected_width);
    }
    constexpr void _On_dynamic_width(std::size_t arg_index) {
        assert(arg_index == expected_dynamic_width);
    }
    constexpr void _On_dynamic_width(std::_Auto_id_tag) {
        assert(expected_auto_dynamic_width);
    }

    // precision
    constexpr void _On_precision(int actual_precision) {
        assert(actual_precision == expected_precision);
    }
    constexpr void _On_dynamic_precision(std::size_t arg_index) {
        assert(arg_index == expected_dynamic_precision);
    }
    constexpr void _On_dynamic_precision(std::_Auto_id_tag) {
        assert(expected_auto_dynamic_precision);
    }

    // presentation type
    constexpr void _On_type(CharT actual_type) {
        assert(actual_type == expected_type);
    }
};
// Deduces CharT from an (alignment, string view) argument pair.
template <typename CharT>
testing_callbacks(std::_Align, std::basic_string_view<CharT>) -> testing_callbacks<CharT>;

// Stateless argument-id callback set: both handlers accept the notification and do nothing.
struct testing_arg_id_callbacks {
    // called with an explicit argument index; the index is ignored
    constexpr void _On_manual_id(std::size_t) {}

    // called when no explicit argument index was given
    constexpr void _On_auto_id() {}
};

// Runs the parsing function `func` over the characters of `view` and checks the outcome.
//   - func:                  parser under test; receives [begin, end) of `view` plus the callbacks.
//   - err_expected:          whether `func` is expected to throw std::format_error.
//   - expected_end_position: offset into `view` that `func` must return; npos disables the check.
//   - callbacks:             callback object handed to `func` (moved from).
// Exceptions other than std::format_error are not caught here.
template <typename CharT, typename callback_type>
void test_parse_helper(const CharT* (*func)(const CharT*, const CharT*, callback_type&&),
    std::basic_string_view<CharT> view, bool err_expected = false,
    typename std::basic_string_view<CharT>::size_type expected_end_position = std::basic_string_view<CharT>::npos,
    callback_type&& callbacks = {}) {
    using view_type = std::basic_string_view<CharT>;

    const CharT* const first = view.data();
    const CharT* const last  = first + view.size();

    try {
        const CharT* const parsed_end = func(first, last, std::move(callbacks));

        const bool check_position = expected_end_position != view_type::npos;
        if (check_position) {
            assert(parsed_end == first + expected_end_position);
        }

        // reaching this point means no format_error was thrown
        assert(!err_expected);
    } catch (const std::format_error&) {
        assert(err_expected);
    }
}
2 changes: 2 additions & 0 deletions tests/std/test.lst
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,10 @@ tests\P0645R10_text_formatting_args
tests\P0645R10_text_formatting_custom_formatting
tests\P0645R10_text_formatting_death
tests\P0645R10_text_formatting_formatting
tests\P0645R10_text_formatting_legacy_text_encoding
tests\P0645R10_text_formatting_parse_contexts
tests\P0645R10_text_formatting_parsing
tests\P0645R10_text_formatting_utf8
tests\P0660R10_jthread_and_cv_any
tests\P0660R10_stop_token
tests\P0660R10_stop_token_death
Expand Down
29 changes: 0 additions & 29 deletions tests/std/tests/P0645R10_text_formatting_formatting/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -973,35 +973,6 @@ void test_size() {
}

void test_multibyte_format_strings() {
{
setlocale(LC_ALL, ".932");
const auto s =
"\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576)
assert(format(s) == s);

assert(format("{:.2}", s) == "\x93\xfa"sv);
assert(format("{:4.2}", s) == "\x93\xfa "sv);

assert(format("{:<4.2}", s) == "\x93\xfa "sv);
assert(format("{:^4.2}", s) == " \x93\xfa "sv);
assert(format("{:>4.2}", s) == " \x93\xfa"sv);

assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv);
assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv);
assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv);

assert(format("{:.3}", s) == "\x93\xfa"sv);
assert(format("{:4.3}", s) == "\x93\xfa "sv);

assert(format("{:<4.3}", s) == "\x93\xfa "sv);
assert(format("{:^4.3}", s) == " \x93\xfa "sv);
assert(format("{:>4.3}", s) == " \x93\xfa"sv);

assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv);
assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv);
assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv);
}

#ifndef MSVC_INTERNAL_TESTING // TRANSITION, Windows on Contest VMs understand ".UTF-8" codepage
{
setlocale(LC_ALL, ".UTF-8");
Expand Down
Loading

0 comments on commit b81d9eb

Please sign in to comment.