Skip to content

Commit

Permalink
Handle formatter flags in byte OsStr Display
Browse files Browse the repository at this point in the history
The Display implementation for `OsStr` and `Path` on non-Windows systems
(the byte-oriented version) only handles formatter flags when the entire
string is valid UTF-8. As most paths are valid UTF-8, the common case is
formatted like `str`; however, flags are ignored when they contain an
invalid UTF-8 sequence. Implement its Display with the same logic as
that of `str`.

Additionally, it forwards flags when formatting the last chunk of valid
UTF-8, so it erroneously inserts padding within the string when it has
invalid UTF-8. Fix this.

Fixes #136617 for non-Windows systems.
  • Loading branch information
thaliaarchi committed Feb 16, 2025
1 parent c0e16b3 commit d52e2b2
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 17 deletions.
4 changes: 3 additions & 1 deletion library/core/src/str/lossy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,10 @@ impl fmt::Debug for Utf8Chunks<'_> {
/// sequences. When the current sequence is invalid, the maximal prefix of a
/// valid UTF-8 code unit sequence is consumed. Returns whether the sequence is
/// a valid Unicode scalar value.
#[doc(hidden)]
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
fn advance_utf8(bytes: &mut slice::Iter<'_, u8>) -> bool {
pub fn advance_utf8(bytes: &mut slice::Iter<'_, u8>) -> bool {
const TAG_CONT_U8: u8 = 128;
#[inline]
fn peek(bytes: &slice::Iter<'_, u8>) -> u8 {
Expand Down
4 changes: 3 additions & 1 deletion library/core/src/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ mod converts;
mod count;
mod error;
mod iter;
mod lossy;
mod traits;
mod validations;

Expand All @@ -21,7 +22,6 @@ use crate::{ascii, mem};

pub mod pattern;

mod lossy;
#[unstable(feature = "str_from_raw_parts", issue = "119206")]
pub use converts::{from_raw_parts, from_raw_parts_mut};
#[stable(feature = "rust1", since = "1.0.0")]
Expand Down Expand Up @@ -52,6 +52,8 @@ pub use iter::{Matches, RMatches};
pub use iter::{RSplit, RSplitTerminator, Split, SplitTerminator};
#[stable(feature = "rust1", since = "1.0.0")]
pub use iter::{RSplitN, SplitN};
#[unstable(feature = "str_internals", issue = "none")]
pub use lossy::advance_utf8;
#[stable(feature = "utf8_chunks", since = "1.79.0")]
pub use lossy::{Utf8Chunk, Utf8Chunks};
#[stable(feature = "rust1", since = "1.0.0")]
Expand Down
13 changes: 13 additions & 0 deletions library/std/src/ffi/os_str/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,19 @@ fn display() {
assert_eq!(format!("a{:^10}e", os_string.display()), "a bcd e");
}

#[cfg(unix)]
#[test]
fn display_invalid_utf8_unix() {
use crate::os::unix::ffi::OsStringExt;

let os_string = OsString::from_vec(b"b\xFFd".to_vec());
assert_eq!(format!("a{:^10}e", os_string.display()), "a b�d e");
assert_eq!(format!("a{:^10}e", os_string.as_os_str().display()), "a b�d e");
let os_string = OsString::from_vec(b"b\xE1\xBAd".to_vec());
assert_eq!(format!("a{:^10}e", os_string.display()), "a b�d e");
assert_eq!(format!("a{:^10}e", os_string.as_os_str().display()), "a b�d e");
}

#[cfg(windows)]
#[test]
fn display_invalid_wtf8_windows() {
Expand Down
71 changes: 56 additions & 15 deletions library/std/src/sys/os_str/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
//! systems: just a `Vec<u8>`/`[u8]`.
use core::clone::CloneToUninit;
use core::str::advance_utf8;

use crate::borrow::Cow;
use crate::collections::TryReserveError;
Expand Down Expand Up @@ -33,25 +34,37 @@ impl fmt::Debug for Slice {

impl fmt::Display for Slice {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// If we're the empty string then our iterator won't actually yield
// anything, so perform the formatting manually
if self.inner.is_empty() {
return "".fmt(f);
// Corresponds to `Formatter::pad`, but for `OsStr` instead of `str`.

// Make sure there's a fast path up front.
if f.options().get_width().is_none() && f.options().get_precision().is_none() {
return self.write_lossy(f);
}

for chunk in self.inner.utf8_chunks() {
let valid = chunk.valid();
// If we successfully decoded the whole chunk as a valid string then
// we can return a direct formatting of the string which will also
// respect various formatting flags if possible.
if chunk.invalid().is_empty() {
return valid.fmt(f);
}
// The `precision` field can be interpreted as a maximum width for the
// string being formatted.
let max_char_count = f.options().get_precision().unwrap_or(usize::MAX);
let (truncated, char_count) = truncate_chars(&self.inner, max_char_count);

// If our string is longer than the maximum width, truncate it and
// handle other flags in terms of the truncated string.
// SAFETY: The truncation splits at Unicode scalar value boundaries.
let s = unsafe { Slice::from_encoded_bytes_unchecked(truncated) };

f.write_str(valid)?;
f.write_char(char::REPLACEMENT_CHARACTER)?;
// The `width` field is more of a minimum width parameter at this point.
if let Some(width) = f.options().get_width()
&& char_count < width
{
// If we're under the minimum width, then fill up the minimum width
// with the specified string + some alignment.
let post_padding = f.padding(width - char_count, fmt::Alignment::Left)?;
s.write_lossy(f)?;
post_padding.write(f)
} else {
// If we're over the minimum width or there is no minimum width, we
// can just emit the string.
s.write_lossy(f)
}
Ok(())
}
}

Expand Down Expand Up @@ -286,6 +299,18 @@ impl Slice {
String::from_utf8_lossy(&self.inner)
}

/// Writes the string as lossy UTF-8 like [`String::from_utf8_lossy`].
/// It ignores formatter flags.
fn write_lossy(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
for chunk in self.inner.utf8_chunks() {
f.write_str(chunk.valid())?;
if !chunk.invalid().is_empty() {
f.write_char(char::REPLACEMENT_CHARACTER)?;
}
}
Ok(())
}

pub fn to_owned(&self) -> Buf {
Buf { inner: self.inner.to_vec() }
}
Expand Down Expand Up @@ -357,3 +382,19 @@ unsafe impl CloneToUninit for Slice {
unsafe { self.inner.clone_to_uninit(dst) }
}
}

/// Counts the number of Unicode scalar values in the byte string, allowing
/// invalid UTF-8 sequences. For invalid sequences, the maximal prefix of a
/// valid UTF-8 code unit counts as one. Only up to `max_chars` scalar values
/// are scanned. Returns the character count and the byte length.
fn truncate_chars(bytes: &[u8], max_chars: usize) -> (&[u8], usize) {
let mut iter = bytes.iter();
let mut char_count = 0;
while !iter.is_empty() && char_count < max_chars {
advance_utf8(&mut iter);
char_count += 1;
}
let byte_len = bytes.len() - iter.len();
let truncated = unsafe { bytes.get_unchecked(..byte_len) };
(truncated, char_count)
}
14 changes: 14 additions & 0 deletions library/std/tests/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1822,6 +1822,20 @@ fn display_format_flags() {
assert_eq!(format!("a{:^10}e", Path::new("bcd").display()), "a bcd e");
}

#[cfg(unix)]
#[test]
fn display_invalid_utf8_unix() {
use std::ffi::OsString;
use std::os::unix::ffi::OsStringExt;

let path_buf = PathBuf::from(OsString::from_vec(b"b\xFFd".to_vec()));
assert_eq!(format!("a{:^10}e", path_buf.display()), "a b�d e");
assert_eq!(format!("a{:^10}e", path_buf.as_path().display()), "a b�d e");
let path_buf = PathBuf::from(OsString::from_vec(b"b\xE1\xBAd".to_vec()));
assert_eq!(format!("a{:^10}e", path_buf.display()), "a b�d e");
assert_eq!(format!("a{:^10}e", path_buf.as_path().display()), "a b�d e");
}

#[cfg(windows)]
#[test]
fn display_invalid_wtf8_windows() {
Expand Down

0 comments on commit d52e2b2

Please sign in to comment.