Skip to content

Commit

Permalink
Rollup merge of rust-lang#127528 - estebank:ascii-control-chars, r=ol…
Browse files Browse the repository at this point in the history
…i-obk

Replace ASCII control chars with Unicode Control Pictures

Replace ASCII control chars like `CR` with Unicode Control Pictures like `␍`:

```
error: bare CR not allowed in doc-comment
  --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:3:32
   |
LL | /// doc comment with bare CR: '␍'
   |                                ^
```

Centralize the checking of unicode char width for the purposes of CLI display in one place. Account for the new replacements. Remove unneeded tracking of "zero-width" unicode chars, as we calculate these in the `SourceMap` as needed now.
  • Loading branch information
matthiaskrgr authored Jul 25, 2024
2 parents cfc5f25 + 9bd7680 commit cce2db0
Show file tree
Hide file tree
Showing 67 changed files with 216 additions and 308 deletions.
1 change: 0 additions & 1 deletion Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3882,7 +3882,6 @@ dependencies = [
"termcolor",
"termize",
"tracing",
"unicode-width",
"windows",
]

Expand Down
1 change: 0 additions & 1 deletion compiler/rustc_errors/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ serde_json = "1.0.59"
termcolor = "1.2.0"
termize = "0.1.1"
tracing = "0.1"
unicode-width = "0.1.4"
# tidy-alphabetical-end

[target.'cfg(windows)'.dependencies.windows]
Expand Down
77 changes: 51 additions & 26 deletions compiler/rustc_errors/src/emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
//! The output types are defined in `rustc_session::config::ErrorOutputType`.

use rustc_span::source_map::SourceMap;
use rustc_span::{FileLines, FileName, SourceFile, Span};
use rustc_span::{char_width, FileLines, FileName, SourceFile, Span};

use crate::snippet::{
Annotation, AnnotationColumn, AnnotationType, Line, MultilineAnnotation, Style, StyledString,
Expand Down Expand Up @@ -677,10 +677,7 @@ impl HumanEmitter {
.skip(left)
.take_while(|ch| {
// Make sure that the trimming on the right will fall within the terminal width.
// FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char`
// is. For now, just accept that sometimes the code line will be longer than
// desired.
let next = unicode_width::UnicodeWidthChar::width(*ch).unwrap_or(1);
let next = char_width(*ch);
if taken + next > right - left {
return false;
}
Expand Down Expand Up @@ -742,11 +739,7 @@ impl HumanEmitter {
let left = margin.left(source_string.len());

// Account for unicode characters of width !=0 that were removed.
let left = source_string
.chars()
.take(left)
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
.sum();
let left = source_string.chars().take(left).map(|ch| char_width(ch)).sum();

self.draw_line(
buffer,
Expand Down Expand Up @@ -2039,7 +2032,7 @@ impl HumanEmitter {
let sub_len: usize =
if is_whitespace_addition { &part.snippet } else { part.snippet.trim() }
.chars()
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
.map(|ch| char_width(ch))
.sum();

let offset: isize = offsets
Expand Down Expand Up @@ -2076,11 +2069,8 @@ impl HumanEmitter {
}

// length of the code after substitution
let full_sub_len = part
.snippet
.chars()
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
.sum::<usize>() as isize;
let full_sub_len =
part.snippet.chars().map(|ch| char_width(ch)).sum::<usize>() as isize;

// length of the code to be substituted
let snippet_len = span_end_pos as isize - span_start_pos as isize;
Expand Down Expand Up @@ -2568,18 +2558,53 @@ fn num_decimal_digits(num: usize) -> usize {
}

// We replace some characters so the CLI output is always consistent and underlines aligned.
// Keep the following list in sync with `rustc_span::char_width`.
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
('\t', " "), // We do our own tab replacement
('\t', " "), // We do our own tab replacement
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', ""),
('\u{2066}', ""),
('\u{2067}', ""),
('\u{2068}', ""),
('\u{202C}', ""),
('\u{2069}', ""),
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', "�"),
('\u{2066}', "�"),
('\u{2067}', "�"),
('\u{2068}', "�"),
('\u{202C}', "�"),
('\u{2069}', "�"),
// In terminals without Unicode support the following will be garbled, but in *all* terminals
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
// support" gate.
('\u{0000}', "␀"),
('\u{0001}', "␁"),
('\u{0002}', "␂"),
('\u{0003}', "␃"),
('\u{0004}', "␄"),
('\u{0005}', "␅"),
('\u{0006}', "␆"),
('\u{0007}', "␇"),
('\u{0008}', "␈"),
('\u{000B}', "␋"),
('\u{000C}', "␌"),
('\u{000D}', "␍"),
('\u{000E}', "␎"),
('\u{000F}', "␏"),
('\u{0010}', "␐"),
('\u{0011}', "␑"),
('\u{0012}', "␒"),
('\u{0013}', "␓"),
('\u{0014}', "␔"),
('\u{0015}', "␕"),
('\u{0016}', "␖"),
('\u{0017}', "␗"),
('\u{0018}', "␘"),
('\u{0019}', "␙"),
('\u{001A}', "␚"),
('\u{001B}', "␛"),
('\u{001C}', "␜"),
('\u{001D}', "␝"),
('\u{001E}', "␞"),
('\u{001F}', "␟"),
('\u{007F}', "␡"),
];

fn normalize_whitespace(str: &str) -> String {
Expand Down
2 changes: 0 additions & 2 deletions compiler/rustc_metadata/src/rmeta/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1728,7 +1728,6 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
source_len,
lines,
multibyte_chars,
non_narrow_chars,
normalized_pos,
stable_id,
..
Expand Down Expand Up @@ -1780,7 +1779,6 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
self.cnum,
lines,
multibyte_chars,
non_narrow_chars,
normalized_pos,
source_file_index,
);
Expand Down
6 changes: 0 additions & 6 deletions compiler/rustc_query_system/src/ich/impls_syntax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
source_len: _,
lines: _,
ref multibyte_chars,
ref non_narrow_chars,
ref normalized_pos,
} = *self;

Expand All @@ -98,11 +97,6 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
char_pos.hash_stable(hcx, hasher);
}

non_narrow_chars.len().hash_stable(hcx, hasher);
for &char_pos in non_narrow_chars.iter() {
char_pos.hash_stable(hcx, hasher);
}

normalized_pos.len().hash_stable(hcx, hasher);
for &char_pos in normalized_pos.iter() {
char_pos.hash_stable(hcx, hasher);
Expand Down
40 changes: 6 additions & 34 deletions compiler/rustc_span/src/analyze_source_file.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use super::*;
use unicode_width::UnicodeWidthChar;

#[cfg(test)]
mod tests;
Expand All @@ -9,15 +8,12 @@ mod tests;
///
/// This function will use an SSE2 enhanced implementation if hardware support
/// is detected at runtime.
pub fn analyze_source_file(
src: &str,
) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
pub fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
let mut lines = vec![RelativeBytePos::from_u32(0)];
let mut multi_byte_chars = vec![];
let mut non_narrow_chars = vec![];

// Calls the right implementation, depending on hardware support available.
analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars, &mut non_narrow_chars);
analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);

// The code above optimistically registers a new line *after* each \n
// it encounters. If that point is already outside the source_file, remove
Expand All @@ -30,7 +26,7 @@ pub fn analyze_source_file(
}
}

(lines, multi_byte_chars, non_narrow_chars)
(lines, multi_byte_chars)
}

cfg_match! {
Expand All @@ -39,11 +35,10 @@ cfg_match! {
src: &str,
lines: &mut Vec<RelativeBytePos>,
multi_byte_chars: &mut Vec<MultiByteChar>,
non_narrow_chars: &mut Vec<NonNarrowChar>,
) {
if is_x86_feature_detected!("sse2") {
unsafe {
analyze_source_file_sse2(src, lines, multi_byte_chars, non_narrow_chars);
analyze_source_file_sse2(src, lines, multi_byte_chars);
}
} else {
analyze_source_file_generic(
Expand All @@ -52,7 +47,6 @@ cfg_match! {
RelativeBytePos::from_u32(0),
lines,
multi_byte_chars,
non_narrow_chars,
);
}
}
Expand All @@ -66,7 +60,6 @@ cfg_match! {
src: &str,
lines: &mut Vec<RelativeBytePos>,
multi_byte_chars: &mut Vec<MultiByteChar>,
non_narrow_chars: &mut Vec<NonNarrowChar>,
) {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
Expand Down Expand Up @@ -159,7 +152,6 @@ cfg_match! {
RelativeBytePos::from_usize(scan_start),
lines,
multi_byte_chars,
non_narrow_chars,
);
}

Expand All @@ -172,7 +164,6 @@ cfg_match! {
RelativeBytePos::from_usize(tail_start),
lines,
multi_byte_chars,
non_narrow_chars,
);
}
}
Expand All @@ -183,15 +174,13 @@ cfg_match! {
src: &str,
lines: &mut Vec<RelativeBytePos>,
multi_byte_chars: &mut Vec<MultiByteChar>,
non_narrow_chars: &mut Vec<NonNarrowChar>,
) {
analyze_source_file_generic(
src,
src.len(),
RelativeBytePos::from_u32(0),
lines,
multi_byte_chars,
non_narrow_chars,
);
}
}
Expand All @@ -205,7 +194,6 @@ fn analyze_source_file_generic(
output_offset: RelativeBytePos,
lines: &mut Vec<RelativeBytePos>,
multi_byte_chars: &mut Vec<MultiByteChar>,
non_narrow_chars: &mut Vec<NonNarrowChar>,
) -> usize {
assert!(src.len() >= scan_len);
let mut i = 0;
Expand All @@ -227,16 +215,8 @@ fn analyze_source_file_generic(

let pos = RelativeBytePos::from_usize(i) + output_offset;

match byte {
b'\n' => {
lines.push(pos + RelativeBytePos(1));
}
b'\t' => {
non_narrow_chars.push(NonNarrowChar::Tab(pos));
}
_ => {
non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
}
if let b'\n' = byte {
lines.push(pos + RelativeBytePos(1));
}
} else if byte >= 127 {
// The slow path:
Expand All @@ -252,14 +232,6 @@ fn analyze_source_file_generic(
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
multi_byte_chars.push(mbc);
}

// Assume control characters are zero width.
// FIXME: How can we decide between `width` and `width_cjk`?
let char_width = UnicodeWidthChar::width(c).unwrap_or(0);

if char_width != 1 {
non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
}
}

i += char_len;
Expand Down
Loading

0 comments on commit cce2db0

Please sign in to comment.