Skip to content

Commit ef1ecbe

Browse files
committed
Auto merge of #62948 - matklad:failable-file-loading, r=petrochenkov
Normalize newlines when loading files. Fixes #62865.
2 parents fc8765d + 911398b commit ef1ecbe

File tree

6 files changed

+102
-104
lines changed

6 files changed

+102
-104
lines changed

src/librustc_lexer/src/lib.rs

-2
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,6 @@ impl Cursor<'_> {
352352
loop {
353353
match self.nth_char(0) {
354354
'\n' => break,
355-
'\r' if self.nth_char(1) == '\n' => break,
356355
EOF_CHAR if self.is_eof() => break,
357356
_ => {
358357
self.bump();
@@ -525,7 +524,6 @@ impl Cursor<'_> {
525524
match self.nth_char(0) {
526525
'/' if !first => break,
527526
'\n' if self.nth_char(1) != '\'' => break,
528-
'\r' if self.nth_char(1) == '\n' => break,
529527
EOF_CHAR if self.is_eof() => break,
530528
'\'' => {
531529
self.bump();

src/librustc_lexer/src/unescape.rs

+8-28
Original file line numberDiff line numberDiff line change
@@ -128,11 +128,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
128128
if first_char != '\\' {
129129
return match first_char {
130130
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
131-
'\r' => Err(if chars.clone().next() == Some('\n') {
132-
EscapeError::EscapeOnlyChar
133-
} else {
134-
EscapeError::BareCarriageReturn
135-
}),
131+
'\r' => Err(EscapeError::BareCarriageReturn),
136132
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
137133
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
138134
_ => {
@@ -244,27 +240,15 @@ where
244240

245241
let unescaped_char = match first_char {
246242
'\\' => {
247-
let (second_char, third_char) = {
248-
let mut chars = chars.clone();
249-
(chars.next(), chars.next())
250-
};
251-
match (second_char, third_char) {
252-
(Some('\n'), _) | (Some('\r'), Some('\n')) => {
243+
let second_char = chars.clone().next();
244+
match second_char {
245+
Some('\n') => {
253246
skip_ascii_whitespace(&mut chars);
254247
continue;
255248
}
256249
_ => scan_escape(first_char, &mut chars, mode),
257250
}
258251
}
259-
'\r' => {
260-
let second_char = chars.clone().next();
261-
if second_char == Some('\n') {
262-
chars.next();
263-
Ok('\n')
264-
} else {
265-
scan_escape(first_char, &mut chars, mode)
266-
}
267-
}
268252
'\n' => Ok('\n'),
269253
'\t' => Ok('\t'),
270254
_ => scan_escape(first_char, &mut chars, mode),
@@ -298,15 +282,11 @@ where
298282
while let Some(curr) = chars.next() {
299283
let start = initial_len - chars.as_str().len() - curr.len_utf8();
300284

301-
let result = match (curr, chars.clone().next()) {
302-
('\r', Some('\n')) => {
303-
chars.next();
304-
Ok('\n')
305-
},
306-
('\r', _) => Err(EscapeError::BareCarriageReturnInRawString),
307-
(c, _) if mode.is_bytes() && !c.is_ascii() =>
285+
let result = match curr {
286+
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
287+
c if mode.is_bytes() && !c.is_ascii() =>
308288
Err(EscapeError::NonAsciiCharInByteString),
309-
(c, _) => Ok(c),
289+
c => Ok(c),
310290
};
311291
let end = initial_len - chars.as_str().len();
312292

src/librustc_lexer/src/unescape/tests.rs

+3-8
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ fn test_unescape_char_bad() {
1111
check(r"\", EscapeError::LoneSlash);
1212

1313
check("\n", EscapeError::EscapeOnlyChar);
14-
check("\r\n", EscapeError::EscapeOnlyChar);
1514
check("\t", EscapeError::EscapeOnlyChar);
1615
check("'", EscapeError::EscapeOnlyChar);
1716
check("\r", EscapeError::BareCarriageReturn);
@@ -31,6 +30,7 @@ fn test_unescape_char_bad() {
3130
check(r"\v", EscapeError::InvalidEscape);
3231
check(r"\💩", EscapeError::InvalidEscape);
3332
check(r"\●", EscapeError::InvalidEscape);
33+
check("\\\r", EscapeError::InvalidEscape);
3434

3535
check(r"\x", EscapeError::TooShortHexEscape);
3636
check(r"\x0", EscapeError::TooShortHexEscape);
@@ -116,10 +116,9 @@ fn test_unescape_str_good() {
116116

117117
check("foo", "foo");
118118
check("", "");
119-
check(" \t\n\r\n", " \t\n\n");
119+
check(" \t\n", " \t\n");
120120

121121
check("hello \\\n world", "hello world");
122-
check("hello \\\r\n world", "hello world");
123122
check("thread's", "thread's")
124123
}
125124

@@ -134,7 +133,6 @@ fn test_unescape_byte_bad() {
134133
check(r"\", EscapeError::LoneSlash);
135134

136135
check("\n", EscapeError::EscapeOnlyChar);
137-
check("\r\n", EscapeError::EscapeOnlyChar);
138136
check("\t", EscapeError::EscapeOnlyChar);
139137
check("'", EscapeError::EscapeOnlyChar);
140138
check("\r", EscapeError::BareCarriageReturn);
@@ -238,10 +236,9 @@ fn test_unescape_byte_str_good() {
238236

239237
check("foo", b"foo");
240238
check("", b"");
241-
check(" \t\n\r\n", b" \t\n\n");
239+
check(" \t\n", b" \t\n");
242240

243241
check("hello \\\n world", b"hello world");
244-
check("hello \\\r\n world", b"hello world");
245242
check("thread's", b"thread's")
246243
}
247244

@@ -253,7 +250,6 @@ fn test_unescape_raw_str() {
253250
assert_eq!(unescaped, expected);
254251
}
255252

256-
check("\r\n", &[(0..2, Ok('\n'))]);
257253
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
258254
check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]);
259255
}
@@ -266,7 +262,6 @@ fn test_unescape_raw_byte_str() {
266262
assert_eq!(unescaped, expected);
267263
}
268264

269-
check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]);
270265
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
271266
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
272267
check(

src/libsyntax/parse/lexer/mod.rs

+15-66
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@ use syntax_pos::{BytePos, Pos, Span};
88
use rustc_lexer::Base;
99
use rustc_lexer::unescape;
1010

11-
use std::borrow::Cow;
1211
use std::char;
13-
use std::iter;
1412
use std::convert::TryInto;
1513
use rustc_data_structures::sync::Lrc;
1614
use log::debug;
@@ -181,18 +179,7 @@ impl<'a> StringReader<'a> {
181179
let string = self.str_from(start);
182180
// comments with only more "/"s are not doc comments
183181
let tok = if is_doc_comment(string) {
184-
let mut idx = 0;
185-
loop {
186-
idx = match string[idx..].find('\r') {
187-
None => break,
188-
Some(it) => idx + it + 1
189-
};
190-
if string[idx..].chars().next() != Some('\n') {
191-
self.err_span_(start + BytePos(idx as u32 - 1),
192-
start + BytePos(idx as u32),
193-
"bare CR not allowed in doc-comment");
194-
}
195-
}
182+
self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
196183
token::DocComment(Symbol::intern(string))
197184
} else {
198185
token::Comment
@@ -217,15 +204,10 @@ impl<'a> StringReader<'a> {
217204
}
218205

219206
let tok = if is_doc_comment {
220-
let has_cr = string.contains('\r');
221-
let string = if has_cr {
222-
self.translate_crlf(start,
223-
string,
224-
"bare CR not allowed in block doc-comment")
225-
} else {
226-
string.into()
227-
};
228-
token::DocComment(Symbol::intern(&string[..]))
207+
self.forbid_bare_cr(start,
208+
string,
209+
"bare CR not allowed in block doc-comment");
210+
token::DocComment(Symbol::intern(string))
229211
} else {
230212
token::Comment
231213
};
@@ -516,49 +498,16 @@ impl<'a> StringReader<'a> {
516498
&self.src[self.src_index(start)..self.src_index(end)]
517499
}
518500

519-
/// Converts CRLF to LF in the given string, raising an error on bare CR.
520-
fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
521-
let mut chars = s.char_indices().peekable();
522-
while let Some((i, ch)) = chars.next() {
523-
if ch == '\r' {
524-
if let Some((lf_idx, '\n')) = chars.peek() {
525-
return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into();
526-
}
527-
let pos = start + BytePos(i as u32);
528-
let end_pos = start + BytePos((i + ch.len_utf8()) as u32);
529-
self.err_span_(pos, end_pos, errmsg);
530-
}
531-
}
532-
return s.into();
533-
534-
fn translate_crlf_(rdr: &StringReader<'_>,
535-
start: BytePos,
536-
s: &str,
537-
mut j: usize,
538-
mut chars: iter::Peekable<impl Iterator<Item = (usize, char)>>,
539-
errmsg: &str)
540-
-> String {
541-
let mut buf = String::with_capacity(s.len());
542-
// Skip first CR
543-
buf.push_str(&s[.. j - 1]);
544-
while let Some((i, ch)) = chars.next() {
545-
if ch == '\r' {
546-
if j < i {
547-
buf.push_str(&s[j..i]);
548-
}
549-
let next = i + ch.len_utf8();
550-
j = next;
551-
if chars.peek().map(|(_, ch)| *ch) != Some('\n') {
552-
let pos = start + BytePos(i as u32);
553-
let end_pos = start + BytePos(next as u32);
554-
rdr.err_span_(pos, end_pos, errmsg);
555-
}
556-
}
557-
}
558-
if j < s.len() {
559-
buf.push_str(&s[j..]);
560-
}
561-
buf
501+
/// Reports `errmsg` through `err_span_` once for every `\r` found in `s`.
///
/// `start` is the position that byte 0 of `s` is offset from; each reported
/// span covers exactly the one `\r` byte.
fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
502+
let mut idx = 0;
503+
loop {
504+
// After this assignment `idx` is one past the `\r` just found, so the
// next search resumes right after it.
idx = match s[idx..].find('\r') {
505+
None => break,
506+
Some(it) => idx + it + 1
507+
};
508+
// Span is `[idx - 1, idx)` relative to `start`, i.e. the `\r` itself.
self.err_span_(start + BytePos(idx as u32 - 1),
509+
start + BytePos(idx as u32),
510+
errmsg);
562511
}
563512
}
564513

src/libsyntax_pos/lib.rs

+56
Original file line numberDiff line numberDiff line change
@@ -1043,6 +1043,7 @@ impl SourceFile {
10431043
mut src: String,
10441044
start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
10451045
remove_bom(&mut src);
1046+
normalize_newlines(&mut src);
10461047

10471048
let src_hash = {
10481049
let mut hasher: StableHasher<u128> = StableHasher::new();
@@ -1210,6 +1211,61 @@ fn remove_bom(src: &mut String) {
12101211
}
12111212
}
12121213

1214+
1215+
/// Replaces `\r\n` with `\n` in-place in `src`.
1216+
///
1217+
/// A lone `\r` (one not immediately followed by `\n`) is left untouched; nothing is returned.
1218+
fn normalize_newlines(src: &mut String) {
1219+
// Fast path: without any `\r` byte there is nothing to rewrite.
if !src.as_bytes().contains(&b'\r') {
1220+
return;
1221+
}
1222+
1223+
// We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
1224+
// While we *can* call `as_mut_vec` and do surgery on the live string
1225+
// directly, let's rather steal the contents of `src`. This makes the code
1226+
// safe even if a panic occurs.
1227+
1228+
let mut buf = std::mem::replace(src, String::new()).into_bytes();
1229+
// Number of `\r` bytes dropped so far; also the distance between the write
// position (start of `tail`) and the next unread byte (`tail[gap_len]`).
let mut gap_len = 0;
1230+
let mut tail = buf.as_mut_slice();
1231+
loop {
1232+
// Index within `tail` of the next `\r\n`, or `tail.len()` when none remains.
let idx = match find_crlf(&tail[gap_len..]) {
1233+
None => tail.len(),
1234+
Some(idx) => idx + gap_len,
1235+
};
1236+
// Shift the bytes before that `\r` down over the gap.
tail.copy_within(gap_len..idx, 0);
1237+
tail = &mut tail[idx - gap_len..];
1238+
// Only the gap is left: every byte has been processed.
if tail.len() == gap_len {
1239+
break;
1240+
}
1241+
// Widen the gap by one so the `\r` just found is skipped.
gap_len += 1;
1242+
}
1243+
1244+
// Account for removed `\r`.
1245+
// After `set_len`, `buf` is guaranteed to contain utf-8 again.
1246+
let new_len = buf.len() - gap_len;
1247+
// SAFETY: `new_len` is `buf.len() - gap_len`, so it never exceeds the
// current length. The loop above only dropped `\r` bytes that were
// followed by `\n`, so the first `new_len` bytes are still valid UTF-8.
unsafe {
1248+
buf.set_len(new_len);
1249+
*src = String::from_utf8_unchecked(buf);
1250+
}
1251+
1252+
// Returns the index of the first `\r` that is immediately followed by `\n`.
fn find_crlf(src: &[u8]) -> Option<usize> {
1253+
let mut search_idx = 0;
1254+
while let Some(idx) = find_cr(&src[search_idx..]) {
1255+
// A `\r` not followed by `\n` is skipped; keep searching after it.
if src[search_idx..].get(idx + 1) != Some(&b'\n') {
1256+
search_idx += idx + 1;
1257+
continue;
1258+
}
1259+
return Some(search_idx + idx);
1260+
}
1261+
None
1262+
}
1263+
1264+
// Returns the index of the first `\r` byte in `src`, if any.
fn find_cr(src: &[u8]) -> Option<usize> {
1265+
src.iter().position(|&b| b == b'\r')
1266+
}
1267+
}
1268+
12131269
// _____________________________________________________________________________
12141270
// Pos, BytePos, CharPos
12151271
//

src/libsyntax_pos/tests.rs

+20
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,23 @@ fn test_lookup_line() {
1616
assert_eq!(lookup_line(lines, BytePos(28)), 2);
1717
assert_eq!(lookup_line(lines, BytePos(29)), 2);
1818
}
19+
20+
// Checks that `normalize_newlines` collapses `\r\n` to `\n` and leaves lone
// `\r` bytes in place.
#[test]
21+
fn test_normalize_newlines() {
22+
// Runs `normalize_newlines` on a copy of `before` and compares it with `after`.
fn check(before: &str, after: &str) {
23+
let mut actual = before.to_string();
24+
normalize_newlines(&mut actual);
25+
assert_eq!(actual.as_str(), after);
26+
}
27+
check("", "");
28+
check("\n", "\n");
29+
// Lone carriage returns are preserved.
check("\r", "\r");
30+
check("\r\r", "\r\r");
31+
check("\r\n", "\n");
32+
check("hello world", "hello world");
33+
check("hello\nworld", "hello\nworld");
34+
check("hello\r\nworld", "hello\nworld");
35+
check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n");
36+
// Only the `\r` directly before `\n` is removed.
check("\r\r\n", "\r\n");
37+
check("hello\rworld", "hello\rworld");
38+
}

0 commit comments

Comments
 (0)