Skip to content

Commit

Permalink
Auto merge of #60793 - Xanewok:raw-string-cleanup, r=petrochenkov
Browse files Browse the repository at this point in the history
lexer: Disallow bare CR in raw byte strings

Handles bare CR ~but doesn't translate `\r\n` to `\n` in raw strings yet~ and translates CRLF to LF in raw strings.

As a side note, I think it'd be good to change the `unescape_*` functions to return plain iterators to reduce some boilerplate (e.g. `has_error` could benefit from collecting into a `Result<T>` and aborting early on errors), but I will do that separately, unless I missed something here that prevents it.

@matklad @petrochenkov thoughts?
  • Loading branch information
bors committed Jun 10, 2019
2 parents 02564de + 630d5f3 commit 5e2c110
Show file tree
Hide file tree
Showing 8 changed files with 180 additions and 135 deletions.
158 changes: 58 additions & 100 deletions src/libsyntax/parse/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ impl<'a> StringReader<'a> {
self.ch.is_none()
}

fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) {
fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) -> ! {
let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string");
err.span_label(self.mk_sp(pos, pos), "unterminated raw string");

Expand Down Expand Up @@ -292,15 +292,6 @@ impl<'a> StringReader<'a> {
self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
}

/// Reports a lexical error spanning [`from_pos`, `to_pos`).
///
/// The reported message is `m`, followed by `": "`, followed by whatever
/// `push_escaped_char` appends for `c`.
fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
    // Build "<m>: " in one step, then let the helper append the character.
    let mut message = format!("{}: ", m);
    push_escaped_char(&mut message, c);
    self.err_span_(from_pos, to_pos, message.as_str());
}

/// Advance peek_token to refer to the next token, and
/// possibly update the interner.
fn advance_token(&mut self) -> Result<(), ()> {
Expand Down Expand Up @@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> {
self.validate_byte_str_escape(start_with_quote);
(token::ByteStr, symbol)
},
Some('r') => self.scan_raw_byte_string(),
Some('r') => {
let (start, end, hash_count) = self.scan_raw_string();
let symbol = self.name_from_to(start, end);
self.validate_raw_byte_str_escape(start, end);

(token::ByteStrRaw(hash_count), symbol)
}
_ => unreachable!(), // Should have been a token::Ident above.
};
let suffix = self.scan_optional_raw_name();
Expand All @@ -1086,79 +1083,9 @@ impl<'a> StringReader<'a> {
Ok(TokenKind::lit(token::Str, symbol, suffix))
}
'r' => {
let start_bpos = self.pos;
self.bump();
let mut hash_count: u16 = 0;
while self.ch_is('#') {
if hash_count == 65535 {
let bpos = self.next_pos;
self.fatal_span_(start_bpos,
bpos,
"too many `#` symbols: raw strings may be \
delimited by up to 65535 `#` symbols").raise();
}
self.bump();
hash_count += 1;
}

if self.is_eof() {
self.fail_unterminated_raw_string(start_bpos, hash_count);
} else if !self.ch_is('"') {
let last_bpos = self.pos;
let curr_char = self.ch.unwrap();
self.fatal_span_char(start_bpos,
last_bpos,
"found invalid character; only `#` is allowed \
in raw string delimitation",
curr_char).raise();
}
self.bump();
let content_start_bpos = self.pos;
let mut content_end_bpos;
let mut valid = true;
'outer: loop {
if self.is_eof() {
self.fail_unterminated_raw_string(start_bpos, hash_count);
}
// if self.ch_is('"') {
// content_end_bpos = self.pos;
// for _ in 0..hash_count {
// self.bump();
// if !self.ch_is('#') {
// continue 'outer;
let c = self.ch.unwrap();
match c {
'"' => {
content_end_bpos = self.pos;
for _ in 0..hash_count {
self.bump();
if !self.ch_is('#') {
continue 'outer;
}
}
break;
}
'\r' => {
if !self.nextch_is('\n') {
let last_bpos = self.pos;
self.err_span_(start_bpos,
last_bpos,
"bare CR not allowed in raw string, use \\r \
instead");
valid = false;
}
}
_ => (),
}
self.bump();
}

self.bump();
let symbol = if valid {
self.name_from_to(content_start_bpos, content_end_bpos)
} else {
Symbol::intern("??")
};
let (start, end, hash_count) = self.scan_raw_string();
let symbol = self.name_from_to(start, end);
self.validate_raw_str_escape(start, end);
let suffix = self.scan_optional_raw_name();

Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix))
Expand Down Expand Up @@ -1315,16 +1242,18 @@ impl<'a> StringReader<'a> {
id
}

fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
/// Scans a raw (byte) string, returning byte position range for `"<literal>"`
/// (including quotes) along with `#` character count in `(b)r##..."<literal>"##...`;
fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) {
let start_bpos = self.pos;
self.bump();
let mut hash_count = 0;
let mut hash_count: u16 = 0;
while self.ch_is('#') {
if hash_count == 65535 {
let bpos = self.next_pos;
self.fatal_span_(start_bpos,
bpos,
"too many `#` symbols: raw byte strings may be \
"too many `#` symbols: raw strings may be \
delimited by up to 65535 `#` symbols").raise();
}
self.bump();
Expand All @@ -1334,13 +1263,13 @@ impl<'a> StringReader<'a> {
if self.is_eof() {
self.fail_unterminated_raw_string(start_bpos, hash_count);
} else if !self.ch_is('"') {
let pos = self.pos;
let ch = self.ch.unwrap();
let last_bpos = self.pos;
let curr_char = self.ch.unwrap();
self.fatal_span_char(start_bpos,
pos,
"found invalid character; only `#` is allowed in raw \
string delimitation",
ch).raise();
last_bpos,
"found invalid character; only `#` is allowed \
in raw string delimitation",
curr_char).raise();
}
self.bump();
let content_start_bpos = self.pos;
Expand All @@ -1360,19 +1289,14 @@ impl<'a> StringReader<'a> {
}
break;
}
Some(c) => {
if c > '\x7F' {
let pos = self.pos;
self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
}
}
_ => (),
}
self.bump();
}

self.bump();

(token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos))
(content_start_bpos, content_end_bpos, hash_count)
}

fn validate_char_escape(&self, start_with_quote: BytePos) {
Expand Down Expand Up @@ -1422,6 +1346,40 @@ impl<'a> StringReader<'a> {
});
}

/// Runs `unescape::unescape_raw_str` over the source text between
/// `content_start` and `content_end` and emits a diagnostic for every error
/// the unescaper reports.
///
/// Nothing is returned; problems are only reported through
/// `self.sess.span_diagnostic`.
fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
    self.with_str_from_to(content_start, content_end, |lit: &str| {
        unescape::unescape_raw_str(lit, &mut |range, result| {
            // Successfully unescaped characters need no further handling.
            let err = match result {
                Ok(_) => return,
                Err(err) => err,
            };
            // The span is one byte wider than the content on each side --
            // presumably to cover the surrounding quotes; TODO confirm
            // against `emit_unescape_error`.
            let literal_span = self.mk_sp(
                content_start - BytePos(1),
                content_end + BytePos(1),
            );
            emit_unescape_error(
                &self.sess.span_diagnostic,
                lit,
                literal_span,
                unescape::Mode::Str,
                range,
                err,
            );
        })
    });
}

/// Runs `unescape::unescape_raw_byte_str` over the source text between
/// `content_start` and `content_end` and emits a diagnostic for every error
/// the unescaper reports.
///
/// Nothing is returned; problems are only reported through
/// `self.sess.span_diagnostic`.
fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
    self.with_str_from_to(content_start, content_end, |lit: &str| {
        unescape::unescape_raw_byte_str(lit, &mut |range, result| {
            // Successfully unescaped bytes need no further handling.
            let err = match result {
                Ok(_) => return,
                Err(err) => err,
            };
            // The span is one byte wider than the content on each side --
            // presumably to cover the surrounding quotes; TODO confirm
            // against `emit_unescape_error`.
            let literal_span = self.mk_sp(
                content_start - BytePos(1),
                content_end + BytePos(1),
            );
            emit_unescape_error(
                &self.sess.span_diagnostic,
                lit,
                literal_span,
                unescape::Mode::ByteStr,
                range,
                err,
            );
        })
    });
}

fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
unescape::unescape_byte_str(lit, &mut |range, c| {
Expand Down
60 changes: 34 additions & 26 deletions src/libsyntax/parse/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ use crate::ast::{self, Lit, LitKind};
use crate::parse::parser::Parser;
use crate::parse::PResult;
use crate::parse::token::{self, Token, TokenKind};
use crate::parse::unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte};
use crate::parse::unescape::{unescape_char, unescape_byte};
use crate::parse::unescape::{unescape_str, unescape_byte_str};
use crate::parse::unescape::{unescape_raw_str, unescape_raw_byte_str};
use crate::print::pprust;
use crate::symbol::{kw, sym, Symbol};
use crate::tokenstream::{TokenStream, TokenTree};
Expand Down Expand Up @@ -141,7 +143,17 @@ impl LitKind {
// Ditto.
let s = symbol.as_str();
let symbol = if s.contains('\r') {
Symbol::intern(&raw_str_lit(&s))
let mut buf = String::with_capacity(s.len());
let mut error = Ok(());
unescape_raw_str(&s, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(_) => error = Err(LitError::LexerError),
}
});
error?;
buf.shrink_to_fit();
Symbol::intern(&buf)
} else {
symbol
};
Expand All @@ -161,7 +173,26 @@ impl LitKind {
buf.shrink_to_fit();
LitKind::ByteStr(Lrc::new(buf))
}
token::ByteStrRaw(_) => LitKind::ByteStr(Lrc::new(symbol.to_string().into_bytes())),
token::ByteStrRaw(_) => {
let s = symbol.as_str();
let bytes = if s.contains('\r') {
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_raw_byte_str(&s, &mut |_, unescaped_byte| {
match unescaped_byte {
Ok(c) => buf.push(c),
Err(_) => error = Err(LitError::LexerError),
}
});
error?;
buf.shrink_to_fit();
buf
} else {
symbol.to_string().into_bytes()
};

LitKind::ByteStr(Lrc::new(bytes))
},
token::Err => LitKind::Err(symbol),
})
}
Expand Down Expand Up @@ -353,29 +384,6 @@ crate fn expect_no_suffix(diag: &Handler, sp: Span, kind: &str, suffix: Option<S
}
}

/// Parses a string representing a raw string literal into its final form. The
/// only operation this does is convert embedded CRLF into a single LF.
///
/// # Panics
///
/// Panics with "lexer accepted bare CR" if `lit` contains a `\r` that is not
/// immediately followed by `\n`, including a `\r` that is the last character
/// of `lit`.
fn raw_str_lit(lit: &str) -> String {
    debug!("raw_str_lit: {:?}", lit);
    let mut res = String::with_capacity(lit.len());

    let mut chars = lit.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\r' {
            // Compare against `Some(&'\n')` instead of unwrapping the peeked
            // value: a trailing `\r` makes `peek()` return `None`, and that
            // case must hit the descriptive panic below rather than a
            // generic unwrap-on-`None` panic.
            if chars.peek() != Some(&'\n') {
                panic!("lexer accepted bare CR");
            }
            // Consume the `\n` and emit a single LF for the CRLF pair.
            chars.next();
            res.push('\n');
        } else {
            res.push(c);
        }
    }

    res.shrink_to_fit();
    res
}

// Checks if `s` looks like i32 or u1234 etc.
fn looks_like_width_suffix(first_chars: &[char], s: &str) -> bool {
s.len() > 1 && s.starts_with(first_chars) && s[1..].chars().all(|c| c.is_ascii_digit())
Expand Down
Loading

0 comments on commit 5e2c110

Please sign in to comment.