Skip to content

Commit

Permalink
Improve error messages for raw strings (#60762)
Browse files Browse the repository at this point in the history
This diff improves error messages around raw strings in a few ways:
- Catch extra trailing `#` in the parser. This can't be handled in the lexer because we could be in a macro that actually expects another # (see test)
- Refactor & unify error handling in the lexer between ByteStrings and RawByteStrings
- Detect potentially intended terminators (longest sequence of "#*" is suggested)
  • Loading branch information
rcoh committed Mar 29, 2020
1 parent 840a576 commit 629e97a
Show file tree
Hide file tree
Showing 10 changed files with 344 additions and 63 deletions.
2 changes: 1 addition & 1 deletion src/librustc_lexer/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ impl<'a> Cursor<'a> {
/// If requested position doesn't exist, `EOF_CHAR` is returned.
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
/// it should be checked with `is_eof` method.
fn nth_char(&self, n: usize) -> char {
pub(crate) fn nth_char(&self, n: usize) -> char {
self.chars().nth(n).unwrap_or(EOF_CHAR)
}

Expand Down
131 changes: 109 additions & 22 deletions src/librustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@
mod cursor;
pub mod unescape;

#[cfg(test)]
mod tests;

use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::{Cursor, EOF_CHAR};
use std::convert::TryInto;

/// Parsed token.
/// It doesn't contain information about data that has been parsed,
Expand Down Expand Up @@ -132,9 +136,65 @@ pub enum LiteralKind {
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
RawStr { n_hashes: usize, started: bool, terminated: bool },
RawStr(UnvalidatedRawStr),
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
RawByteStr { n_hashes: usize, started: bool, terminated: bool },
RawByteStr(UnvalidatedRawStr),
}

/// Lexer-level facts about a raw (byte) string literal, recorded before any
/// validation has been performed.
///
/// Use [`UnvalidatedRawStr::validate`] to convert it into a
/// [`ValidatedRawStr`] or a [`LexRawStrError`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct UnvalidatedRawStr {
    /// `true` when the opening `#`s were directly followed by a `"`.
    valid_start: bool,
    /// Number of `#` characters before the opening `"`.
    n_start_hashes: usize,
    /// Number of `#` characters counted after a closing `"`. Equal to
    /// `n_start_hashes` when the literal is properly terminated; smaller
    /// when no complete terminator was found.
    n_end_hashes: usize,
    /// Best guess, as computed by the lexer, of where an unterminated
    /// literal was meant to end. `None` when there is no candidate.
    possible_terminator_offset: Option<usize>,
}

/// Reasons why a raw string literal fails validation.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LexRawStrError {
    /// Non-`#` characters between `r` and `"`, e.g. `r#~"..`.
    InvalidStarter,
    /// The string was never terminated. `expected` is the number of opening
    /// `#`s, `found` the number of closing `#`s that were seen, and
    /// `possible_terminator_offset` the best guess of where the author may
    /// have intended to terminate the literal.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 (`u16::MAX`) `#` signs were used as delimiters.
    TooManyDelimiters,
}

/// A raw string literal that passed validation: it has a valid start, a
/// matching terminator, and a delimiter count that fits in a `u16`.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub struct ValidatedRawStr {
    /// Number of `#` characters on each side of the literal.
    n_hashes: u16,
}

impl ValidatedRawStr {
    /// Returns the number of `#` delimiters on each side of the literal.
    pub fn num_hashes(&self) -> u16 {
        self.n_hashes
    }
}

impl UnvalidatedRawStr {
    /// Returns `true` if the literal had a valid start, i.e. the opening
    /// `#`s were followed by `"`.
    pub fn started(&self) -> bool {
        self.valid_start
    }

    /// Checks the recorded facts and produces the validated literal.
    ///
    /// # Errors
    ///
    /// Checked in this order:
    /// * [`LexRawStrError::InvalidStarter`] if the literal never started;
    /// * [`LexRawStrError::TooManyDelimiters`] if the opening `#` count does
    ///   not fit in a `u16`;
    /// * [`LexRawStrError::NoTerminator`] if fewer closing than opening `#`s
    ///   were found.
    pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
        let UnvalidatedRawStr {
            valid_start,
            n_start_hashes,
            n_end_hashes,
            possible_terminator_offset,
        } = self;

        if !valid_start {
            return Err(LexRawStrError::InvalidStarter);
        }

        // The delimiter limit is enforced before looking at the terminator,
        // so an over-long *and* unterminated literal reports the former.
        let n_hashes: u16 = match n_start_hashes.try_into() {
            Ok(n) => n,
            Err(_) => return Err(LexRawStrError::TooManyDelimiters),
        };

        if n_start_hashes > n_end_hashes {
            return Err(LexRawStrError::NoTerminator {
                expected: n_start_hashes,
                found: n_end_hashes,
                possible_terminator_offset,
            });
        }

        // More closing than opening hashes is not expected to reach here.
        debug_assert_eq!(n_start_hashes, n_end_hashes);
        Ok(ValidatedRawStr { n_hashes })
    }
}

/// Base of numeric literal encoding according to its prefix.
Expand Down Expand Up @@ -209,7 +269,7 @@ pub fn is_whitespace(c: char) -> bool {
// Dedicated whitespace characters from Unicode
| '\u{2028}' // LINE SEPARATOR
| '\u{2029}' // PARAGRAPH SEPARATOR
=> true,
=> true,
_ => false,
}
}
Expand Down Expand Up @@ -258,12 +318,12 @@ impl Cursor<'_> {
'r' => match (self.first(), self.second()) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let raw_str_i = self.raw_double_quoted_string(1);
let suffix_start = self.len_consumed();
if terminated {
if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
self.eat_literal_suffix();
}
let kind = RawStr { n_hashes, started, terminated };
let kind = RawStr(raw_str_i);
Literal { kind, suffix_start }
}
_ => self.ident(),
Expand Down Expand Up @@ -293,12 +353,14 @@ impl Cursor<'_> {
}
('r', '"') | ('r', '#') => {
self.bump();
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let raw_str_i = self.raw_double_quoted_string(2);
let suffix_start = self.len_consumed();
let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
if terminated {
self.eat_literal_suffix();
}
let kind = RawByteStr { n_hashes, started, terminated };

let kind = RawByteStr(raw_str_i);
Literal { kind, suffix_start }
}
_ => self.ident(),
Expand Down Expand Up @@ -594,37 +656,49 @@ impl Cursor<'_> {
false
}

/// Eats the double-quoted string and returns a tuple of
/// (amount of the '#' symbols, raw string started, raw string terminated)
fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
/// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
debug_assert!(self.prev() == 'r');
let mut started: bool = false;
let mut finished: bool = false;
let mut valid_start: bool = false;
let start_pos = self.len_consumed();
let (mut possible_terminator_offset, mut max_hashes) = (None, 0);

// Count opening '#' symbols.
let n_hashes = self.eat_while(|c| c == '#');
let n_start_hashes = self.eat_while(|c| c == '#');

// Check that string is started.
match self.bump() {
Some('"') => started = true,
_ => return (n_hashes, started, finished),
Some('"') => valid_start = true,
_ => {
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes: 0,
possible_terminator_offset,
};
}
}

// Skip the string contents and, on each '"' character met, check whether the
// '#' symbols that follow it terminate the raw string.
while !finished {
loop {
self.eat_while(|c| c != '"');

if self.is_eof() {
return (n_hashes, started, finished);
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes: max_hashes,
possible_terminator_offset,
};
}

// Eat closing double quote.
self.bump();

// Check that amount of closing '#' symbols
// is equal to the amount of opening ones.
let mut hashes_left = n_hashes;
let mut hashes_left = n_start_hashes;
let is_closing_hash = |c| {
if c == '#' && hashes_left != 0 {
hashes_left -= 1;
Expand All @@ -633,10 +707,23 @@ impl Cursor<'_> {
false
}
};
finished = self.eat_while(is_closing_hash) == n_hashes;
let n_end_hashes = self.eat_while(is_closing_hash);

if n_end_hashes == n_start_hashes {
return UnvalidatedRawStr {
valid_start,
n_start_hashes,
n_end_hashes,
possible_terminator_offset: None,
};
} else if n_end_hashes > 0 && n_end_hashes > max_hashes {
// Keep track of possible terminators to give a hint about where there might be
// a missing terminator
possible_terminator_offset =
Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
max_hashes = n_end_hashes;
}
}

(n_hashes, started, finished)
}

fn eat_decimal_digits(&mut self) -> bool {
Expand Down
119 changes: 119 additions & 0 deletions src/librustc_lexer/src/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#[cfg(test)]
mod tests {
    use crate::*;

    /// Lexes `s` as the part of a raw string literal that follows the `r`
    /// prefix, then checks both the unvalidated result and the outcome of
    /// validating it.
    ///
    /// `s` must NOT include the leading `r`; it is added here.
    fn check_raw_str(
        s: &str,
        expected: UnvalidatedRawStr,
        validated: Result<ValidatedRawStr, LexRawStrError>,
    ) {
        // `raw_double_quoted_string` asserts (in debug builds) that the
        // previously consumed character is `r`, so feed the cursor a real
        // prefix and consume it first. The reported terminator offset is
        // measured from the position at which the function is entered, so
        // the expectations below are unaffected by the extra character.
        let s = &format!("r{}", s);
        let mut cursor = Cursor::new(s);
        cursor.bump();
        let tok = cursor.raw_double_quoted_string(0);
        assert_eq!(tok, expected);
        assert_eq!(tok.validate(), validated);
    }

    /// A raw string without any `#` delimiters.
    #[test]
    fn test_naked_raw_str() {
        check_raw_str(
            r#""abc""#,
            UnvalidatedRawStr {
                n_start_hashes: 0,
                n_end_hashes: 0,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Ok(ValidatedRawStr { n_hashes: 0 }),
        );
    }

    /// No opening `#`, so a trailing `#` is not part of the literal.
    #[test]
    fn test_raw_no_start() {
        check_raw_str(
            r##""abc"#"##,
            UnvalidatedRawStr {
                n_start_hashes: 0,
                n_end_hashes: 0,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Ok(ValidatedRawStr { n_hashes: 0 }),
        );
    }

    /// Extra trailing `#`s are left for the parser; the lexer stops after
    /// as many closing hashes as there were opening ones.
    #[test]
    fn test_too_many_terminators() {
        // this error is handled in the parser later
        check_raw_str(
            r###"#"abc"##"###,
            UnvalidatedRawStr {
                n_start_hashes: 1,
                n_end_hashes: 1,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Ok(ValidatedRawStr { n_hashes: 1 }),
        );
    }

    /// Unterminated literals, with and without a plausible terminator.
    #[test]
    fn test_unterminated() {
        check_raw_str(
            r#"#"abc"#,
            UnvalidatedRawStr {
                n_start_hashes: 1,
                n_end_hashes: 0,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Err(LexRawStrError::NoTerminator {
                expected: 1,
                found: 0,
                possible_terminator_offset: None,
            }),
        );
        check_raw_str(
            r###"##"abc"#"###,
            UnvalidatedRawStr {
                n_start_hashes: 2,
                n_end_hashes: 1,
                valid_start: true,
                possible_terminator_offset: Some(7),
            },
            Err(LexRawStrError::NoTerminator {
                expected: 2,
                found: 1,
                possible_terminator_offset: Some(7),
            }),
        );
        // We're looking for "# not just any #
        check_raw_str(
            r###"##"abc#"###,
            UnvalidatedRawStr {
                n_start_hashes: 2,
                n_end_hashes: 0,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Err(LexRawStrError::NoTerminator {
                expected: 2,
                found: 0,
                possible_terminator_offset: None,
            }),
        );
    }

    /// A non-`#`, non-`"` character between the hashes and the quote.
    #[test]
    fn test_invalid_start() {
        check_raw_str(
            r##"#~"abc"#"##,
            UnvalidatedRawStr {
                n_start_hashes: 1,
                n_end_hashes: 0,
                valid_start: false,
                possible_terminator_offset: None,
            },
            Err(LexRawStrError::InvalidStarter),
        );
    }
}
Loading

0 comments on commit 629e97a

Please sign in to comment.