Lexer improvements #99884

Merged · 7 commits · Aug 1, 2022
10 changes: 6 additions & 4 deletions compiler/rustc_ast/src/util/comments.rs
@@ -194,7 +194,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
     }

     for token in rustc_lexer::tokenize(&text[pos..]) {
-        let token_text = &text[pos..pos + token.len];
+        let token_text = &text[pos..pos + token.len as usize];
         match token.kind {
             rustc_lexer::TokenKind::Whitespace => {
                 if let Some(mut idx) = token_text.find('\n') {
@@ -211,8 +211,10 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
             }
             rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
                 if doc_style.is_none() {
-                    let code_to_the_right =
-                        !matches!(text[pos + token.len..].chars().next(), Some('\r' | '\n'));
+                    let code_to_the_right = !matches!(
+                        text[pos + token.len as usize..].chars().next(),
+                        Some('\r' | '\n')
+                    );
                     let style = match (code_to_the_left, code_to_the_right) {
                         (_, true) => CommentStyle::Mixed,
                         (false, false) => CommentStyle::Isolated,
@@ -246,7 +248,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
                 code_to_the_left = true;
             }
         }
-        pos += token.len;
+        pos += token.len as usize;
     }

     comments
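Note: this hunk is representative of how callers absorb the new `u32` token length — widen to `usize` only at the slice boundary and keep the running position as `usize`. A minimal standalone sketch of the same pattern (`walk_tokens` is hypothetical, not part of this PR; `tokenize`, `Token::len`, and `suffix_start` are the real APIs changed here, and the sketch assumes `rustc_lexer` as a dependency):

```rust
// Hypothetical consumer of rustc_lexer after this change: `Token::len` is
// now `u32`, so it is widened to `usize` only where we index into the text.
fn walk_tokens(text: &str) {
    let mut pos = 0usize;
    for token in rustc_lexer::tokenize(text) {
        let token_text = &text[pos..pos + token.len as usize];
        // `suffix_start` is also `u32` now; widen it the same way, e.g.
        // "12_u8" with suffix_start == 2 splits into ("12", "_u8").
        if let rustc_lexer::TokenKind::Literal { suffix_start, .. } = token.kind {
            let (body, suffix) = token_text.split_at(suffix_start as usize);
            println!("literal body {body:?} with suffix {suffix:?}");
        }
        pos += token.len as usize;
    }
}
```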
4 changes: 2 additions & 2 deletions compiler/rustc_lexer/src/cursor.rs
@@ -61,8 +61,8 @@ impl<'a> Cursor<'a> {
     }

     /// Returns amount of already consumed symbols.
-    pub(crate) fn len_consumed(&self) -> usize {
-        self.initial_len - self.chars.as_str().len()
+    pub(crate) fn len_consumed(&self) -> u32 {
+        (self.initial_len - self.chars.as_str().len()) as u32
     }

     /// Resets the number of bytes consumed to 0.
89 changes: 48 additions & 41 deletions compiler/rustc_lexer/src/lib.rs
@@ -38,18 +38,17 @@ use std::convert::TryFrom;
 #[derive(Debug)]
 pub struct Token {
     pub kind: TokenKind,
-    pub len: usize,
+    pub len: u32,
 }

 impl Token {
-    fn new(kind: TokenKind, len: usize) -> Token {
+    fn new(kind: TokenKind, len: u32) -> Token {
         Token { kind, len }
     }
 }

 /// Enum representing common lexeme types.
-// perf note: Changing all `usize` to `u32` doesn't change performance. See #77629
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum TokenKind {
     // Multi-char tokens:
     /// "// comment"
@@ -76,7 +75,7 @@ pub enum TokenKind {
     /// tokens.
     UnknownPrefix,
     /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details.
-    Literal { kind: LiteralKind, suffix_start: usize },
+    Literal { kind: LiteralKind, suffix_start: u32 },
     /// "'a"
     Lifetime { starts_with_number: bool },

@@ -160,26 +159,24 @@ pub enum LiteralKind {
     Str { terminated: bool },
     /// "b"abc"", "b"abc"
     ByteStr { terminated: bool },
-    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr { n_hashes: u8, err: Option<RawStrError> },
-    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr { n_hashes: u8, err: Option<RawStrError> },
+    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
+    /// an invalid literal.
+    RawStr { n_hashes: Option<u8> },
+    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
+    /// indicates an invalid literal.
+    RawByteStr { n_hashes: Option<u8> },
 }

-/// Error produced validating a raw string. Represents cases like:
-/// - `r##~"abcde"##`: `InvalidStarter`
-/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
-/// - Too many `#`s (>255): `TooManyDelimiters`
-// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum RawStrError {
-    /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
+    /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
     InvalidStarter { bad_char: char },
-    /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
-    /// may have intended to terminate it.
-    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
+    /// The string was not terminated, e.g. `r###"abcde"##`.
+    /// `possible_terminator_offset` is the number of characters after `r` or
+    /// `br` where they may have intended to terminate it.
+    NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
     /// More than 255 `#`s exist.
-    TooManyDelimiters { found: usize },
+    TooManyDelimiters { found: u32 },
 }

 /// Base of numeric literal encoding according to its prefix.
@@ -221,11 +218,25 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
 }

 /// Parses the first token from the provided input string.
+#[inline]
 pub fn first_token(input: &str) -> Token {
     debug_assert!(!input.is_empty());
     Cursor::new(input).advance_token()
 }

+/// Validates a raw string literal. Used for getting more information about a
+/// problem with a `RawStr`/`RawByteStr` with a `None` field.
+#[inline]
+pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
+    debug_assert!(!input.is_empty());
+    let mut cursor = Cursor::new(input);
+    // Move past the leading `r` or `br`.
+    for _ in 0..prefix_len {
+        cursor.bump().unwrap();
+    }
+    cursor.raw_double_quoted_string(prefix_len).map(|_| ())
+}
+
 /// Creates an iterator that produces tokens from the input string.
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     let mut cursor = Cursor::new(input);
@@ -315,12 +326,12 @@ impl Cursor<'_> {
             'r' => match (self.first(), self.second()) {
                 ('#', c1) if is_id_start(c1) => self.raw_ident(),
                 ('#', _) | ('"', _) => {
-                    let (n_hashes, err) = self.raw_double_quoted_string(1);
+                    let res = self.raw_double_quoted_string(1);
                     let suffix_start = self.len_consumed();
-                    if err.is_none() {
+                    if res.is_ok() {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawStr { n_hashes, err };
+                    let kind = RawStr { n_hashes: res.ok() };
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident_or_unknown_prefix(),
@@ -350,12 +361,12 @@ impl Cursor<'_> {
                 }
                 ('r', '"') | ('r', '#') => {
                     self.bump();
-                    let (n_hashes, err) = self.raw_double_quoted_string(2);
+                    let res = self.raw_double_quoted_string(2);
                     let suffix_start = self.len_consumed();
-                    if err.is_none() {
+                    if res.is_ok() {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawByteStr { n_hashes, err };
+                    let kind = RawByteStr { n_hashes: res.ok() };
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident_or_unknown_prefix(),
@@ -698,19 +709,18 @@ impl Cursor<'_> {
     }

     /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
-    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u8, Option<RawStrError>) {
+    fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
         // Wrap the actual function to handle the error with too many hashes.
         // This way, it eats the whole raw string.
-        let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
+        let n_hashes = self.raw_string_unvalidated(prefix_len)?;
         // Only up to 255 `#`s are allowed in raw strings
         match u8::try_from(n_hashes) {
-            Ok(num) => (num, err),
-            // We lie about the number of hashes here :P
-            Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
+            Ok(num) => Ok(num),
+            Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }),
         }
     }

-    fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
+    fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
         debug_assert!(self.prev() == 'r');
         let start_pos = self.len_consumed();
         let mut possible_terminator_offset = None;
@@ -729,7 +739,7 @@ impl Cursor<'_> {
             Some('"') => (),
             c => {
                 let c = c.unwrap_or(EOF_CHAR);
-                return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
+                return Err(RawStrError::InvalidStarter { bad_char: c });
             }
         }

@@ -739,14 +749,11 @@ impl Cursor<'_> {
             self.eat_while(|c| c != '"');

             if self.is_eof() {
-                return (
-                    n_start_hashes,
-                    Some(RawStrError::NoTerminator {
-                        expected: n_start_hashes,
-                        found: max_hashes,
-                        possible_terminator_offset,
-                    }),
-                );
+                return Err(RawStrError::NoTerminator {
+                    expected: n_start_hashes,
+                    found: max_hashes,
+                    possible_terminator_offset,
+                });
             }

             // Eat closing double quote.
@@ -764,7 +771,7 @@ impl Cursor<'_> {
             }

             if n_end_hashes == n_start_hashes {
-                return (n_start_hashes, None);
+                return Ok(n_start_hashes);
             } else if n_end_hashes > max_hashes {
                 // Keep track of possible terminators to give a hint about
                 // where there might be a missing terminator
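Note: taken together, the new shape is a two-step flow — the hot path records only `Option<u8>`, and anyone needing a diagnostic re-lexes the literal through the new public `validate_raw_str`. A sketch of that flow (`explain_raw_str` is hypothetical and assumes `input` starts at the literal; it uses only the public items added or changed above):

```rust
use rustc_lexer::{first_token, validate_raw_str, LiteralKind, RawStrError, TokenKind};

// Hypothetical diagnostic path: cheap lex first, re-validate only on `None`.
fn explain_raw_str(input: &str) {
    let token = first_token(input);
    if let TokenKind::Literal { kind: LiteralKind::RawStr { n_hashes }, .. } = token.kind {
        match n_hashes {
            Some(n) => println!("valid raw string with {n} `#`s"),
            // `prefix_len` is 1 for `r...` literals (2 would be for `br...`).
            None => match validate_raw_str(input, 1) {
                Err(RawStrError::InvalidStarter { bad_char }) => {
                    println!("expected `#` or `\"` after `r`, found {bad_char:?}")
                }
                Err(err) => println!("invalid raw string: {err:?}"),
                Ok(()) => unreachable!("`n_hashes: None` implies validation fails"),
            },
        }
    }
}
```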
50 changes: 22 additions & 28 deletions compiler/rustc_lexer/src/tests.rs
@@ -2,42 +2,39 @@ use super::*;

 use expect_test::{expect, Expect};

-fn check_raw_str(s: &str, expected_hashes: u8, expected_err: Option<RawStrError>) {
+fn check_raw_str(s: &str, expected: Result<u8, RawStrError>) {
     let s = &format!("r{}", s);
     let mut cursor = Cursor::new(s);
     cursor.bump();
-    let (n_hashes, err) = cursor.raw_double_quoted_string(0);
-    assert_eq!(n_hashes, expected_hashes);
-    assert_eq!(err, expected_err);
+    let res = cursor.raw_double_quoted_string(0);
+    assert_eq!(res, expected);
 }

 #[test]
 fn test_naked_raw_str() {
-    check_raw_str(r#""abc""#, 0, None);
+    check_raw_str(r#""abc""#, Ok(0));
 }

 #[test]
 fn test_raw_no_start() {
-    check_raw_str(r##""abc"#"##, 0, None);
+    check_raw_str(r##""abc"#"##, Ok(0));
 }

 #[test]
 fn test_too_many_terminators() {
     // this error is handled in the parser later
-    check_raw_str(r###"#"abc"##"###, 1, None);
+    check_raw_str(r###"#"abc"##"###, Ok(1));
 }

 #[test]
 fn test_unterminated() {
     check_raw_str(
         r#"#"abc"#,
-        1,
-        Some(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
+        Err(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
     );
     check_raw_str(
         r###"##"abc"#"###,
-        2,
-        Some(RawStrError::NoTerminator {
+        Err(RawStrError::NoTerminator {
             expected: 2,
             found: 1,
             possible_terminator_offset: Some(7),
@@ -46,41 +43,38 @@ fn test_unterminated() {
     // We're looking for "# not just any #
     check_raw_str(
         r###"##"abc#"###,
-        2,
-        Some(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
+        Err(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
     )
 }

 #[test]
 fn test_invalid_start() {
-    check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
+    check_raw_str(r##"#~"abc"#"##, Err(RawStrError::InvalidStarter { bad_char: '~' }));
 }

 #[test]
 fn test_unterminated_no_pound() {
     // https://github.com/rust-lang/rust/issues/70677
     check_raw_str(
         r#"""#,
-        0,
-        Some(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
+        Err(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
     );
 }

 #[test]
 fn test_too_many_hashes() {
     let max_count = u8::MAX;
-    let mut hashes: String = "#".repeat(max_count.into());
+    let hashes1 = "#".repeat(max_count as usize);
+    let hashes2 = "#".repeat(max_count as usize + 1);
+    let middle = "\"abc\"";
+    let s1 = [&hashes1, middle, &hashes1].join("");
+    let s2 = [&hashes2, middle, &hashes2].join("");

-    // Valid number of hashes (255 = 2^8 - 1 = u8::MAX), but invalid string.
-    check_raw_str(&hashes, max_count, Some(RawStrError::InvalidStarter { bad_char: '\u{0}' }));
+    // Valid number of hashes (255 = 2^8 - 1 = u8::MAX).
+    check_raw_str(&s1, Ok(255));

     // One more hash sign (256 = 2^8) becomes too many.
-    hashes.push('#');
-    check_raw_str(
-        &hashes,
-        0,
-        Some(RawStrError::TooManyDelimiters { found: usize::from(max_count) + 1 }),
-    );
+    check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
 }

 #[test]
@@ -251,7 +245,7 @@ fn raw_string() {
     check_lexing(
         "r###\"\"#a\\b\x00c\"\"###",
         expect![[r#"
-            Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 17 }, len: 17 }
+            Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 17 }, len: 17 }
         "#]],
     )
 }
@@ -295,9 +289,9 @@ br###"raw"###suffix
             Token { kind: Whitespace, len: 1 }
             Token { kind: Literal { kind: Int { base: Decimal, empty_int: false }, suffix_start: 1 }, len: 3 }
             Token { kind: Whitespace, len: 1 }
-            Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 12 }, len: 18 }
+            Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 12 }, len: 18 }
             Token { kind: Whitespace, len: 1 }
-            Token { kind: Literal { kind: RawByteStr { n_hashes: 3, err: None }, suffix_start: 13 }, len: 19 }
+            Token { kind: Literal { kind: RawByteStr { n_hashes: Some(3) }, suffix_start: 13 }, len: 19 }
             Token { kind: Whitespace, len: 1 }
         "#]],
     )
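Note: the three `RawStrError` cases these tests exercise internally should also be reachable through the public `validate_raw_str` entry point. A standalone sketch (assumed usage, not part of the PR's test suite; the exact `found`/`expected` values reflect my reading of the new control flow):

```rust
use rustc_lexer::{validate_raw_str, RawStrError};

fn main() {
    // `~` between the hashes and the opening quote: InvalidStarter.
    assert!(matches!(
        validate_raw_str("r##~\"abcde\"##", 1),
        Err(RawStrError::InvalidStarter { bad_char: '~' })
    ));

    // Three opening `#`s but only two closing ones: NoTerminator.
    assert!(matches!(
        validate_raw_str("r###\"abcde\"##", 1),
        Err(RawStrError::NoTerminator { expected: 3, found: 2, .. })
    ));

    // 256 `#`s on each side: one past the `u8::MAX` cap of 255.
    let too_many = format!("r{0}\"abc\"{0}", "#".repeat(256));
    assert!(matches!(
        validate_raw_str(&too_many, 1),
        Err(RawStrError::TooManyDelimiters { found: 256 })
    ));
}
```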