Auto merge of #99884 - nnethercote:lexer-improvements, r=matklad
Lexer improvements

Some cleanups and small speed improvements.

r? `@matklad`
bors committed Aug 1, 2022
2 parents 1f5d8d4 + 99f5c79 commit dcb444a
Showing 10 changed files with 128 additions and 117 deletions.
10 changes: 6 additions & 4 deletions compiler/rustc_ast/src/util/comments.rs
@@ -194,7 +194,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comment>
}

for token in rustc_lexer::tokenize(&text[pos..]) {
- let token_text = &text[pos..pos + token.len];
+ let token_text = &text[pos..pos + token.len as usize];
match token.kind {
rustc_lexer::TokenKind::Whitespace => {
if let Some(mut idx) = token_text.find('\n') {
@@ -211,8 +211,10 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comment>
}
rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
if doc_style.is_none() {
- let code_to_the_right =
- !matches!(text[pos + token.len..].chars().next(), Some('\r' | '\n'));
+ let code_to_the_right = !matches!(
+ text[pos + token.len as usize..].chars().next(),
+ Some('\r' | '\n')
+ );
let style = match (code_to_the_left, code_to_the_right) {
(_, true) => CommentStyle::Mixed,
(false, false) => CommentStyle::Isolated,
@@ -246,7 +248,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comment>
code_to_the_left = true;
}
}
- pos += token.len;
+ pos += token.len as usize;
}

comments
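
The `as usize` casts in this file follow from `Token::len` becoming a `u32` (see the `rustc_lexer` changes below) while string slicing requires `usize`. A minimal sketch of the consuming pattern, using a stand-in `Token` rather than the real `rustc_lexer` types:

    // Stand-in mirroring the new field type; the real `rustc_lexer::Token`
    // also carries a `kind`.
    struct Token { len: u32 }

    fn walk(text: &str, tokens: impl Iterator<Item = Token>) {
        let mut pos = 0usize;
        for token in tokens {
            // Widen the `u32` length at the use site; slicing needs `usize`.
            let token_text = &text[pos..pos + token.len as usize];
            let _ = token_text; // ... classify the token text here ...
            pos += token.len as usize;
        }
    }
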
4 changes: 2 additions & 2 deletions compiler/rustc_lexer/src/cursor.rs
@@ -61,8 +61,8 @@ impl<'a> Cursor<'a> {
}

/// Returns amount of already consumed symbols.
- pub(crate) fn len_consumed(&self) -> usize {
- self.initial_len - self.chars.as_str().len()
+ pub(crate) fn len_consumed(&self) -> u32 {
+ (self.initial_len - self.chars.as_str().len()) as u32
}

/// Resets the number of bytes consumed to 0.
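
Returning `u32` is lossless here because rustc caps the addressable size of a single source file at 32 bits (`BytePos` is a `u32`), so a consumed-byte count always fits. A self-contained sketch of the subtraction trick behind `len_consumed`:

    use std::str::Chars;

    // Stand-in cursor with the same two fields the real one uses for
    // length tracking: the original input length and the remaining chars.
    struct Cursor<'a> {
        initial_len: usize,
        chars: Chars<'a>,
    }

    impl<'a> Cursor<'a> {
        fn new(input: &'a str) -> Cursor<'a> {
            Cursor { initial_len: input.len(), chars: input.chars() }
        }

        // The consumed length is derived, not stored: bytes remaining in
        // `chars` subtracted from the original length.
        fn len_consumed(&self) -> u32 {
            (self.initial_len - self.chars.as_str().len()) as u32
        }
    }

    fn main() {
        let mut cursor = Cursor::new("héllo");
        cursor.chars.next(); // 'h', 1 byte
        cursor.chars.next(); // 'é', 2 bytes
        assert_eq!(cursor.len_consumed(), 3);
    }
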
89 changes: 48 additions & 41 deletions compiler/rustc_lexer/src/lib.rs
@@ -38,18 +38,17 @@ use std::convert::TryFrom;
#[derive(Debug)]
pub struct Token {
pub kind: TokenKind,
- pub len: usize,
+ pub len: u32,
}

impl Token {
- fn new(kind: TokenKind, len: usize) -> Token {
+ fn new(kind: TokenKind, len: u32) -> Token {
Token { kind, len }
}
}

/// Enum representing common lexeme types.
- // perf note: Changing all `usize` to `u32` doesn't change performance. See #77629
- #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+ #[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
// Multi-char tokens:
/// "// comment"
@@ -76,7 +75,7 @@ pub enum TokenKind {
/// tokens.
UnknownPrefix,
/// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details.
- Literal { kind: LiteralKind, suffix_start: usize },
+ Literal { kind: LiteralKind, suffix_start: u32 },
/// "'a"
Lifetime { starts_with_number: bool },

@@ -160,26 +159,24 @@ pub enum LiteralKind {
Str { terminated: bool },
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
RawStr { n_hashes: u8, err: Option<RawStrError> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
RawByteStr { n_hashes: u8, err: Option<RawStrError> },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
/// an invalid literal.
RawStr { n_hashes: Option<u8> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
/// indicates an invalid literal.
RawByteStr { n_hashes: Option<u8> },
}
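
Folding the error into `Option<u8>` also shrinks the literal kinds: `Option<RawStrError>` must reserve space for the error payload, while `Option<u8>` is two bytes. A standalone illustration with stand-in types (exact sizes are layout-dependent, but the direction is stable):

    use std::mem::size_of;

    // Stand-in mirroring the largest `RawStrError` variant.
    #[allow(dead_code)]
    #[derive(Clone, Copy)]
    enum RawStrError {
        NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
    }

    type OldFields = (u8, Option<RawStrError>); // n_hashes plus err
    type NewFields = Option<u8>; // n_hashes; `None` means invalid

    fn main() {
        // `u8` has no spare bit patterns, so `Option<u8>` adds one tag
        // byte (2 bytes total on current rustc).
        assert_eq!(size_of::<NewFields>(), 2);
        // The old pair reserves room for the whole error even when absent.
        assert!(size_of::<OldFields>() > size_of::<NewFields>());
    }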

/// Error produced validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `InvalidStarter`
/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
/// - Too many `#`s (>255): `TooManyDelimiters`
- // perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
- /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
+ /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
InvalidStarter { bad_char: char },
- /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
- /// may have intended to terminate it.
- NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
+ /// The string was not terminated, e.g. `r###"abcde"##`.
+ /// `possible_terminator_offset` is the number of characters after `r` or
+ /// `br` where they may have intended to terminate it.
+ NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
/// More than 255 `#`s exist.
- TooManyDelimiters { found: usize },
+ TooManyDelimiters { found: u32 },
}

/// Base of numeric literal encoding according to its prefix.
@@ -221,11 +218,25 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
}

/// Parses the first token from the provided input string.
+ #[inline]
pub fn first_token(input: &str) -> Token {
debug_assert!(!input.is_empty());
Cursor::new(input).advance_token()
}

+ /// Validates a raw string literal. Used for getting more information about a
+ /// problem with a `RawStr`/`RawByteStr` with a `None` field.
+ #[inline]
+ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
+ debug_assert!(!input.is_empty());
+ let mut cursor = Cursor::new(input);
+ // Move past the leading `r` or `br`.
+ for _ in 0..prefix_len {
+ cursor.bump().unwrap();
+ }
+ cursor.raw_double_quoted_string(prefix_len).map(|_| ())
+ }
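
A hypothetical caller of the new function, written as if inside this crate where `validate_raw_str` is in scope; the input slice is assumed to start at the literal's `r` (so `prefix_len` is 1, or 2 for a `br` literal):

    fn explain(literal_src: &str) {
        // Re-lex only on the error path; well-formed literals never pay
        // for error reconstruction.
        match validate_raw_str(literal_src, 1) {
            Ok(()) => println!("raw string literal is well-formed"),
            Err(err) => println!("invalid raw string literal: {err:?}"),
        }
    }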

/// Creates an iterator that produces tokens from the input string.
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
let mut cursor = Cursor::new(input);
@@ -315,12 +326,12 @@ impl Cursor<'_> {
'r' => match (self.first(), self.second()) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
- let (n_hashes, err) = self.raw_double_quoted_string(1);
+ let res = self.raw_double_quoted_string(1);
let suffix_start = self.len_consumed();
- if err.is_none() {
+ if res.is_ok() {
self.eat_literal_suffix();
}
- let kind = RawStr { n_hashes, err };
+ let kind = RawStr { n_hashes: res.ok() };
Literal { kind, suffix_start }
}
_ => self.ident_or_unknown_prefix(),
@@ -350,12 +361,12 @@ impl Cursor<'_> {
}
('r', '"') | ('r', '#') => {
self.bump();
- let (n_hashes, err) = self.raw_double_quoted_string(2);
+ let res = self.raw_double_quoted_string(2);
let suffix_start = self.len_consumed();
- if err.is_none() {
+ if res.is_ok() {
self.eat_literal_suffix();
}
- let kind = RawByteStr { n_hashes, err };
+ let kind = RawByteStr { n_hashes: res.ok() };
Literal { kind, suffix_start }
}
_ => self.ident_or_unknown_prefix(),
@@ -698,19 +709,18 @@ impl Cursor<'_> {
}

/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
- fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u8, Option<RawStrError>) {
+ fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
// Wrap the actual function to handle the error with too many hashes.
// This way, it eats the whole raw string.
- let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
+ let n_hashes = self.raw_string_unvalidated(prefix_len)?;
// Only up to 255 `#`s are allowed in raw strings
match u8::try_from(n_hashes) {
- Ok(num) => (num, err),
- // We lie about the number of hashes here :P
- Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
+ Ok(num) => Ok(num),
+ Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }),
}
}

- fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
+ fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
debug_assert!(self.prev() == 'r');
let start_pos = self.len_consumed();
let mut possible_terminator_offset = None;
Expand All @@ -729,7 +739,7 @@ impl Cursor<'_> {
Some('"') => (),
c => {
let c = c.unwrap_or(EOF_CHAR);
- return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
+ return Err(RawStrError::InvalidStarter { bad_char: c });
}
}

Expand All @@ -739,14 +749,11 @@ impl Cursor<'_> {
self.eat_while(|c| c != '"');

if self.is_eof() {
- return (
- n_start_hashes,
- Some(RawStrError::NoTerminator {
- expected: n_start_hashes,
- found: max_hashes,
- possible_terminator_offset,
- }),
- );
+ return Err(RawStrError::NoTerminator {
+ expected: n_start_hashes,
+ found: max_hashes,
+ possible_terminator_offset,
+ });
}

// Eat closing double quote.
Expand All @@ -764,7 +771,7 @@ impl Cursor<'_> {
}

if n_end_hashes == n_start_hashes {
- return (n_start_hashes, None);
+ return Ok(n_start_hashes);
} else if n_end_hashes > max_hashes {
// Keep track of possible terminators to give a hint about
// where there might be a missing terminator
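
For contrast, a reduced before/after of the calling convention this hunk migrates, showing what `?` and `.ok()` buy (toy functions, not the real lexer):

    // Stand-in error type for the sketch.
    #[allow(dead_code)]
    #[derive(Debug, PartialEq)]
    struct TooMany { found: u32 }

    // Before: a tuple of "answer plus maybe-error"; callers must remember
    // to inspect the second half, and propagation is manual.
    fn scan_old(n: u32) -> (u32, Option<TooMany>) {
        if n > 255 { (0, Some(TooMany { found: n })) } else { (n, None) }
    }

    // After: a `Result`, so `?` propagates failures upward and `.ok()`
    // collapses the outcome into the `Option<u8>` stored on the token.
    fn scan_new(n: u32) -> Result<u8, TooMany> {
        u8::try_from(n).map_err(|_| TooMany { found: n })
    }

    fn main() {
        let (_n, err) = scan_old(300);
        assert!(err.is_some());
        assert_eq!(scan_new(300).ok(), None); // what `res.ok()` produces
        assert_eq!(scan_new(3).ok(), Some(3));
    }
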
50 changes: 22 additions & 28 deletions compiler/rustc_lexer/src/tests.rs
@@ -2,42 +2,39 @@ use super::*;

use expect_test::{expect, Expect};

- fn check_raw_str(s: &str, expected_hashes: u8, expected_err: Option<RawStrError>) {
+ fn check_raw_str(s: &str, expected: Result<u8, RawStrError>) {
let s = &format!("r{}", s);
let mut cursor = Cursor::new(s);
cursor.bump();
- let (n_hashes, err) = cursor.raw_double_quoted_string(0);
- assert_eq!(n_hashes, expected_hashes);
- assert_eq!(err, expected_err);
+ let res = cursor.raw_double_quoted_string(0);
+ assert_eq!(res, expected);
}

#[test]
fn test_naked_raw_str() {
check_raw_str(r#""abc""#, 0, None);
check_raw_str(r#""abc""#, Ok(0));
}

#[test]
fn test_raw_no_start() {
check_raw_str(r##""abc"#"##, 0, None);
check_raw_str(r##""abc"#"##, Ok(0));
}

#[test]
fn test_too_many_terminators() {
// this error is handled in the parser later
check_raw_str(r###"#"abc"##"###, 1, None);
check_raw_str(r###"#"abc"##"###, Ok(1));
}

#[test]
fn test_unterminated() {
check_raw_str(
r#"#"abc"#,
- 1,
- Some(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
+ Err(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
);
check_raw_str(
r###"##"abc"#"###,
- 2,
- Some(RawStrError::NoTerminator {
+ Err(RawStrError::NoTerminator {
expected: 2,
found: 1,
possible_terminator_offset: Some(7),
@@ -46,41 +43,38 @@ fn test_unterminated() {
// We're looking for "# not just any #
check_raw_str(
r###"##"abc#"###,
- 2,
- Some(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
+ Err(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
)
}

#[test]
fn test_invalid_start() {
check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
check_raw_str(r##"#~"abc"#"##, Err(RawStrError::InvalidStarter { bad_char: '~' }));
}

#[test]
fn test_unterminated_no_pound() {
// https://github.com/rust-lang/rust/issues/70677
check_raw_str(
r#"""#,
- 0,
- Some(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
+ Err(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
);
}

#[test]
fn test_too_many_hashes() {
let max_count = u8::MAX;
- let mut hashes: String = "#".repeat(max_count.into());
+ let hashes1 = "#".repeat(max_count as usize);
+ let hashes2 = "#".repeat(max_count as usize + 1);
+ let middle = "\"abc\"";
+ let s1 = [&hashes1, middle, &hashes1].join("");
+ let s2 = [&hashes2, middle, &hashes2].join("");

- // Valid number of hashes (255 = 2^8 - 1 = u8::MAX), but invalid string.
- check_raw_str(&hashes, max_count, Some(RawStrError::InvalidStarter { bad_char: '\u{0}' }));
+ // Valid number of hashes (255 = 2^8 - 1 = u8::MAX).
+ check_raw_str(&s1, Ok(255));

// One more hash sign (256 = 2^8) becomes too many.
- hashes.push('#');
- check_raw_str(
- &hashes,
- 0,
- Some(RawStrError::TooManyDelimiters { found: usize::from(max_count) + 1 }),
- );
+ check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
}
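
The boundary in this test comes straight from the `u8` storage: 255 hashes still round-trip through `u8::try_from`, while 256 do not. A minimal check of that edge, independent of the lexer:

    // The cap exists because `n_hashes` is stored as a `u8` on the token.
    fn cap(n_hashes: u32) -> Option<u8> {
        u8::try_from(n_hashes).ok()
    }

    fn main() {
        assert_eq!(cap(255), Some(255)); // u8::MAX: still representable
        assert_eq!(cap(256), None);      // one more: TooManyDelimiters
    }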

#[test]
@@ -251,7 +245,7 @@ fn raw_string() {
check_lexing(
"r###\"\"#a\\b\x00c\"\"###",
expect![[r#"
- Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 17 }, len: 17 }
+ Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 17 }, len: 17 }
"#]],
)
}
@@ -295,9 +289,9 @@ br###"raw"###suffix
Token { kind: Whitespace, len: 1 }
Token { kind: Literal { kind: Int { base: Decimal, empty_int: false }, suffix_start: 1 }, len: 3 }
Token { kind: Whitespace, len: 1 }
- Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 12 }, len: 18 }
+ Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 12 }, len: 18 }
Token { kind: Whitespace, len: 1 }
- Token { kind: Literal { kind: RawByteStr { n_hashes: 3, err: None }, suffix_start: 13 }, len: 19 }
+ Token { kind: Literal { kind: RawByteStr { n_hashes: Some(3) }, suffix_start: 13 }, len: 19 }
Token { kind: Whitespace, len: 1 }
"#]],
)
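
Taken together, client code for the reworked API might look like the following sketch (assuming `rustc_lexer` is importable, e.g. within the compiler or through `rustc_private`):

    use rustc_lexer::{tokenize, validate_raw_str, LiteralKind, TokenKind};

    fn report_bad_raw_strings(src: &str) {
        let mut pos = 0;
        for token in tokenize(src) {
            let len = token.len as usize;
            if let TokenKind::Literal {
                kind: LiteralKind::RawStr { n_hashes: None }, ..
            } = token.kind
            {
                // Only on the error path: re-scan this one literal to
                // recover the precise `RawStrError` for diagnostics.
                let err = validate_raw_str(&src[pos..pos + len], 1).unwrap_err();
                eprintln!("bad raw string at byte {pos}: {err:?}");
            }
            pos += len;
        }
    }
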
(Diff not loaded for the remaining six changed files.)
