Auto merge of #99884 - nnethercote:lexer-improvements, r=matklad
Lexer improvements

Some cleanups and small speed improvements.

r? `@matklad`
bors committed Aug 1, 2022
2 parents 1f5d8d4 + 99f5c79 commit dcb444a
Showing 10 changed files with 128 additions and 117 deletions.
10 changes: 6 additions & 4 deletions compiler/rustc_ast/src/util/comments.rs
@@ -194,7 +194,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comment>
}

for token in rustc_lexer::tokenize(&text[pos..]) {
- let token_text = &text[pos..pos + token.len];
+ let token_text = &text[pos..pos + token.len as usize];
match token.kind {
rustc_lexer::TokenKind::Whitespace => {
if let Some(mut idx) = token_text.find('\n') {
@@ -211,8 +211,10 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comment>
}
rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
if doc_style.is_none() {
- let code_to_the_right =
- !matches!(text[pos + token.len..].chars().next(), Some('\r' | '\n'));
+ let code_to_the_right = !matches!(
+ text[pos + token.len as usize..].chars().next(),
+ Some('\r' | '\n')
+ );
let style = match (code_to_the_left, code_to_the_right) {
(_, true) => CommentStyle::Mixed,
(false, false) => CommentStyle::Isolated,
@@ -246,7 +248,7 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comment>
code_to_the_left = true;
}
}
- pos += token.len;
+ pos += token.len as usize;
}

comments
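
The `as usize` casts in this file follow from `Token::len` becoming a `u32` (see the `rustc_lexer` changes below) while string slicing requires `usize`. A minimal sketch of the consuming pattern, using a stand-in `Token` rather than the real `rustc_lexer` types:

    // Stand-in mirroring the new field type; the real `rustc_lexer::Token`
    // also carries a `kind`.
    struct Token { len: u32 }

    fn walk(text: &str, tokens: impl Iterator<Item = Token>) {
        let mut pos = 0usize;
        for token in tokens {
            // Widen the `u32` length at the use site; slicing needs `usize`.
            let token_text = &text[pos..pos + token.len as usize];
            let _ = token_text; // ... classify the token text here ...
            pos += token.len as usize;
        }
    }
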
4 changes: 2 additions & 2 deletions compiler/rustc_lexer/src/cursor.rs
@@ -61,8 +61,8 @@ impl<'a> Cursor<'a> {
}

/// Returns amount of already consumed symbols.
- pub(crate) fn len_consumed(&self) -> usize {
- self.initial_len - self.chars.as_str().len()
+ pub(crate) fn len_consumed(&self) -> u32 {
+ (self.initial_len - self.chars.as_str().len()) as u32
}

/// Resets the number of bytes consumed to 0.
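
Returning `u32` is lossless here because rustc caps the addressable size of a single source file at 32 bits (`BytePos` is a `u32`), so a consumed-byte count always fits. A self-contained sketch of the subtraction trick behind `len_consumed`:

    use std::str::Chars;

    // Stand-in cursor with the same two fields the real one uses for
    // length tracking: the original input length and the remaining chars.
    struct Cursor<'a> {
        initial_len: usize,
        chars: Chars<'a>,
    }

    impl<'a> Cursor<'a> {
        fn new(input: &'a str) -> Cursor<'a> {
            Cursor { initial_len: input.len(), chars: input.chars() }
        }

        // The consumed length is derived, not stored: bytes remaining in
        // `chars` subtracted from the original length.
        fn len_consumed(&self) -> u32 {
            (self.initial_len - self.chars.as_str().len()) as u32
        }
    }

    fn main() {
        let mut cursor = Cursor::new("héllo");
        cursor.chars.next(); // 'h', 1 byte
        cursor.chars.next(); // 'é', 2 bytes
        assert_eq!(cursor.len_consumed(), 3);
    }
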
89 changes: 48 additions & 41 deletions compiler/rustc_lexer/src/lib.rs
@@ -38,18 +38,17 @@ use std::convert::TryFrom;
#[derive(Debug)]
pub struct Token {
pub kind: TokenKind,
- pub len: usize,
+ pub len: u32,
}

impl Token {
- fn new(kind: TokenKind, len: usize) -> Token {
+ fn new(kind: TokenKind, len: u32) -> Token {
Token { kind, len }
}
}

/// Enum representing common lexeme types.
- // perf note: Changing all `usize` to `u32` doesn't change performance. See #77629
- #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+ #[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
// Multi-char tokens:
/// "// comment"
@@ -76,7 +75,7 @@ pub enum TokenKind {
/// tokens.
UnknownPrefix,
/// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details.
- Literal { kind: LiteralKind, suffix_start: usize },
+ Literal { kind: LiteralKind, suffix_start: u32 },
/// "'a"
Lifetime { starts_with_number: bool },

@@ -160,26 +159,24 @@ pub enum LiteralKind {
Str { terminated: bool },
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
RawStr { n_hashes: u8, err: Option<RawStrError> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
RawByteStr { n_hashes: u8, err: Option<RawStrError> },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
/// an invalid literal.
RawStr { n_hashes: Option<u8> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
/// indicates an invalid literal.
RawByteStr { n_hashes: Option<u8> },
}
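
Folding the error into `Option<u8>` also shrinks the literal kinds: `Option<RawStrError>` must reserve space for the error payload, while `Option<u8>` is two bytes. A standalone illustration with stand-in types (exact sizes are layout-dependent, but the direction is stable):

    use std::mem::size_of;

    // Stand-in mirroring the largest `RawStrError` variant.
    #[allow(dead_code)]
    #[derive(Clone, Copy)]
    enum RawStrError {
        NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
    }

    type OldFields = (u8, Option<RawStrError>); // n_hashes plus err
    type NewFields = Option<u8>; // n_hashes; `None` means invalid

    fn main() {
        // `u8` has no spare bit patterns, so `Option<u8>` adds one tag
        // byte (2 bytes total on current rustc).
        assert_eq!(size_of::<NewFields>(), 2);
        // The old pair reserves room for the whole error even when absent.
        assert!(size_of::<OldFields>() > size_of::<NewFields>());
    }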

/// Error produced validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `InvalidStarter`
/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
/// - Too many `#`s (>255): `TooManyDelimiters`
- // perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
- /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
+ /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
InvalidStarter { bad_char: char },
- /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
- /// may have intended to terminate it.
- NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
+ /// The string was not terminated, e.g. `r###"abcde"##`.
+ /// `possible_terminator_offset` is the number of characters after `r` or
+ /// `br` where they may have intended to terminate it.
+ NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> },
/// More than 255 `#`s exist.
- TooManyDelimiters { found: usize },
+ TooManyDelimiters { found: u32 },
}

/// Base of numeric literal encoding according to its prefix.
@@ -221,11 +218,25 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
}

/// Parses the first token from the provided input string.
+ #[inline]
pub fn first_token(input: &str) -> Token {
debug_assert!(!input.is_empty());
Cursor::new(input).advance_token()
}

+ /// Validates a raw string literal. Used for getting more information about a
+ /// problem with a `RawStr`/`RawByteStr` with a `None` field.
+ #[inline]
+ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
+ debug_assert!(!input.is_empty());
+ let mut cursor = Cursor::new(input);
+ // Move past the leading `r` or `br`.
+ for _ in 0..prefix_len {
+ cursor.bump().unwrap();
+ }
+ cursor.raw_double_quoted_string(prefix_len).map(|_| ())
+ }
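
A hypothetical caller of the new function, written as if inside this crate where `validate_raw_str` is in scope; the input slice is assumed to start at the literal's `r` (so `prefix_len` is 1, or 2 for a `br` literal):

    fn explain(literal_src: &str) {
        // Re-lex only on the error path; well-formed literals never pay
        // for error reconstruction.
        match validate_raw_str(literal_src, 1) {
            Ok(()) => println!("raw string literal is well-formed"),
            Err(err) => println!("invalid raw string literal: {err:?}"),
        }
    }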

/// Creates an iterator that produces tokens from the input string.
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
let mut cursor = Cursor::new(input);
@@ -315,12 +326,12 @@ impl Cursor<'_> {
'r' => match (self.first(), self.second()) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
- let (n_hashes, err) = self.raw_double_quoted_string(1);
+ let res = self.raw_double_quoted_string(1);
let suffix_start = self.len_consumed();
- if err.is_none() {
+ if res.is_ok() {
self.eat_literal_suffix();
}
- let kind = RawStr { n_hashes, err };
+ let kind = RawStr { n_hashes: res.ok() };
Literal { kind, suffix_start }
}
_ => self.ident_or_unknown_prefix(),
@@ -350,12 +361,12 @@ impl Cursor<'_> {
}
('r', '"') | ('r', '#') => {
self.bump();
- let (n_hashes, err) = self.raw_double_quoted_string(2);
+ let res = self.raw_double_quoted_string(2);
let suffix_start = self.len_consumed();
- if err.is_none() {
+ if res.is_ok() {
self.eat_literal_suffix();
}
- let kind = RawByteStr { n_hashes, err };
+ let kind = RawByteStr { n_hashes: res.ok() };
Literal { kind, suffix_start }
}
_ => self.ident_or_unknown_prefix(),
@@ -698,19 +709,18 @@ impl Cursor<'_> {
}

/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
- fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u8, Option<RawStrError>) {
+ fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
// Wrap the actual function to handle the error with too many hashes.
// This way, it eats the whole raw string.
- let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
+ let n_hashes = self.raw_string_unvalidated(prefix_len)?;
// Only up to 255 `#`s are allowed in raw strings
match u8::try_from(n_hashes) {
- Ok(num) => (num, err),
- // We lie about the number of hashes here :P
- Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
+ Ok(num) => Ok(num),
+ Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }),
}
}

- fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
+ fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
debug_assert!(self.prev() == 'r');
let start_pos = self.len_consumed();
let mut possible_terminator_offset = None;
Expand All @@ -729,7 +739,7 @@ impl Cursor<'_> {
Some('"') => (),
c => {
let c = c.unwrap_or(EOF_CHAR);
- return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
+ return Err(RawStrError::InvalidStarter { bad_char: c });
}
}

Expand All @@ -739,14 +749,11 @@ impl Cursor<'_> {
self.eat_while(|c| c != '"');

if self.is_eof() {
- return (
- n_start_hashes,
- Some(RawStrError::NoTerminator {
- expected: n_start_hashes,
- found: max_hashes,
- possible_terminator_offset,
- }),
- );
+ return Err(RawStrError::NoTerminator {
+ expected: n_start_hashes,
+ found: max_hashes,
+ possible_terminator_offset,
+ });
}

// Eat closing double quote.
Expand All @@ -764,7 +771,7 @@ impl Cursor<'_> {
}

if n_end_hashes == n_start_hashes {
- return (n_start_hashes, None);
+ return Ok(n_start_hashes);
} else if n_end_hashes > max_hashes {
// Keep track of possible terminators to give a hint about
// where there might be a missing terminator
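
For contrast, a reduced before/after of the calling convention this hunk migrates, showing what `?` and `.ok()` buy (toy functions, not the real lexer):

    // Stand-in error type for the sketch.
    #[allow(dead_code)]
    #[derive(Debug, PartialEq)]
    struct TooMany { found: u32 }

    // Before: a tuple of "answer plus maybe-error"; callers must remember
    // to inspect the second half, and propagation is manual.
    fn scan_old(n: u32) -> (u32, Option<TooMany>) {
        if n > 255 { (0, Some(TooMany { found: n })) } else { (n, None) }
    }

    // After: a `Result`, so `?` propagates failures upward and `.ok()`
    // collapses the outcome into the `Option<u8>` stored on the token.
    fn scan_new(n: u32) -> Result<u8, TooMany> {
        u8::try_from(n).map_err(|_| TooMany { found: n })
    }

    fn main() {
        let (_n, err) = scan_old(300);
        assert!(err.is_some());
        assert_eq!(scan_new(300).ok(), None); // what `res.ok()` produces
        assert_eq!(scan_new(3).ok(), Some(3));
    }
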
50 changes: 22 additions & 28 deletions compiler/rustc_lexer/src/tests.rs
@@ -2,42 +2,39 @@ use super::*;

use expect_test::{expect, Expect};

- fn check_raw_str(s: &str, expected_hashes: u8, expected_err: Option<RawStrError>) {
+ fn check_raw_str(s: &str, expected: Result<u8, RawStrError>) {
let s = &format!("r{}", s);
let mut cursor = Cursor::new(s);
cursor.bump();
- let (n_hashes, err) = cursor.raw_double_quoted_string(0);
- assert_eq!(n_hashes, expected_hashes);
- assert_eq!(err, expected_err);
+ let res = cursor.raw_double_quoted_string(0);
+ assert_eq!(res, expected);
}

#[test]
fn test_naked_raw_str() {
check_raw_str(r#""abc""#, 0, None);
check_raw_str(r#""abc""#, Ok(0));
}

#[test]
fn test_raw_no_start() {
check_raw_str(r##""abc"#"##, 0, None);
check_raw_str(r##""abc"#"##, Ok(0));
}

#[test]
fn test_too_many_terminators() {
// this error is handled in the parser later
check_raw_str(r###"#"abc"##"###, 1, None);
check_raw_str(r###"#"abc"##"###, Ok(1));
}

#[test]
fn test_unterminated() {
check_raw_str(
r#"#"abc"#,
- 1,
- Some(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
+ Err(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }),
);
check_raw_str(
r###"##"abc"#"###,
- 2,
- Some(RawStrError::NoTerminator {
+ Err(RawStrError::NoTerminator {
expected: 2,
found: 1,
possible_terminator_offset: Some(7),
@@ -46,41 +43,38 @@ fn test_unterminated() {
// We're looking for "# not just any #
check_raw_str(
r###"##"abc#"###,
- 2,
- Some(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
+ Err(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }),
)
}

#[test]
fn test_invalid_start() {
check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' }));
check_raw_str(r##"#~"abc"#"##, Err(RawStrError::InvalidStarter { bad_char: '~' }));
}

#[test]
fn test_unterminated_no_pound() {
// https://github.com/rust-lang/rust/issues/70677
check_raw_str(
r#"""#,
- 0,
- Some(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
+ Err(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }),
);
}

#[test]
fn test_too_many_hashes() {
let max_count = u8::MAX;
- let mut hashes: String = "#".repeat(max_count.into());
+ let hashes1 = "#".repeat(max_count as usize);
+ let hashes2 = "#".repeat(max_count as usize + 1);
+ let middle = "\"abc\"";
+ let s1 = [&hashes1, middle, &hashes1].join("");
+ let s2 = [&hashes2, middle, &hashes2].join("");

- // Valid number of hashes (255 = 2^8 - 1 = u8::MAX), but invalid string.
- check_raw_str(&hashes, max_count, Some(RawStrError::InvalidStarter { bad_char: '\u{0}' }));
+ // Valid number of hashes (255 = 2^8 - 1 = u8::MAX).
+ check_raw_str(&s1, Ok(255));

// One more hash sign (256 = 2^8) becomes too many.
- hashes.push('#');
- check_raw_str(
- &hashes,
- 0,
- Some(RawStrError::TooManyDelimiters { found: usize::from(max_count) + 1 }),
- );
+ check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
}
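
The boundary in this test comes straight from the `u8` storage: 255 hashes still round-trip through `u8::try_from`, while 256 do not. A minimal check of that edge, independent of the lexer:

    // The cap exists because `n_hashes` is stored as a `u8` on the token.
    fn cap(n_hashes: u32) -> Option<u8> {
        u8::try_from(n_hashes).ok()
    }

    fn main() {
        assert_eq!(cap(255), Some(255)); // u8::MAX: still representable
        assert_eq!(cap(256), None);      // one more: TooManyDelimiters
    }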

#[test]
@@ -251,7 +245,7 @@ fn raw_string() {
check_lexing(
"r###\"\"#a\\b\x00c\"\"###",
expect![[r#"
- Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 17 }, len: 17 }
+ Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 17 }, len: 17 }
"#]],
)
}
@@ -295,9 +289,9 @@ br###"raw"###suffix
Token { kind: Whitespace, len: 1 }
Token { kind: Literal { kind: Int { base: Decimal, empty_int: false }, suffix_start: 1 }, len: 3 }
Token { kind: Whitespace, len: 1 }
- Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 12 }, len: 18 }
+ Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 12 }, len: 18 }
Token { kind: Whitespace, len: 1 }
- Token { kind: Literal { kind: RawByteStr { n_hashes: 3, err: None }, suffix_start: 13 }, len: 19 }
+ Token { kind: Literal { kind: RawByteStr { n_hashes: Some(3) }, suffix_start: 13 }, len: 19 }
Token { kind: Whitespace, len: 1 }
"#]],
)
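
Taken together, client code for the reworked API might look like the following sketch (assuming `rustc_lexer` is importable, e.g. within the compiler or through `rustc_private`):

    use rustc_lexer::{tokenize, validate_raw_str, LiteralKind, TokenKind};

    fn report_bad_raw_strings(src: &str) {
        let mut pos = 0;
        for token in tokenize(src) {
            let len = token.len as usize;
            if let TokenKind::Literal {
                kind: LiteralKind::RawStr { n_hashes: None }, ..
            } = token.kind
            {
                // Only on the error path: re-scan this one literal to
                // recover the precise `RawStrError` for diagnostics.
                let err = validate_raw_str(&src[pos..pos + len], 1).unwrap_err();
                eprintln!("bad raw string at byte {pos}: {err:?}");
            }
            pos += len;
        }
    }
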
(Diff not loaded for the remaining six changed files.)
