Merge #3026
3026: ra_syntax: reshape SyntaxError for the sake of removing redundancy r=matklad a=Veetaha

Follow-up to #2911; it also checks off some items on the todo list of #223.

**ACHTUNG!** A big part of this PR's diff consists of changes to test data files.

Simplified `SyntaxError`, which was `SyntaxError { kind: { /* big enum */ }, location: Location }`, to `SyntaxError(String, TextRange)`. I am not sure whether a tuple struct is the best fit here; I am inclined to name the fields, since I already provide the getters `SyntaxError::message()` and `SyntaxError::range()`.
I also removed `Location` altogether ...
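
For reference, here is a minimal sketch of the shape change, with the parts elided above filled in from the diff below; the derives, the `impl Into<String>` signature, and the `text_unit` import path are assumptions:

```rust
use text_unit::{TextRange, TextUnit};

// Before: an error was a kind from a big enum plus a two-variant location.
pub enum Location {
    Offset(TextUnit),
    Range(TextRange),
}

pub enum SyntaxErrorKind {
    TokenizeError(TokenizeError),
    // ... many more variants
}

pub enum TokenizeError {
    EmptyInt,
    // ... one variant per lexer error
}

pub struct OldSyntaxError {
    kind: SyntaxErrorKind,
    location: Location,
}

// After: just a message and the range it applies to.
pub struct SyntaxError(String, TextRange);

impl SyntaxError {
    pub fn new(message: impl Into<String>, range: TextRange) -> SyntaxError {
        SyntaxError(message.into(), range)
    }
    pub fn message(&self) -> &str {
        &self.0
    }
    pub fn range(&self) -> TextRange {
        self.1
    }
}
```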

This is currently WIP, because the following is not done:
- [ ] ~~Add tests to the `test_data` dir for unescape errors. *// I don't know where to put these errors in particular, because they are out of the scope of the lexer and parser. However, I have an idea in mind that we move all the validators we have right now to the parsing stage, but this is up for discussion...*~~ **[UPD]** I came to the conclusion that tree validation logic, of which unescape errors are a part, should be rethought; we currently have no tests and no place to put tests for tree validations. So I'd like to extract the potential redesign (maybe moving tree validation to `ra_parser`) and adding tests for it into a separate task.

Co-authored-by: Veetaha <gerzoh1@gmail.com>
Co-authored-by: Veetaha <veetaha2@gmail.com>
3 people authored Feb 18, 2020
2 parents 742459c + 053ccf4 commit c447fe9
Showing 56 changed files with 452 additions and 650 deletions.
10 changes: 2 additions & 8 deletions crates/ra_ide/src/diagnostics.rs
```diff
@@ -10,7 +10,7 @@ use ra_prof::profile;
 use ra_syntax::{
     algo,
     ast::{self, make, AstNode},
-    Location, SyntaxNode, TextRange, T,
+    SyntaxNode, TextRange, T,
 };
 use ra_text_edit::{TextEdit, TextEditBuilder};
 
@@ -29,7 +29,7 @@ pub(crate) fn diagnostics(db: &RootDatabase, file_id: FileId) -> Vec<Diagnostic>
     let mut res = Vec::new();
 
     res.extend(parse.errors().iter().map(|err| Diagnostic {
-        range: location_to_range(err.location()),
+        range: err.range(),
         message: format!("Syntax Error: {}", err),
         severity: Severity::Error,
         fix: None,
@@ -116,12 +116,6 @@ pub(crate) fn diagnostics(db: &RootDatabase, file_id: FileId) -> Vec<Diagnostic>
     drop(sink);
     res.into_inner()
 }
-fn location_to_range(location: Location) -> TextRange {
-    match location {
-        Location::Offset(offset) => TextRange::offset_len(offset, 1.into()),
-        Location::Range(range) => range,
-    }
-}
 
 fn check_unnecessary_braces_in_use_statement(
     acc: &mut Vec<Diagnostic>,
```
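
Condensed, the consuming side above now boils down to something like the following sketch; the two-field `Diagnostic` is a simplification of the real struct (which also carries `severity` and `fix`):

```rust
use ra_syntax::{SourceFile, TextRange};

struct Diagnostic {
    range: TextRange,
    message: String,
}

fn syntax_diagnostics(text: &str) -> Vec<Diagnostic> {
    let parse = SourceFile::parse(text);
    parse
        .errors()
        .iter()
        .map(|err| Diagnostic {
            // A SyntaxError now carries its TextRange directly, so the
            // removed location_to_range() helper is no longer needed.
            range: err.range(),
            message: format!("Syntax Error: {}", err),
        })
        .collect()
}
```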
8 changes: 3 additions & 5 deletions crates/ra_syntax/src/lib.rs
```diff
@@ -41,11 +41,9 @@ use crate::syntax_node::GreenNode;
 pub use crate::{
     algo::InsertPosition,
     ast::{AstNode, AstToken},
-    parsing::{
-        lex_single_syntax_kind, lex_single_valid_syntax_kind, tokenize, Token, TokenizeError,
-    },
+    parsing::{lex_single_syntax_kind, lex_single_valid_syntax_kind, tokenize, Token},
     ptr::{AstPtr, SyntaxNodePtr},
-    syntax_error::{Location, SyntaxError, SyntaxErrorKind},
+    syntax_error::SyntaxError,
     syntax_node::{
         Direction, NodeOrToken, SyntaxElement, SyntaxNode, SyntaxToken, SyntaxTreeBuilder,
     },
@@ -117,7 +115,7 @@ impl Parse<SourceFile> {
     pub fn debug_dump(&self) -> String {
         let mut buf = format!("{:#?}", self.tree().syntax());
         for err in self.errors.iter() {
-            writeln!(buf, "error {:?}: {}", err.location(), err.kind()).unwrap();
+            writeln!(buf, "error {:?}: {}", err.range(), err).unwrap();
         }
         buf
     }
```
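
For downstream users, the visible effect of this re-export change is that `TokenizeError`, `SyntaxErrorKind`, and `Location` are gone from the public API; a hypothetical caller now reads messages instead of matching on kinds:

```rust
use ra_syntax::SourceFile;

// Previously this might have matched on
//     SyntaxErrorKind::TokenizeError(TokenizeError::EmptyInt)
// and friends; now the message string is the whole story.
fn first_error_message(text: &str) -> Option<String> {
    let parse = SourceFile::parse(text);
    parse.errors().first().map(|err| err.message().to_string())
}
```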
92 changes: 26 additions & 66 deletions crates/ra_syntax/src/parsing/lexer.rs
```diff
@@ -2,7 +2,7 @@
 //! It is just a bridge to `rustc_lexer`.
 use crate::{
-    SyntaxError, SyntaxErrorKind,
+    SyntaxError,
     SyntaxKind::{self, *},
     TextRange, TextUnit,
 };
@@ -41,13 +41,13 @@ pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
         let token_len = TextUnit::from_usize(rustc_token.len);
         let token_range = TextRange::offset_len(TextUnit::from_usize(offset), token_len);
 
-        let (syntax_kind, error) =
+        let (syntax_kind, err_message) =
             rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);
 
         tokens.push(Token { kind: syntax_kind, len: token_len });
 
-        if let Some(error) = error {
-            errors.push(SyntaxError::new(SyntaxErrorKind::TokenizeError(error), token_range));
+        if let Some(err_message) = err_message {
+            errors.push(SyntaxError::new(err_message, token_range));
         }
 
         offset += rustc_token.len;
@@ -94,76 +94,37 @@ fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
     }
 
     let rustc_token = rustc_lexer::first_token(text);
-    let (syntax_kind, error) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
+    let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
 
     let token = Token { kind: syntax_kind, len: TextUnit::from_usize(rustc_token.len) };
-    let error = error.map(|error| {
-        SyntaxError::new(
-            SyntaxErrorKind::TokenizeError(error),
-            TextRange::from_to(TextUnit::from(0), TextUnit::of_str(text)),
-        )
+    let optional_error = err_message.map(|err_message| {
+        SyntaxError::new(err_message, TextRange::from_to(0.into(), TextUnit::of_str(text)))
     });
 
-    Some((token, error))
-}
-
-// FIXME: simplify TokenizeError to `SyntaxError(String, TextRange)` as per @matklad advice:
-// https://github.com/rust-analyzer/rust-analyzer/pull/2911/files#r371175067
-
-/// Describes the values of `SyntaxErrorKind::TokenizeError` enum variant.
-/// It describes all the types of errors that may happen during the tokenization
-/// of Rust source.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum TokenizeError {
-    /// Base prefix was provided, but there were no digits
-    /// after it, e.g. `0x`, `0b`.
-    EmptyInt,
-    /// Float exponent lacks digits e.g. `12.34e+`, `12.3E+`, `12e-`, `1_E-`,
-    EmptyExponent,
-
-    /// Block comment lacks trailing delimiter `*/`
-    UnterminatedBlockComment,
-    /// Character literal lacks trailing delimiter `'`
-    UnterminatedChar,
-    /// Characterish byte literal lacks trailing delimiter `'`
-    UnterminatedByte,
-    /// String literal lacks trailing delimiter `"`
-    UnterminatedString,
-    /// Byte string literal lacks trailing delimiter `"`
-    UnterminatedByteString,
-    /// Raw literal lacks trailing delimiter e.g. `"##`
-    UnterminatedRawString,
-    /// Raw byte string literal lacks trailing delimiter e.g. `"##`
-    UnterminatedRawByteString,
-
-    /// Raw string lacks a quote after the pound characters e.g. `r###`
-    UnstartedRawString,
-    /// Raw byte string lacks a quote after the pound characters e.g. `br###`
-    UnstartedRawByteString,
-
-    /// Lifetime starts with a number e.g. `'4ever`
-    LifetimeStartsWithNumber,
-}
+    Some((token, optional_error))
+}
 
+/// Returns `SyntaxKind` and an optional tokenize error message.
 fn rustc_token_kind_to_syntax_kind(
     rustc_token_kind: &rustc_lexer::TokenKind,
     token_text: &str,
-) -> (SyntaxKind, Option<TokenizeError>) {
+) -> (SyntaxKind, Option<&'static str>) {
+    // A note on an intended tradeoff:
+    // We drop some useful information here (see patterns with double dots `..`)
+    // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
+    // being `u16` that come from `rowan::SyntaxKind`.
+
     let syntax_kind = {
         use rustc_lexer::TokenKind as TK;
-        use TokenizeError as TE;
 
         match rustc_token_kind {
             TK::LineComment => COMMENT,
 
             TK::BlockComment { terminated: true } => COMMENT,
             TK::BlockComment { terminated: false } => {
-                return (COMMENT, Some(TE::UnterminatedBlockComment));
+                return (
+                    COMMENT,
+                    Some("Missing trailing `*/` symbols to terminate the block comment"),
+                );
             }
 
             TK::Whitespace => WHITESPACE,
@@ -181,7 +142,7 @@ fn rustc_token_kind_to_syntax_kind(
 
             TK::Lifetime { starts_with_number: false } => LIFETIME,
             TK::Lifetime { starts_with_number: true } => {
-                return (LIFETIME, Some(TE::LifetimeStartsWithNumber))
+                return (LIFETIME, Some("Lifetime name cannot start with a number"))
             }
 
             TK::Semi => SEMI,
@@ -217,57 +178,56 @@ fn rustc_token_kind_to_syntax_kind(
 
     return (syntax_kind, None);
 
-    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<TokenizeError>) {
+    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
         use rustc_lexer::LiteralKind as LK;
-        use TokenizeError as TE;
 
+        #[rustfmt::skip]
         let syntax_kind = match *kind {
            LK::Int { empty_int: false, .. } => INT_NUMBER,
            LK::Int { empty_int: true, .. } => {
-                return (INT_NUMBER, Some(TE::EmptyInt))
+                return (INT_NUMBER, Some("Missing digits after the integer base prefix"))
            }
 
            LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
            LK::Float { empty_exponent: true, .. } => {
-                return (FLOAT_NUMBER, Some(TE::EmptyExponent))
+                return (FLOAT_NUMBER, Some("Missing digits after the exponent symbol"))
            }
 
            LK::Char { terminated: true } => CHAR,
            LK::Char { terminated: false } => {
-                return (CHAR, Some(TE::UnterminatedChar))
+                return (CHAR, Some("Missing trailing `'` symbol to terminate the character literal"))
            }
 
            LK::Byte { terminated: true } => BYTE,
            LK::Byte { terminated: false } => {
-                return (BYTE, Some(TE::UnterminatedByte))
+                return (BYTE, Some("Missing trailing `'` symbol to terminate the byte literal"))
            }
 
            LK::Str { terminated: true } => STRING,
            LK::Str { terminated: false } => {
-                return (STRING, Some(TE::UnterminatedString))
+                return (STRING, Some("Missing trailing `\"` symbol to terminate the string literal"))
            }
 
 
            LK::ByteStr { terminated: true } => BYTE_STRING,
            LK::ByteStr { terminated: false } => {
-                return (BYTE_STRING, Some(TE::UnterminatedByteString))
+                return (BYTE_STRING, Some("Missing trailing `\"` symbol to terminate the byte string literal"))
            }
 
            LK::RawStr { started: true, terminated: true, .. } => RAW_STRING,
            LK::RawStr { started: true, terminated: false, .. } => {
-                return (RAW_STRING, Some(TE::UnterminatedRawString))
+                return (RAW_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw string literal"))
            }
            LK::RawStr { started: false, .. } => {
-                return (RAW_STRING, Some(TE::UnstartedRawString))
+                return (RAW_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw string literal"))
            }
 
            LK::RawByteStr { started: true, terminated: true, .. } => RAW_BYTE_STRING,
            LK::RawByteStr { started: true, terminated: false, .. } => {
-                return (RAW_BYTE_STRING, Some(TE::UnterminatedRawByteString))
+                return (RAW_BYTE_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"))
            }
            LK::RawByteStr { started: false, .. } => {
-                return (RAW_BYTE_STRING, Some(TE::UnstartedRawByteString))
+                return (RAW_BYTE_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw byte string literal"))
            }
         };
 
```
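
A quick sketch of driving the reworked lexer, with an assumed sample input; each error now pairs a plain message with its range, so there is no enum to match on:

```rust
use ra_syntax::tokenize;

fn main() {
    // "0x" lexes as INT_NUMBER but has no digits after the base prefix,
    // so tokenize() reports one error alongside the token.
    let (tokens, errors) = tokenize("0x");
    println!("{} token(s)", tokens.len());
    for err in &errors {
        // Display prints the stored message itself, e.g.
        // "Missing digits after the integer base prefix".
        println!("{:?}: {}", err.range(), err);
    }
}
```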
84 changes: 67 additions & 17 deletions crates/ra_syntax/src/parsing/reparsing.rs
```diff
@@ -27,8 +27,8 @@ pub(crate) fn incremental_reparse(
     edit: &AtomTextEdit,
     errors: Vec<SyntaxError>,
 ) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
-    if let Some((green, old_range)) = reparse_token(node, &edit) {
-        return Some((green, merge_errors(errors, Vec::new(), old_range, edit), old_range));
+    if let Some((green, new_errors, old_range)) = reparse_token(node, &edit) {
+        return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
     }
 
     if let Some((green, new_errors, old_range)) = reparse_block(node, &edit) {
@@ -40,7 +40,7 @@
 fn reparse_token<'node>(
     root: &'node SyntaxNode,
     edit: &AtomTextEdit,
-) -> Option<(GreenNode, TextRange)> {
+) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
     let prev_token = algo::find_covering_element(root, edit.delete).as_token()?.clone();
     let prev_token_kind = prev_token.kind();
     match prev_token_kind {
@@ -54,7 +54,7 @@ fn reparse_token<'node>(
             }
 
             let mut new_text = get_text_after_edit(prev_token.clone().into(), &edit);
-            let (new_token_kind, _error) = lex_single_syntax_kind(&new_text)?;
+            let (new_token_kind, new_err) = lex_single_syntax_kind(&new_text)?;
 
             if new_token_kind != prev_token_kind
                 || (new_token_kind == IDENT && is_contextual_kw(&new_text))
@@ -76,7 +76,11 @@
 
             let new_token =
                 GreenToken::new(rowan::SyntaxKind(prev_token_kind.into()), new_text.into());
-            Some((prev_token.replace_with(new_token), prev_token.text_range()))
+            Some((
+                prev_token.replace_with(new_token),
+                new_err.into_iter().collect(),
+                prev_token.text_range(),
+            ))
         }
         _ => None,
     }
@@ -87,7 +91,7 @@ fn reparse_block<'node>(
     edit: &AtomTextEdit,
 ) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
     let (node, reparser) = find_reparsable_node(root, edit.delete)?;
-    let text = get_text_after_edit(node.clone().into(), &edit);
+    let text = get_text_after_edit(node.clone().into(), edit);
 
     let (tokens, new_lexer_errors) = tokenize(&text);
     if !is_balanced(&tokens) {
@@ -162,20 +166,27 @@ fn is_balanced(tokens: &[Token]) -> bool {
 fn merge_errors(
     old_errors: Vec<SyntaxError>,
     new_errors: Vec<SyntaxError>,
-    old_range: TextRange,
+    range_before_reparse: TextRange,
     edit: &AtomTextEdit,
 ) -> Vec<SyntaxError> {
     let mut res = Vec::new();
-    for e in old_errors {
-        if e.offset() <= old_range.start() {
-            res.push(e)
-        } else if e.offset() >= old_range.end() {
-            res.push(e.add_offset(TextUnit::of_str(&edit.insert), edit.delete.len()));
+
+    for old_err in old_errors {
+        let old_err_range = old_err.range();
+        // FIXME: make sure that .start() was here previously by a mistake
+        if old_err_range.end() <= range_before_reparse.start() {
+            res.push(old_err);
+        } else if old_err_range.start() >= range_before_reparse.end() {
+            let inserted_len = TextUnit::of_str(&edit.insert);
+            res.push(old_err.with_range((old_err_range + inserted_len) - edit.delete.len()));
+            // Note: extra parens are intentional to prevent uint underflow, HWAB (here was a bug)
         }
     }
-    for e in new_errors {
-        res.push(e.add_offset(old_range.start(), 0.into()));
-    }
+    res.extend(new_errors.into_iter().map(|new_err| {
+        // fighting borrow checker with a variable ;)
+        let offseted_range = new_err.range() + range_before_reparse.start();
+        new_err.with_range(offseted_range)
+    }));
     res
 }

@@ -193,9 +204,9 @@ mod tests {
 
         let fully_reparsed = SourceFile::parse(&after);
         let incrementally_reparsed: Parse<SourceFile> = {
-            let f = SourceFile::parse(&before);
+            let before = SourceFile::parse(&before);
             let (green, new_errors, range) =
-                incremental_reparse(f.tree().syntax(), &edit, f.errors.to_vec()).unwrap();
+                incremental_reparse(before.tree().syntax(), &edit, before.errors.to_vec()).unwrap();
             assert_eq!(range.len(), reparsed_len.into(), "reparsed fragment has wrong length");
             Parse::new(green, new_errors)
         };
@@ -204,6 +215,7 @@
             &format!("{:#?}", fully_reparsed.tree().syntax()),
             &format!("{:#?}", incrementally_reparsed.tree().syntax()),
         );
+        assert_eq!(fully_reparsed.errors(), incrementally_reparsed.errors());
     }
 
     #[test] // FIXME: some tests here actually test token reparsing
@@ -402,4 +414,42 @@ enum Foo {
             4,
         );
     }
+
+    #[test]
+    fn reparse_str_token_with_error_unchanged() {
+        do_check(r#""<|>Unclosed<|> string literal"#, "Still unclosed", 24);
+    }
+
+    #[test]
+    fn reparse_str_token_with_error_fixed() {
+        do_check(r#""unterinated<|><|>"#, "\"", 12);
+    }
+
+    #[test]
+    fn reparse_block_with_error_in_middle_unchanged() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
+            "23",
+            105,
+        )
+    }
+
+    #[test]
+    fn reparse_block_with_error_in_middle_fixed() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
+            ";",
+            105,
+        )
+    }
 }
```
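
The range arithmetic in `merge_errors` is the subtle part of this diff, so here is a standalone sketch of the same shifting rule; it assumes the `TextRange`/`TextUnit` re-exports seen in the lexer imports, and the `Edit` struct is a stand-in for `AtomTextEdit`:

```rust
use ra_syntax::{TextRange, TextUnit};

struct Edit {
    delete: TextRange, // range removed from the old text
    insert: String,    // replacement text
}

/// Mirrors how merge_errors relocates an old error across a reparse:
/// errors entirely before the reparsed range keep their position, errors
/// entirely after it shift by the edit's net length change, and errors
/// inside it are dropped (the reparse produces fresh ones for that region).
fn shift_old_error_range(
    old_err_range: TextRange,
    range_before_reparse: TextRange,
    edit: &Edit,
) -> Option<TextRange> {
    if old_err_range.end() <= range_before_reparse.start() {
        Some(old_err_range)
    } else if old_err_range.start() >= range_before_reparse.end() {
        let inserted_len = TextUnit::of_str(&edit.insert);
        // Add before subtracting so the unsigned arithmetic cannot
        // underflow, exactly as the parenthesized expression above does.
        Some((old_err_range + inserted_len) - edit.delete.len())
    } else {
        None
    }
}
```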
