Merge #3026
3026: ra_syntax: reshape SyntaxError for the sake of removing redundancy r=matklad a=Veetaha

Follow-up to #2911; it also checks off some items on the todo list of #223.

**ACHTUNG!** A big part of this PR's diff consists of changes to test data files.

Simplified `SyntaxError`, which was `SyntaxError { kind: { /* big enum */ }, location: Location }`, to `SyntaxError(String, TextRange)`. I am not sure whether a tuple struct is the best fit here; I am inclined to name the fields, since I already provide the getters `SyntaxError::message()` and `SyntaxError::range()`.
I also removed `Location` altogether ...
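
For reference, here is a minimal sketch of the shape change, with the parts elided above filled in from the diff below; the derives, the `impl Into<String>` signature, and the `text_unit` import path are assumptions:

```rust
use text_unit::{TextRange, TextUnit};

// Before: an error was a kind from a big enum plus a two-variant location.
pub enum Location {
    Offset(TextUnit),
    Range(TextRange),
}

pub enum SyntaxErrorKind {
    TokenizeError(TokenizeError),
    // ... many more variants
}

pub enum TokenizeError {
    EmptyInt,
    // ... one variant per lexer error
}

pub struct OldSyntaxError {
    kind: SyntaxErrorKind,
    location: Location,
}

// After: just a message and the range it applies to.
pub struct SyntaxError(String, TextRange);

impl SyntaxError {
    pub fn new(message: impl Into<String>, range: TextRange) -> SyntaxError {
        SyntaxError(message.into(), range)
    }
    pub fn message(&self) -> &str {
        &self.0
    }
    pub fn range(&self) -> TextRange {
        self.1
    }
}
```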

This is currently WIP, because the following is not done:
- [ ] ~~Add tests to the `test_data` dir for unescape errors. *// I don't know where to put these errors in particular, because they are out of the scope of the lexer and parser. However, I have an idea in mind that we move all the validators we have right now to the parsing stage, but this is up for discussion...*~~ **[UPD]** I came to the conclusion that tree validation logic, of which unescape errors are a part, should be rethought; we currently have no tests and no place to put tests for tree validations. So I'd like to extract the potential redesign (maybe moving tree validation to `ra_parser`) and adding tests for it into a separate task.

Co-authored-by: Veetaha <gerzoh1@gmail.com>
Co-authored-by: Veetaha <veetaha2@gmail.com>
3 people authored Feb 18, 2020
2 parents 742459c + 053ccf4 commit c447fe9
Showing 56 changed files with 452 additions and 650 deletions.
10 changes: 2 additions & 8 deletions crates/ra_ide/src/diagnostics.rs
```diff
@@ -10,7 +10,7 @@ use ra_prof::profile;
 use ra_syntax::{
     algo,
     ast::{self, make, AstNode},
-    Location, SyntaxNode, TextRange, T,
+    SyntaxNode, TextRange, T,
 };
 use ra_text_edit::{TextEdit, TextEditBuilder};
 
@@ -29,7 +29,7 @@ pub(crate) fn diagnostics(db: &RootDatabase, file_id: FileId) -> Vec<Diagnostic>
     let mut res = Vec::new();
 
     res.extend(parse.errors().iter().map(|err| Diagnostic {
-        range: location_to_range(err.location()),
+        range: err.range(),
         message: format!("Syntax Error: {}", err),
         severity: Severity::Error,
         fix: None,
@@ -116,12 +116,6 @@ pub(crate) fn diagnostics(db: &RootDatabase, file_id: FileId) -> Vec<Diagnostic>
     drop(sink);
     res.into_inner()
 }
-fn location_to_range(location: Location) -> TextRange {
-    match location {
-        Location::Offset(offset) => TextRange::offset_len(offset, 1.into()),
-        Location::Range(range) => range,
-    }
-}
 
 fn check_unnecessary_braces_in_use_statement(
     acc: &mut Vec<Diagnostic>,
```
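
Condensed, the consuming side above now boils down to something like the following sketch; the two-field `Diagnostic` is a simplification of the real struct (which also carries `severity` and `fix`):

```rust
use ra_syntax::{SourceFile, TextRange};

struct Diagnostic {
    range: TextRange,
    message: String,
}

fn syntax_diagnostics(text: &str) -> Vec<Diagnostic> {
    let parse = SourceFile::parse(text);
    parse
        .errors()
        .iter()
        .map(|err| Diagnostic {
            // A SyntaxError now carries its TextRange directly, so the
            // removed location_to_range() helper is no longer needed.
            range: err.range(),
            message: format!("Syntax Error: {}", err),
        })
        .collect()
}
```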
8 changes: 3 additions & 5 deletions crates/ra_syntax/src/lib.rs
```diff
@@ -41,11 +41,9 @@ use crate::syntax_node::GreenNode;
 pub use crate::{
     algo::InsertPosition,
     ast::{AstNode, AstToken},
-    parsing::{
-        lex_single_syntax_kind, lex_single_valid_syntax_kind, tokenize, Token, TokenizeError,
-    },
+    parsing::{lex_single_syntax_kind, lex_single_valid_syntax_kind, tokenize, Token},
     ptr::{AstPtr, SyntaxNodePtr},
-    syntax_error::{Location, SyntaxError, SyntaxErrorKind},
+    syntax_error::SyntaxError,
     syntax_node::{
         Direction, NodeOrToken, SyntaxElement, SyntaxNode, SyntaxToken, SyntaxTreeBuilder,
     },
@@ -117,7 +115,7 @@ impl Parse<SourceFile> {
     pub fn debug_dump(&self) -> String {
         let mut buf = format!("{:#?}", self.tree().syntax());
         for err in self.errors.iter() {
-            writeln!(buf, "error {:?}: {}", err.location(), err.kind()).unwrap();
+            writeln!(buf, "error {:?}: {}", err.range(), err).unwrap();
         }
         buf
     }
```
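
For downstream users, the visible effect of this re-export change is that `TokenizeError`, `SyntaxErrorKind`, and `Location` are gone from the public API; a hypothetical caller now reads messages instead of matching on kinds:

```rust
use ra_syntax::SourceFile;

// Previously this might have matched on
//     SyntaxErrorKind::TokenizeError(TokenizeError::EmptyInt)
// and friends; now the message string is the whole story.
fn first_error_message(text: &str) -> Option<String> {
    let parse = SourceFile::parse(text);
    parse.errors().first().map(|err| err.message().to_string())
}
```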
92 changes: 26 additions & 66 deletions crates/ra_syntax/src/parsing/lexer.rs
```diff
@@ -2,7 +2,7 @@
 //! It is just a bridge to `rustc_lexer`.
 use crate::{
-    SyntaxError, SyntaxErrorKind,
+    SyntaxError,
     SyntaxKind::{self, *},
     TextRange, TextUnit,
 };
@@ -41,13 +41,13 @@ pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
         let token_len = TextUnit::from_usize(rustc_token.len);
         let token_range = TextRange::offset_len(TextUnit::from_usize(offset), token_len);
 
-        let (syntax_kind, error) =
+        let (syntax_kind, err_message) =
             rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);
 
         tokens.push(Token { kind: syntax_kind, len: token_len });
 
-        if let Some(error) = error {
-            errors.push(SyntaxError::new(SyntaxErrorKind::TokenizeError(error), token_range));
+        if let Some(err_message) = err_message {
+            errors.push(SyntaxError::new(err_message, token_range));
         }
 
         offset += rustc_token.len;
@@ -94,76 +94,37 @@ fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
     }
 
     let rustc_token = rustc_lexer::first_token(text);
-    let (syntax_kind, error) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
+    let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
 
     let token = Token { kind: syntax_kind, len: TextUnit::from_usize(rustc_token.len) };
-    let error = error.map(|error| {
-        SyntaxError::new(
-            SyntaxErrorKind::TokenizeError(error),
-            TextRange::from_to(TextUnit::from(0), TextUnit::of_str(text)),
-        )
+    let optional_error = err_message.map(|err_message| {
+        SyntaxError::new(err_message, TextRange::from_to(0.into(), TextUnit::of_str(text)))
     });
 
-    Some((token, error))
-}
-
-// FIXME: simplify TokenizeError to `SyntaxError(String, TextRange)` as per @matklad advice:
-// https://github.com/rust-analyzer/rust-analyzer/pull/2911/files#r371175067
-
-/// Describes the values of `SyntaxErrorKind::TokenizeError` enum variant.
-/// It describes all the types of errors that may happen during the tokenization
-/// of Rust source.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum TokenizeError {
-    /// Base prefix was provided, but there were no digits
-    /// after it, e.g. `0x`, `0b`.
-    EmptyInt,
-    /// Float exponent lacks digits e.g. `12.34e+`, `12.3E+`, `12e-`, `1_E-`,
-    EmptyExponent,
-
-    /// Block comment lacks trailing delimiter `*/`
-    UnterminatedBlockComment,
-    /// Character literal lacks trailing delimiter `'`
-    UnterminatedChar,
-    /// Characterish byte literal lacks trailing delimiter `'`
-    UnterminatedByte,
-    /// String literal lacks trailing delimiter `"`
-    UnterminatedString,
-    /// Byte string literal lacks trailing delimiter `"`
-    UnterminatedByteString,
-    /// Raw literal lacks trailing delimiter e.g. `"##`
-    UnterminatedRawString,
-    /// Raw byte string literal lacks trailing delimiter e.g. `"##`
-    UnterminatedRawByteString,
-
-    /// Raw string lacks a quote after the pound characters e.g. `r###`
-    UnstartedRawString,
-    /// Raw byte string lacks a quote after the pound characters e.g. `br###`
-    UnstartedRawByteString,
-
-    /// Lifetime starts with a number e.g. `'4ever`
-    LifetimeStartsWithNumber,
-}
+    Some((token, optional_error))
+}
 
+/// Returns `SyntaxKind` and an optional tokenize error message.
 fn rustc_token_kind_to_syntax_kind(
     rustc_token_kind: &rustc_lexer::TokenKind,
     token_text: &str,
-) -> (SyntaxKind, Option<TokenizeError>) {
+) -> (SyntaxKind, Option<&'static str>) {
+    // A note on an intended tradeoff:
+    // We drop some useful information here (see patterns with double dots `..`)
+    // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
+    // being `u16` that come from `rowan::SyntaxKind`.
+
     let syntax_kind = {
         use rustc_lexer::TokenKind as TK;
-        use TokenizeError as TE;
 
         match rustc_token_kind {
             TK::LineComment => COMMENT,
 
             TK::BlockComment { terminated: true } => COMMENT,
             TK::BlockComment { terminated: false } => {
-                return (COMMENT, Some(TE::UnterminatedBlockComment));
+                return (
+                    COMMENT,
+                    Some("Missing trailing `*/` symbols to terminate the block comment"),
+                );
             }
 
             TK::Whitespace => WHITESPACE,
@@ -181,7 +142,7 @@ fn rustc_token_kind_to_syntax_kind(
 
             TK::Lifetime { starts_with_number: false } => LIFETIME,
             TK::Lifetime { starts_with_number: true } => {
-                return (LIFETIME, Some(TE::LifetimeStartsWithNumber))
+                return (LIFETIME, Some("Lifetime name cannot start with a number"))
             }
 
             TK::Semi => SEMI,
@@ -217,57 +178,56 @@ fn rustc_token_kind_to_syntax_kind(
 
     return (syntax_kind, None);
 
-    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<TokenizeError>) {
+    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
         use rustc_lexer::LiteralKind as LK;
-        use TokenizeError as TE;
 
+        #[rustfmt::skip]
         let syntax_kind = match *kind {
            LK::Int { empty_int: false, .. } => INT_NUMBER,
            LK::Int { empty_int: true, .. } => {
-                return (INT_NUMBER, Some(TE::EmptyInt))
+                return (INT_NUMBER, Some("Missing digits after the integer base prefix"))
            }
 
            LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
            LK::Float { empty_exponent: true, .. } => {
-                return (FLOAT_NUMBER, Some(TE::EmptyExponent))
+                return (FLOAT_NUMBER, Some("Missing digits after the exponent symbol"))
            }
 
            LK::Char { terminated: true } => CHAR,
            LK::Char { terminated: false } => {
-                return (CHAR, Some(TE::UnterminatedChar))
+                return (CHAR, Some("Missing trailing `'` symbol to terminate the character literal"))
            }
 
            LK::Byte { terminated: true } => BYTE,
            LK::Byte { terminated: false } => {
-                return (BYTE, Some(TE::UnterminatedByte))
+                return (BYTE, Some("Missing trailing `'` symbol to terminate the byte literal"))
            }
 
            LK::Str { terminated: true } => STRING,
            LK::Str { terminated: false } => {
-                return (STRING, Some(TE::UnterminatedString))
+                return (STRING, Some("Missing trailing `\"` symbol to terminate the string literal"))
            }
 
 
            LK::ByteStr { terminated: true } => BYTE_STRING,
            LK::ByteStr { terminated: false } => {
-                return (BYTE_STRING, Some(TE::UnterminatedByteString))
+                return (BYTE_STRING, Some("Missing trailing `\"` symbol to terminate the byte string literal"))
            }
 
            LK::RawStr { started: true, terminated: true, .. } => RAW_STRING,
            LK::RawStr { started: true, terminated: false, .. } => {
-                return (RAW_STRING, Some(TE::UnterminatedRawString))
+                return (RAW_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw string literal"))
            }
            LK::RawStr { started: false, .. } => {
-                return (RAW_STRING, Some(TE::UnstartedRawString))
+                return (RAW_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw string literal"))
            }
 
            LK::RawByteStr { started: true, terminated: true, .. } => RAW_BYTE_STRING,
            LK::RawByteStr { started: true, terminated: false, .. } => {
-                return (RAW_BYTE_STRING, Some(TE::UnterminatedRawByteString))
+                return (RAW_BYTE_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"))
            }
            LK::RawByteStr { started: false, .. } => {
-                return (RAW_BYTE_STRING, Some(TE::UnstartedRawByteString))
+                return (RAW_BYTE_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw byte string literal"))
            }
         };
 
```
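
A quick sketch of driving the reworked lexer, with an assumed sample input; each error now pairs a plain message with its range, so there is no enum to match on:

```rust
use ra_syntax::tokenize;

fn main() {
    // "0x" lexes as INT_NUMBER but has no digits after the base prefix,
    // so tokenize() reports one error alongside the token.
    let (tokens, errors) = tokenize("0x");
    println!("{} token(s)", tokens.len());
    for err in &errors {
        // Display prints the stored message itself, e.g.
        // "Missing digits after the integer base prefix".
        println!("{:?}: {}", err.range(), err);
    }
}
```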
84 changes: 67 additions & 17 deletions crates/ra_syntax/src/parsing/reparsing.rs
```diff
@@ -27,8 +27,8 @@ pub(crate) fn incremental_reparse(
     edit: &AtomTextEdit,
     errors: Vec<SyntaxError>,
 ) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
-    if let Some((green, old_range)) = reparse_token(node, &edit) {
-        return Some((green, merge_errors(errors, Vec::new(), old_range, edit), old_range));
+    if let Some((green, new_errors, old_range)) = reparse_token(node, &edit) {
+        return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
     }
 
     if let Some((green, new_errors, old_range)) = reparse_block(node, &edit) {
@@ -40,7 +40,7 @@
 fn reparse_token<'node>(
     root: &'node SyntaxNode,
     edit: &AtomTextEdit,
-) -> Option<(GreenNode, TextRange)> {
+) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
     let prev_token = algo::find_covering_element(root, edit.delete).as_token()?.clone();
     let prev_token_kind = prev_token.kind();
     match prev_token_kind {
@@ -54,7 +54,7 @@ fn reparse_token<'node>(
             }
 
             let mut new_text = get_text_after_edit(prev_token.clone().into(), &edit);
-            let (new_token_kind, _error) = lex_single_syntax_kind(&new_text)?;
+            let (new_token_kind, new_err) = lex_single_syntax_kind(&new_text)?;
 
             if new_token_kind != prev_token_kind
                 || (new_token_kind == IDENT && is_contextual_kw(&new_text))
@@ -76,7 +76,11 @@
 
             let new_token =
                 GreenToken::new(rowan::SyntaxKind(prev_token_kind.into()), new_text.into());
-            Some((prev_token.replace_with(new_token), prev_token.text_range()))
+            Some((
+                prev_token.replace_with(new_token),
+                new_err.into_iter().collect(),
+                prev_token.text_range(),
+            ))
         }
         _ => None,
     }
@@ -87,7 +91,7 @@ fn reparse_block<'node>(
     edit: &AtomTextEdit,
 ) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
     let (node, reparser) = find_reparsable_node(root, edit.delete)?;
-    let text = get_text_after_edit(node.clone().into(), &edit);
+    let text = get_text_after_edit(node.clone().into(), edit);
 
     let (tokens, new_lexer_errors) = tokenize(&text);
     if !is_balanced(&tokens) {
@@ -162,20 +166,27 @@ fn is_balanced(tokens: &[Token]) -> bool {
 fn merge_errors(
     old_errors: Vec<SyntaxError>,
     new_errors: Vec<SyntaxError>,
-    old_range: TextRange,
+    range_before_reparse: TextRange,
     edit: &AtomTextEdit,
 ) -> Vec<SyntaxError> {
     let mut res = Vec::new();
-    for e in old_errors {
-        if e.offset() <= old_range.start() {
-            res.push(e)
-        } else if e.offset() >= old_range.end() {
-            res.push(e.add_offset(TextUnit::of_str(&edit.insert), edit.delete.len()));
+
+    for old_err in old_errors {
+        let old_err_range = old_err.range();
+        // FIXME: make sure that .start() was here previously by a mistake
+        if old_err_range.end() <= range_before_reparse.start() {
+            res.push(old_err);
+        } else if old_err_range.start() >= range_before_reparse.end() {
+            let inserted_len = TextUnit::of_str(&edit.insert);
+            res.push(old_err.with_range((old_err_range + inserted_len) - edit.delete.len()));
+            // Note: extra parens are intentional to prevent uint underflow, HWAB (here was a bug)
         }
     }
-    for e in new_errors {
-        res.push(e.add_offset(old_range.start(), 0.into()));
-    }
+    res.extend(new_errors.into_iter().map(|new_err| {
+        // fighting borrow checker with a variable ;)
+        let offseted_range = new_err.range() + range_before_reparse.start();
+        new_err.with_range(offseted_range)
+    }));
     res
 }

@@ -193,9 +204,9 @@ mod tests {
 
         let fully_reparsed = SourceFile::parse(&after);
         let incrementally_reparsed: Parse<SourceFile> = {
-            let f = SourceFile::parse(&before);
+            let before = SourceFile::parse(&before);
             let (green, new_errors, range) =
-                incremental_reparse(f.tree().syntax(), &edit, f.errors.to_vec()).unwrap();
+                incremental_reparse(before.tree().syntax(), &edit, before.errors.to_vec()).unwrap();
             assert_eq!(range.len(), reparsed_len.into(), "reparsed fragment has wrong length");
             Parse::new(green, new_errors)
         };
@@ -204,6 +215,7 @@
             &format!("{:#?}", fully_reparsed.tree().syntax()),
             &format!("{:#?}", incrementally_reparsed.tree().syntax()),
         );
+        assert_eq!(fully_reparsed.errors(), incrementally_reparsed.errors());
     }
 
     #[test] // FIXME: some tests here actually test token reparsing
@@ -402,4 +414,42 @@ enum Foo {
             4,
         );
     }
+
+    #[test]
+    fn reparse_str_token_with_error_unchanged() {
+        do_check(r#""<|>Unclosed<|> string literal"#, "Still unclosed", 24);
+    }
+
+    #[test]
+    fn reparse_str_token_with_error_fixed() {
+        do_check(r#""unterinated<|><|>"#, "\"", 12);
+    }
+
+    #[test]
+    fn reparse_block_with_error_in_middle_unchanged() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
+            "23",
+            105,
+        )
+    }
+
+    #[test]
+    fn reparse_block_with_error_in_middle_fixed() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
+            ";",
+            105,
+        )
+    }
 }
```
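
The range arithmetic in `merge_errors` is the subtle part of this diff, so here is a standalone sketch of the same shifting rule; it assumes the `TextRange`/`TextUnit` re-exports seen in the lexer imports, and the `Edit` struct is a stand-in for `AtomTextEdit`:

```rust
use ra_syntax::{TextRange, TextUnit};

struct Edit {
    delete: TextRange, // range removed from the old text
    insert: String,    // replacement text
}

/// Mirrors how merge_errors relocates an old error across a reparse:
/// errors entirely before the reparsed range keep their position, errors
/// entirely after it shift by the edit's net length change, and errors
/// inside it are dropped (the reparse produces fresh ones for that region).
fn shift_old_error_range(
    old_err_range: TextRange,
    range_before_reparse: TextRange,
    edit: &Edit,
) -> Option<TextRange> {
    if old_err_range.end() <= range_before_reparse.start() {
        Some(old_err_range)
    } else if old_err_range.start() >= range_before_reparse.end() {
        let inserted_len = TextUnit::of_str(&edit.insert);
        // Add before subtracting so the unsigned arithmetic cannot
        // underflow, exactly as the parenthesized expression above does.
        Some((old_err_range + inserted_len) - edit.delete.len())
    } else {
        None
    }
}
```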
