From 73dae3fd56df5e0ee192b730d3710c2e276ffec4 Mon Sep 17 00:00:00 2001
From: Victor Lopez <victor@codx.io>
Date: Fri, 14 Apr 2023 19:11:09 +0200
Subject: [PATCH] feat: add line number to parsing error

Prior to this commit, [ParsingError] tracked the step of the source
instead of the line number.

This is not friendly to the user as he won't be able to find the
location of the error. A common standard is to display the line number,
so the user can easily spot the location of the error.

This commit introduces such track. It keeps a 1:1 map on [TokenStream]
from the token to the parsed line. As in common text editors, line
number starts at `1`.

It also removes the `pos` from [Token] as its only purpose was to
display the step to the user. The introduction of `line` completely
supersedes that.

closes #574
---
 assembly/src/errors.rs        | 92 +++++++++++++++++------------------
 assembly/src/parsers/mod.rs   |  6 +--
 assembly/src/parsers/tests.rs | 68 +++++++++++++++++++++++++-
 assembly/src/tokens/mod.rs    | 18 +++----
 assembly/src/tokens/stream.rs | 22 ++++++---
 5 files changed, 141 insertions(+), 65 deletions(-)

diff --git a/assembly/src/errors.rs b/assembly/src/errors.rs
index 87ea78ee0d..a82e352df7 100644
--- a/assembly/src/errors.rs
+++ b/assembly/src/errors.rs
@@ -139,7 +139,7 @@ impl std::error::Error for AssemblyError {}
 #[derive(Clone, Eq, PartialEq)]
 pub struct ParsingError {
     message: String,
-    step: usize,
+    line: usize,
     op: String,
 }
 
@@ -150,15 +150,15 @@ impl ParsingError {
     pub fn empty_source() -> Self {
         ParsingError {
             message: "source code cannot be an empty string".to_string(),
-            step: 0,
+            line: 0,
             op: "".to_string(),
         }
     }
 
-    pub fn unexpected_eof(step: usize) -> Self {
+    pub fn unexpected_eof(line: usize) -> Self {
         ParsingError {
             message: "unexpected EOF".to_string(),
-            step,
+            line,
             op: "".to_string(),
         }
     }
@@ -166,7 +166,7 @@ impl ParsingError {
     pub fn unexpected_token(token: &Token, expected: &str) -> Self {
         ParsingError {
             message: format!("unexpected token: expected '{expected}' but was '{token}'"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -176,7 +176,7 @@ impl ParsingError {
     pub fn duplicate_const_name(token: &Token, label: &str) -> Self {
         ParsingError {
             message: format!("duplicate constant name: '{label}'"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -184,7 +184,7 @@ impl ParsingError {
     pub fn invalid_const_name(token: &Token, err: LabelError) -> Self {
         ParsingError {
             message: format!("invalid constant name: {err}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -194,7 +194,7 @@ impl ParsingError {
             message: format!(
                 "malformed constant `{token}` - invalid value: `{value}` - reason: {reason}"
             ),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -202,7 +202,7 @@ impl ParsingError {
     pub fn const_invalid_scope(token: &Token) -> Self {
         ParsingError {
             message: format!("invalid constant declaration: `{token}` - constants can only be defined below imports and above procedure / program bodies"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -210,7 +210,7 @@ impl ParsingError {
     pub fn const_not_found(token: &Token) -> Self {
         ParsingError {
             message: format!("constant used in operation `{token}` not found"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -220,7 +220,7 @@ impl ParsingError {
             message: format!(
                 "failed to convert u64 constant used in `{token}` to required type {type_name}"
             ),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -231,7 +231,7 @@ impl ParsingError {
     pub fn invalid_op(token: &Token) -> Self {
         ParsingError {
             message: format!("instruction '{token}' is invalid"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -239,7 +239,7 @@ impl ParsingError {
     pub fn missing_param(token: &Token) -> Self {
         ParsingError {
             message: format!("malformed instruction '{token}': missing required parameter"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -247,7 +247,7 @@ impl ParsingError {
     pub fn extra_param(token: &Token) -> Self {
         ParsingError {
             message: format!("malformed instruction '{token}': too many parameters provided"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -258,7 +258,7 @@ impl ParsingError {
                 "malformed instruction `{token}`: parameter '{}' is invalid",
                 token.parts()[part_idx]
             ),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -269,7 +269,7 @@ impl ParsingError {
                 "malformed instruction '{token}', parameter {} is invalid: {reason}",
                 token.parts()[part_idx],
             ),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -280,7 +280,7 @@ impl ParsingError {
     pub fn dangling_else(token: &Token) -> Self {
         ParsingError {
             message: "else without matching if".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -288,7 +288,7 @@ impl ParsingError {
     pub fn unmatched_if(token: &Token) -> Self {
         ParsingError {
             message: "if without matching else/end".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -296,7 +296,7 @@ impl ParsingError {
     pub fn unmatched_while(token: &Token) -> Self {
         ParsingError {
             message: "while without matching end".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -304,7 +304,7 @@ impl ParsingError {
     pub fn unmatched_repeat(token: &Token) -> Self {
         ParsingError {
             message: "repeat without matching end".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -312,7 +312,7 @@ impl ParsingError {
     pub fn unmatched_else(token: &Token) -> Self {
         ParsingError {
             message: "else without matching end".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -320,7 +320,7 @@ impl ParsingError {
     pub fn unmatched_begin(token: &Token) -> Self {
         ParsingError {
             message: "begin without matching end".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -328,7 +328,7 @@ impl ParsingError {
     pub fn dangling_ops_after_program(token: &Token) -> Self {
         ParsingError {
             message: "dangling instructions after program end".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -336,16 +336,16 @@ impl ParsingError {
     pub fn dangling_ops_after_module(token: &Token) -> Self {
         ParsingError {
             message: "dangling instructions after module end".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
 
-    pub fn dangling_procedure_comment(step: usize) -> Self {
+    pub fn dangling_procedure_comment(line: usize) -> Self {
         ParsingError {
             message: "Procedure comment is not immediately followed by a procedure declaration."
                 .to_string(),
-            step,
+            line,
             op: "".to_string(),
         }
     }
@@ -353,7 +353,7 @@ impl ParsingError {
     pub fn not_a_library_module(token: &Token) -> Self {
         ParsingError {
             message: "not a module: `begin` instruction found".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -364,7 +364,7 @@ impl ParsingError {
     pub fn duplicate_proc_name(token: &Token, label: &str) -> Self {
         ParsingError {
             message: format!("duplicate procedure name: {label}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -372,7 +372,7 @@ impl ParsingError {
     pub fn invalid_proc_name(token: &Token, err: LabelError) -> Self {
         ParsingError {
             message: format!("invalid procedure name: {err}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -383,7 +383,7 @@ impl ParsingError {
                 "procedure name cannot be longer than {max_len} characters, but was {}",
                 label.len()
             ),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -391,7 +391,7 @@ impl ParsingError {
     pub fn invalid_proc_locals(token: &Token, locals: &str) -> Self {
         ParsingError {
             message: format!("invalid procedure locals: {locals}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -399,7 +399,7 @@ impl ParsingError {
     pub fn too_many_proc_locals(token: &Token, num_locals: u64, max_locals: u64) -> Self {
         ParsingError {
             message: format!("number of procedure locals cannot be greater than {max_locals} characters, but was {num_locals}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -407,7 +407,7 @@ impl ParsingError {
     pub fn unmatched_proc(token: &Token, proc_name: &str) -> Self {
         ParsingError {
             message: format!("procedure '{proc_name}' has no matching end"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -415,7 +415,7 @@ impl ParsingError {
     pub fn proc_export_not_allowed(token: &Token, label: &str) -> Self {
         ParsingError {
             message: format!("exported procedures not allowed in this context: {label}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -426,7 +426,7 @@ impl ParsingError {
     pub fn invalid_proc_invocation(token: &Token, label: &str) -> Self {
         ParsingError {
             message: format!("invalid procedure invocation: {label}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -434,7 +434,7 @@ impl ParsingError {
     pub fn syscall_with_module_name(token: &Token) -> Self {
         ParsingError {
             message: "invalid syscall: cannot invoke a syscall on a named module".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -442,7 +442,7 @@ impl ParsingError {
     pub fn undefined_local_proc(token: &Token, label: &str) -> Self {
         ParsingError {
             message: format!("undefined local procedure: {label}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -450,7 +450,7 @@ impl ParsingError {
     pub fn procedure_module_not_imported(token: &Token, module_name: &str) -> Self {
         ParsingError {
             message: format!("module '{module_name}' was not imported"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -461,7 +461,7 @@ impl ParsingError {
     pub fn duplicate_module_import(token: &Token, module: &str) -> Self {
         ParsingError {
             message: format!("duplicate module import found: {module}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -469,7 +469,7 @@ impl ParsingError {
     pub fn invalid_module_path(token: &Token, module_path: &str) -> Self {
         ParsingError {
             message: format!("invalid module import path: {module_path}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -477,7 +477,7 @@ impl ParsingError {
     pub fn import_inside_body(token: &Token) -> Self {
         ParsingError {
             message: "import in procedure body".to_string(),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -485,7 +485,7 @@ impl ParsingError {
     pub fn invalid_library_path(token: &Token, error: LibraryError) -> Self {
         ParsingError {
             message: format!("invalid path resolution: {error}"),
-            step: token.pos(),
+            line: token.line(),
             op: token.to_string(),
         }
     }
@@ -500,20 +500,20 @@ impl ParsingError {
         &self.op
     }
 
-    pub fn step(&self) -> usize {
-        self.step
+    pub const fn line(&self) -> usize {
+        self.line
     }
 }
 
 impl fmt::Debug for ParsingError {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "parsing error at {}: {}", self.step, self.message)
+        write!(f, "parsing error at line {}: {}", self.line, self.message)
     }
 }
 
 impl fmt::Display for ParsingError {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "parsing error at {}: {}", self.step, self.message)
+        write!(f, "parsing error at line {}: {}", self.line, self.message)
     }
 }
 
diff --git a/assembly/src/parsers/mod.rs b/assembly/src/parsers/mod.rs
index b23456fea3..721840a386 100644
--- a/assembly/src/parsers/mod.rs
+++ b/assembly/src/parsers/mod.rs
@@ -228,7 +228,8 @@ pub fn parse_program(source: &str) -> Result<ProgramAst, ParsingError> {
     context.parse_procedures(&mut tokens, false)?;
 
     // make sure program body is present
-    let next_token = tokens.read().ok_or_else(|| ParsingError::unexpected_eof(tokens.pos()))?;
+    let next_token =
+        tokens.read().ok_or_else(|| ParsingError::unexpected_eof(tokens.num_lines()))?;
     if next_token.parts()[0] != Token::BEGIN {
         return Err(ParsingError::unexpected_token(next_token, Token::BEGIN));
     }
@@ -240,9 +241,8 @@ pub fn parse_program(source: &str) -> Result<ProgramAst, ParsingError> {
     tokens.advance();
 
     // make sure there is something to be read
-    let start_pos = tokens.pos();
     if tokens.eof() {
-        return Err(ParsingError::unexpected_eof(start_pos));
+        return Err(ParsingError::unexpected_eof(tokens.num_lines()));
     }
 
     let mut body = Vec::<Node>::new();
diff --git a/assembly/src/parsers/tests.rs b/assembly/src/parsers/tests.rs
index 797cdccac6..2974203517 100644
--- a/assembly/src/parsers/tests.rs
+++ b/assembly/src/parsers/tests.rs
@@ -2,7 +2,7 @@ use vm_core::Felt;
 
 use super::{
     parse_module, parse_program, BTreeMap, Instruction, LocalProcMap, ModuleAst, Node,
-    ProcedureAst, ProcedureId, ProgramAst,
+    ParsingError, ProcedureAst, ProcedureId, ProgramAst, Token,
 };
 
 // UNIT TESTS
@@ -692,6 +692,72 @@ fn test_ast_program_serde_control_flow() {
     assert_eq!(program, program_deserialized);
 }
 
+#[test]
+fn assert_parsing_line_unmatched_begin() {
+    let source = format!("\n\nbegin\npush.1.2\n\nadd mul");
+    let err = parse_program(&source).err().unwrap();
+    assert_eq!(err, ParsingError::unmatched_begin(&Token::new("begin", 3)));
+}
+
+#[test]
+fn assert_parsing_line_extra_param() {
+    let source = format!("begin add.1.2\nend");
+    let err = parse_program(&source).err().unwrap();
+    assert_eq!(err, ParsingError::extra_param(&Token::new("add.1.2", 1)));
+}
+
+#[test]
+fn assert_parsing_line_invalid_op() {
+    let source = "\
+    begin
+        repeat.3
+            push.1
+            push.0.1
+        end
+
+        # some comments
+
+        if.true
+            and
+            loc_store.0
+        else
+            padw
+        end
+
+        # more comments
+        # to test if line is correct
+
+        while.true
+            push.5.7
+            u32checked_add
+            loc_store.1
+            push.0
+        end
+
+        repeat.3
+            push.2
+            u32overflowing_mulx
+        end
+
+    end";
+    let err = parse_program(&source).err().unwrap();
+    assert_eq!(err, ParsingError::invalid_op(&Token::new("u32overflowing_mulx", 28)));
+}
+
+#[test]
+fn assert_parsing_line_unexpected_eof() {
+    let source = format!("proc.foo\nadd\nend");
+    let err = parse_program(&source).err().unwrap();
+    assert_eq!(err, ParsingError::unexpected_eof(3));
+}
+
+#[test]
+fn assert_parsing_line_unexpected_token() {
+    let source = format!("proc.foo\nadd\nend\n\nmul");
+    let err = parse_program(&source).err().unwrap();
+    assert_eq!(err, ParsingError::unexpected_token(&Token::new("mul", 5), "begin"));
+}
+
 fn assert_program_output(source: &str, procedures: LocalProcMap, body: Vec<Node>) {
     let program = parse_program(source).unwrap();
     assert_eq!(program.body, body);
diff --git a/assembly/src/tokens/mod.rs b/assembly/src/tokens/mod.rs
index 2d3f4f01d2..eeacf1924e 100644
--- a/assembly/src/tokens/mod.rs
+++ b/assembly/src/tokens/mod.rs
@@ -16,8 +16,8 @@ pub use stream::TokenStream;
 pub struct Token<'a> {
     /// The dot-separated parts of a token, e.g. `push.1` is split into `['push', '1']`.
     parts: Vec<&'a str>,
-    /// The token position in the token stream
-    pos: usize,
+    /// The line number linked to this token
+    line: usize,
 }
 
 impl<'a> Token<'a> {
@@ -46,20 +46,20 @@ impl<'a> Token<'a> {
     ///
     /// # Panics
     /// Panic if the `token` parameter is an empty string.
-    pub fn new(token: &'a str, pos: usize) -> Self {
+    pub fn new(token: &'a str, line: usize) -> Self {
         assert!(!token.is_empty(), "token cannot be an empty string");
         Self {
             parts: token.split('.').collect(),
-            pos,
+            line,
         }
     }
 
     // PUBLIC ACCESSORS
     // --------------------------------------------------------------------------------------------
 
-    /// Returns the position of this token in the source.
-    pub fn pos(&self) -> usize {
-        self.pos
+    /// Returns the line number of this token in the source.
+    pub const fn line(&self) -> usize {
+        self.line
     }
 
     /// Returns the number of parts in this token.
@@ -78,11 +78,11 @@ impl<'a> Token<'a> {
     ///
     /// # Panics
     /// Panic is the `token` parameter is an empty string.
-    pub fn update(&mut self, token: &'a str, pos: usize) {
+    pub fn update(&mut self, token: &'a str, line: usize) {
         assert!(!token.is_empty(), "token cannot be an empty string");
         self.parts.clear();
         token.split('.').for_each(|part| self.parts.push(part));
-        self.pos = pos;
+        self.line = line;
     }
 
     // CONTROL TOKEN PARSERS / VALIDATORS
diff --git a/assembly/src/tokens/stream.rs b/assembly/src/tokens/stream.rs
index 388f27b74e..5e2749e5b1 100644
--- a/assembly/src/tokens/stream.rs
+++ b/assembly/src/tokens/stream.rs
@@ -10,6 +10,7 @@ pub const LINE_COMMENT_PREFIX: &str = "#";
 #[derive(Debug)]
 pub struct TokenStream<'a> {
     tokens: Vec<&'a str>,
+    lines: Vec<usize>,
     current: Token<'a>,
     pos: usize,
     temp: Token<'a>,
@@ -26,12 +27,14 @@ impl<'a> TokenStream<'a> {
             return Err(ParsingError::empty_source());
         }
         let mut tokens = Vec::new();
+        let mut lines = Vec::new();
         let mut proc_comments = BTreeMap::new();
         let mut module_comment = None;
-
         let mut comment_builder = CommentBuilder(None);
+        let mut line_number = 0;
 
         for line in source.lines() {
+            line_number += 1;
             let line = line.trim();
             if line.starts_with(DOC_COMMENT_PREFIX) {
                 comment_builder.append_line(line);
@@ -46,7 +49,7 @@ impl<'a> TokenStream<'a> {
                     } else {
                         // since we already have a module comment, this is a procedure comment
                         // which is followed by a blank line.
-                        return Err(ParsingError::dangling_procedure_comment(tokens.len()));
+                        return Err(ParsingError::dangling_procedure_comment(line_number));
                     }
                 }
             } else {
@@ -62,19 +65,21 @@ impl<'a> TokenStream<'a> {
                     if token.starts_with(Token::EXPORT) || token.starts_with(Token::PROC) {
                         proc_comments.insert(tokens.len(), comment_builder.take_comment());
                     } else {
-                        return Err(ParsingError::dangling_procedure_comment(tokens.len()));
+                        return Err(ParsingError::dangling_procedure_comment(line_number));
                     }
                 }
                 tokens.append(&mut line_tokens);
+                lines.resize(tokens.len(), line_number);
             }
         }
 
         if tokens.is_empty() {
             return Err(ParsingError::empty_source());
         }
-        let current = Token::new(tokens[0], 0);
+        let current = Token::new(tokens[0], 1);
         Ok(Self {
             tokens,
+            lines,
             current,
             pos: 0,
             temp: Token::default(),
@@ -91,6 +96,11 @@ impl<'a> TokenStream<'a> {
         self.pos
     }
 
+    /// Returns the current lines count for the stream.
+    pub fn num_lines(&self) -> usize {
+        self.lines[self.pos.min(self.lines.len().saturating_sub(1))]
+    }
+
     /// Returns 'true' all tokens from this stream have been read.
     pub fn eof(&self) -> bool {
         self.pos == self.tokens.len()
@@ -119,7 +129,7 @@ impl<'a> TokenStream<'a> {
         if pos == self.pos {
             self.read()
         } else {
-            self.temp.update(self.tokens[pos], pos);
+            self.temp.update(self.tokens[pos], self.lines[pos]);
             Some(&self.temp)
         }
     }
@@ -129,7 +139,7 @@ impl<'a> TokenStream<'a> {
         if !self.eof() {
             self.pos += 1;
             if !self.eof() {
-                self.current.update(self.tokens[self.pos], self.pos);
+                self.current.update(self.tokens[self.pos], self.lines[self.pos]);
             }
         }
     }