feat: add assembly [Tokenizer]
This commit introduces [Tokenizer], a token iterator that decouples the
tokenizing logic from the parsing logic of [TokenStream].

It lays the foundation for binding tokens to their source locations,
regardless of parsing constraints.

This commit aims to unblock the source-mapping work.

Related issue: #857
vlopes11 committed Apr 18, 2023
1 parent 4070dcf commit 1fa027e
Showing 3 changed files with 517 additions and 77 deletions.
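Note: the new assembly/src/tokens/tokenizer.rs is not expanded on this page, but the rewritten TokenStream::new loop below implies its shape: the tokenizer is built via From<&str> and yields one (token, doc lines, line number) tuple per item. A minimal sketch of that interface follows; the item type and internal state are inferred from the diff, not taken from the commit itself.

// Sketch only: interface implied by `for (token, docs, line) in Tokenizer::from(source)`
// in stream.rs below. The real tokenizer.rs is not shown here, so the internal
// state and the exact item type are assumptions.
pub struct Tokenizer<'a> {
    source: &'a str,
    // ...cursor and line-tracking state elided
}

impl<'a> From<&'a str> for Tokenizer<'a> {
    fn from(source: &'a str) -> Self {
        Self { source }
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    // `Some(token)` carries the next token plus any `#!` doc lines collected
    // above it; `None` marks doc lines with no token to bind to (a module
    // comment when it appears first, a dangling comment otherwise).
    type Item = (Option<&'a str>, Vec<&'a str>, u32);

    fn next(&mut self) -> Option<Self::Item> {
        // tokenizing logic elided; see assembly/src/tokens/tokenizer.rs
        todo!()
    }
}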
8 changes: 8 additions & 0 deletions assembly/src/tokens/mod.rs
@@ -3,6 +3,9 @@ use super::{
 };
 use core::fmt;
 
+mod tokenizer;
+pub use tokenizer::Tokenizer;
+
 mod stream;
 pub use stream::TokenStream;
 
@@ -40,6 +43,11 @@ impl<'a> Token<'a> {
     pub const SYSCALL: &'static str = "syscall";
     pub const WHILE: &'static str = "while";
 
+    // COMMENT DELIMITERS
+    // --------------------------------------------------------------------------------------------
+    pub const DOC_COMMENT_PREFIX: &str = "#!";
+    pub const LINE_COMMENT_PREFIX: char = '#';
+
     // CONSTRUCTOR
     // --------------------------------------------------------------------------------------------
     /// Returns a new token created from the specified string and position.
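The two new constants split comment handling: `#!` introduces a doc comment that binds to the next procedure, while a bare `#` starts a plain line comment that the tokenizer discards. A hypothetical helper pair illustrating the distinction (not part of the commit):

// Hypothetical helpers (not in the commit) showing how the two prefixes differ.
fn is_doc_comment(line: &str) -> bool {
    // `#!` starts a doc comment, to be bound to the next `proc`/`export` token
    line.trim_start().starts_with("#!")
}

fn is_line_comment(line: &str) -> bool {
    // a bare `#` starts a plain comment, which is skipped entirely
    let line = line.trim_start();
    line.starts_with('#') && !line.starts_with("#!")
}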
114 changes: 37 additions & 77 deletions assembly/src/tokens/stream.rs
@@ -1,9 +1,6 @@
-use super::{BTreeMap, ParsingError, String, Token, Vec};
+use super::{BTreeMap, ParsingError, String, Token, Tokenizer, Vec};
 use core::fmt;
 
-pub const DOC_COMMENT_PREFIX: &str = "#!";
-pub const LINE_COMMENT_PREFIX: &str = "#";
-
 // TOKEN STREAM
 // ================================================================================================
 
@@ -23,59 +20,44 @@ impl<'a> TokenStream<'a> {
     // --------------------------------------------------------------------------------------------
     /// TODO: add comments
     pub fn new(source: &'a str) -> Result<Self, ParsingError> {
         // halt if empty
         if source.is_empty() {
             return Err(ParsingError::empty_source());
         }
 
         // TODO we should probably have a dedicated syntax for module docs. In Rust, it is `//!`.
         let mut tokens = Vec::new();
         let mut lines = Vec::new();
-        let mut proc_comments = BTreeMap::new();
         let mut module_comment = None;
-        let mut comment_builder = CommentBuilder(None);
-        let mut line_number = 0;
-
-        for line in source.lines() {
-            line_number += 1;
-            let line = line.trim();
-            if line.starts_with(DOC_COMMENT_PREFIX) {
-                comment_builder.append_line(line);
-            } else if line.starts_with(LINE_COMMENT_PREFIX) {
-                continue;
-            } else if line.is_empty() {
-                if !comment_builder.is_empty() {
-                    if tokens.is_empty() && module_comment.is_none() {
-                        // if we haven't read any tokens yet, but already have built a comment, a
-                        // new line must indicate the end of a module comment.
-                        module_comment = comment_builder.take_comment();
-                    } else {
-                        // since we already have a module comment, this is a procedure comment
-                        // which is followed by a blank line.
-                        return Err(ParsingError::dangling_procedure_comment(line_number));
-                    }
-                }
-            } else {
-                let mut line_tokens = line
-                    .split_whitespace()
-                    .take_while(|&token| !token.starts_with(LINE_COMMENT_PREFIX))
-                    .collect::<Vec<_>>();
-
-                if !comment_builder.is_empty() {
-                    // procedure comment should always be followed by a procedure token
-                    debug_assert!(!line_tokens.is_empty());
-                    let token = line_tokens[0];
+        let mut proc_comments = BTreeMap::new();
+
+        // fetch all tokens
+        for (token, docs, line) in Tokenizer::from(source) {
+            match token {
+                Some(token) => {
                     if token.starts_with(Token::EXPORT) || token.starts_with(Token::PROC) {
-                        proc_comments.insert(tokens.len(), comment_builder.take_comment());
-                    } else {
-                        return Err(ParsingError::dangling_procedure_comment(line_number));
+                        let doc_comment = build_comment(&docs);
+                        proc_comments.insert(tokens.len(), doc_comment);
+                    } else if !docs.is_empty() {
+                        return Err(ParsingError::dangling_procedure_comment(line as usize));
                     }
+
+                    tokens.push(token);
+                    lines.push(line as usize);
+                }
+
+                None if tokens.is_empty() => {
+                    module_comment = build_comment(&docs);
                 }
-                tokens.append(&mut line_tokens);
-                lines.resize(tokens.len(), line_number);
+
+                None => return Err(ParsingError::dangling_procedure_comment(line as usize)),
             }
         }
 
+        if tokens.is_empty() {
+            return Err(ParsingError::empty_source());
+        }
+
         let current = Token::new(tokens[0], 1);
         Ok(Self {
             tokens,
@@ -159,40 +141,18 @@ impl<'a> fmt::Display for TokenStream<'a> {
     }
 }
 
-#[derive(Debug)]
-pub struct CommentBuilder(Option<String>);
-
-impl CommentBuilder {
-    pub fn append_line(&mut self, line: &str) {
-        let prepared_line = prepare_line(line);
-        if !prepared_line.is_empty() {
-            match &mut self.0 {
-                Some(comment) => {
-                    comment.push('\n');
-                    comment.push_str(prepared_line);
-                }
-                None => {
-                    self.0 = Some(String::from(prepared_line));
-                }
-            }
-        }
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.0.is_none()
-    }
-
-    pub fn take_comment(&mut self) -> Option<String> {
-        self.0.take()
-    }
-}
+// HELPERS
+// ================================================================================================
 
-/// Removes `prefix` from provided `line` and trims additional whitespaces from start and end of
-/// the `line`
-pub fn prepare_line(line: &str) -> &str {
-    // We should panic if strip_prefix returns None since it is our internal parsing error
-    line.trim()
-        .strip_prefix(DOC_COMMENT_PREFIX)
-        .expect("Current line is not a comment")
-        .trim()
+fn build_comment(docs: &[&str]) -> Option<String> {
+    let last = docs.len().saturating_sub(1);
+    let docs: String = docs
+        .iter()
+        .enumerate()
+        .map(|(i, d)| {
+            let lb = if last == i { "" } else { "\n" };
+            format!("{d}{lb}")
+        })
+        .collect();
+    (!docs.is_empty()).then_some(docs)
 }
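For intuition, the new build_comment helper is behaviorally a newline-join over the collected doc lines, returning None when the slice is empty; an equivalent formulation (not from the commit):

// Equivalent to build_comment above: join doc lines with '\n',
// mapping an empty `docs` slice to `None`.
fn build_comment_alt(docs: &[&str]) -> Option<String> {
    (!docs.is_empty()).then(|| docs.join("\n"))
}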
472 changes: 472 additions & 0 deletions assembly/src/tokens/tokenizer.rs
(new file; diff not expanded on the page)
