feat: add assembly [Tokenizer]
This commit introduces [Tokenizer], a token iterator that decouples the
tokenizing logic from the parsing logic of [TokenStream].

It lays the foundation for binding tokens to their source locations,
regardless of parsing constraints.

This commit aims to unblock the source-mapping work.

Related issue: #857
vlopes11 committed Apr 18, 2023
1 parent 4070dcf commit 1fa027e
Showing 3 changed files with 517 additions and 77 deletions.
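Note: the new assembly/src/tokens/tokenizer.rs is not expanded on this page, but the rewritten TokenStream::new loop below implies its shape: the tokenizer is built via From<&str> and yields one (token, doc lines, line number) tuple per item. A minimal sketch of that interface follows; the item type and internal state are inferred from the diff, not taken from the commit itself.

// Sketch only: interface implied by `for (token, docs, line) in Tokenizer::from(source)`
// in stream.rs below. The real tokenizer.rs is not shown here, so the internal
// state and the exact item type are assumptions.
pub struct Tokenizer<'a> {
    source: &'a str,
    // ...cursor and line-tracking state elided
}

impl<'a> From<&'a str> for Tokenizer<'a> {
    fn from(source: &'a str) -> Self {
        Self { source }
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    // `Some(token)` carries the next token plus any `#!` doc lines collected
    // above it; `None` marks doc lines with no token to bind to (a module
    // comment when it appears first, a dangling comment otherwise).
    type Item = (Option<&'a str>, Vec<&'a str>, u32);

    fn next(&mut self) -> Option<Self::Item> {
        // tokenizing logic elided; see assembly/src/tokens/tokenizer.rs
        todo!()
    }
}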
8 changes: 8 additions & 0 deletions assembly/src/tokens/mod.rs
@@ -3,6 +3,9 @@ use super::{
 };
 use core::fmt;
 
+mod tokenizer;
+pub use tokenizer::Tokenizer;
+
 mod stream;
 pub use stream::TokenStream;
 
@@ -40,6 +43,11 @@ impl<'a> Token<'a> {
     pub const SYSCALL: &'static str = "syscall";
     pub const WHILE: &'static str = "while";
 
+    // COMMENT DELIMITERS
+    // --------------------------------------------------------------------------------------------
+    pub const DOC_COMMENT_PREFIX: &str = "#!";
+    pub const LINE_COMMENT_PREFIX: char = '#';
+
     // CONSTRUCTOR
     // --------------------------------------------------------------------------------------------
     /// Returns a new token created from the specified string and position.
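The two new constants split comment handling: `#!` introduces a doc comment that binds to the next procedure, while a bare `#` starts a plain line comment that the tokenizer discards. A hypothetical helper pair illustrating the distinction (not part of the commit):

// Hypothetical helpers (not in the commit) showing how the two prefixes differ.
fn is_doc_comment(line: &str) -> bool {
    // `#!` starts a doc comment, to be bound to the next `proc`/`export` token
    line.trim_start().starts_with("#!")
}

fn is_line_comment(line: &str) -> bool {
    // a bare `#` starts a plain comment, which is skipped entirely
    let line = line.trim_start();
    line.starts_with('#') && !line.starts_with("#!")
}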
114 changes: 37 additions & 77 deletions assembly/src/tokens/stream.rs
@@ -1,9 +1,6 @@
-use super::{BTreeMap, ParsingError, String, Token, Vec};
+use super::{BTreeMap, ParsingError, String, Token, Tokenizer, Vec};
 use core::fmt;
 
-pub const DOC_COMMENT_PREFIX: &str = "#!";
-pub const LINE_COMMENT_PREFIX: &str = "#";
-
 // TOKEN STREAM
 // ================================================================================================
 
@@ -23,59 +20,44 @@ impl<'a> TokenStream<'a> {
     // --------------------------------------------------------------------------------------------
     /// TODO: add comments
     pub fn new(source: &'a str) -> Result<Self, ParsingError> {
         // halt if empty
         if source.is_empty() {
             return Err(ParsingError::empty_source());
         }
 
         // TODO we should probably have a dedicated syntax for module docs. In Rust, it is `//!`.
         let mut tokens = Vec::new();
         let mut lines = Vec::new();
-        let mut proc_comments = BTreeMap::new();
         let mut module_comment = None;
-        let mut comment_builder = CommentBuilder(None);
-        let mut line_number = 0;
-
-        for line in source.lines() {
-            line_number += 1;
-            let line = line.trim();
-            if line.starts_with(DOC_COMMENT_PREFIX) {
-                comment_builder.append_line(line);
-            } else if line.starts_with(LINE_COMMENT_PREFIX) {
-                continue;
-            } else if line.is_empty() {
-                if !comment_builder.is_empty() {
-                    if tokens.is_empty() && module_comment.is_none() {
-                        // if we haven't read any tokens yet, but already have built a comment, a
-                        // new line must indicate the end of a module comment.
-                        module_comment = comment_builder.take_comment();
-                    } else {
-                        // since we already have a module comment, this is a procedure comment
-                        // which is followed by a blank line.
-                        return Err(ParsingError::dangling_procedure_comment(line_number));
-                    }
-                }
-            } else {
-                let mut line_tokens = line
-                    .split_whitespace()
-                    .take_while(|&token| !token.starts_with(LINE_COMMENT_PREFIX))
-                    .collect::<Vec<_>>();
-
-                if !comment_builder.is_empty() {
-                    // procedure comment should always be followed by a procedure token
-                    debug_assert!(!line_tokens.is_empty());
-                    let token = line_tokens[0];
+        let mut proc_comments = BTreeMap::new();
+
+        // fetch all tokens
+        for (token, docs, line) in Tokenizer::from(source) {
+            match token {
+                Some(token) => {
                     if token.starts_with(Token::EXPORT) || token.starts_with(Token::PROC) {
-                        proc_comments.insert(tokens.len(), comment_builder.take_comment());
-                    } else {
-                        return Err(ParsingError::dangling_procedure_comment(line_number));
+                        let doc_comment = build_comment(&docs);
+                        proc_comments.insert(tokens.len(), doc_comment);
+                    } else if !docs.is_empty() {
+                        return Err(ParsingError::dangling_procedure_comment(line as usize));
                     }
+
+                    tokens.push(token);
+                    lines.push(line as usize);
+                }
+
+                None if tokens.is_empty() => {
+                    module_comment = build_comment(&docs);
                 }
-                tokens.append(&mut line_tokens);
-                lines.resize(tokens.len(), line_number);
+
+                None => return Err(ParsingError::dangling_procedure_comment(line as usize)),
             }
         }
 
+        if tokens.is_empty() {
+            return Err(ParsingError::empty_source());
+        }
+
         let current = Token::new(tokens[0], 1);
         Ok(Self {
             tokens,
@@ -159,40 +141,18 @@ impl<'a> fmt::Display for TokenStream<'a> {
     }
 }
 
-#[derive(Debug)]
-pub struct CommentBuilder(Option<String>);
-
-impl CommentBuilder {
-    pub fn append_line(&mut self, line: &str) {
-        let prepared_line = prepare_line(line);
-        if !prepared_line.is_empty() {
-            match &mut self.0 {
-                Some(comment) => {
-                    comment.push('\n');
-                    comment.push_str(prepared_line);
-                }
-                None => {
-                    self.0 = Some(String::from(prepared_line));
-                }
-            }
-        }
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.0.is_none()
-    }
-
-    pub fn take_comment(&mut self) -> Option<String> {
-        self.0.take()
-    }
-}
+// HELPERS
+// ================================================================================================
 
-/// Removes `prefix` from provided `line` and trims additional whitespaces from start and end of
-/// the `line`
-pub fn prepare_line(line: &str) -> &str {
-    // We should panic if strip_prefix returns None since it is our internal parsing error
-    line.trim()
-        .strip_prefix(DOC_COMMENT_PREFIX)
-        .expect("Current line is not a comment")
-        .trim()
+fn build_comment(docs: &[&str]) -> Option<String> {
+    let last = docs.len().saturating_sub(1);
+    let docs: String = docs
+        .iter()
+        .enumerate()
+        .map(|(i, d)| {
+            let lb = if last == i { "" } else { "\n" };
+            format!("{d}{lb}")
+        })
+        .collect();
+    (!docs.is_empty()).then_some(docs)
 }
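For intuition, the new build_comment helper is behaviorally a newline-join over the collected doc lines, returning None when the slice is empty; an equivalent formulation (not from the commit):

// Equivalent to build_comment above: join doc lines with '\n',
// mapping an empty `docs` slice to `None`.
fn build_comment_alt(docs: &[&str]) -> Option<String> {
    (!docs.is_empty()).then(|| docs.join("\n"))
}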
472 changes: 472 additions & 0 deletions assembly/src/tokens/tokenizer.rs
(new file; diff not expanded on the page)
