From b67cff849cf6808037a8dac8a1f882be07cca6e7 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:59:34 +1000 Subject: [PATCH 01/17] Add miette dep --- Cargo.lock | 240 +++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + 2 files changed, 241 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 362b36e..ed76f0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "aho-corasick" version = "1.1.3" @@ -75,6 +90,36 @@ dependencies = [ "wait-timeout", ] +[[package]] +name = "backtrace" +version = "0.3.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "backtrace-ext" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "537beee3be4a18fb023b570f80e3ae28003db9167a751266b259926e25539d50" +dependencies = [ + "backtrace", +] + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + [[package]] name = "bstr" version = "1.9.1" @@ -86,6 +131,18 @@ dependencies = [ "serde", ] +[[package]] +name = "cc" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "clap" version = "4.5.4" @@ -154,6 +211,22 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "gimli" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" + [[package]] name = "glob" version = "0.3.1" @@ -166,6 +239,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "is_ci" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7655c9839580ee829dfacba1d1278c2b7883e50a277ff7541299489d6bdfdc45" + [[package]] name = "is_terminal_polyfill" version = "1.70.0" @@ -181,6 +260,7 @@ dependencies = [ "colored", "glob", "lazy_static", + "miette", "regex", ] @@ -196,12 +276,73 @@ version 
= "0.2.154" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346" +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + [[package]] name = "memchr" version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +[[package]] +name = "miette" +version = "7.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4edc8853320c2a0dab800fbda86253c8938f6ea88510dc92c5f1ed20e794afc1" +dependencies = [ + "backtrace", + "backtrace-ext", + "cfg-if", + "miette-derive", + "owo-colors", + "supports-color", + "supports-hyperlinks", + "supports-unicode", + "terminal_size", + "textwrap", + "thiserror", + "unicode-width", +] + +[[package]] +name = "miette-derive" +version = "7.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcf09caffaac8068c346b6df2a7fc27a177fd20b39421a39ce0a211bde679a6c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "miniz_oxide" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +dependencies = [ + "adler", +] + +[[package]] +name = "object" +version = "0.36.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" +dependencies = [ + "memchr", +] + +[[package]] +name = "owo-colors" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caff54706df99d2a78a5a4e3455ff45448d81ef1bb63c22cd14052ca0e993a3f" + [[package]] name = "predicates" version = "3.1.0" @@ -276,6 +417,25 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + [[package]] name = "serde" version = "1.0.201" @@ -296,12 +456,39 @@ dependencies = [ "syn", ] +[[package]] +name = "smawk" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c388c1b5e93756d0c740965c41e8822f866621d41acbdf6336a6a168f8840c" + [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "supports-color" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9829b314621dfc575df4e409e79f9d6a66a3bd707ab73f23cb4aa3a854ac854f" +dependencies = [ + "is_ci", +] + +[[package]] +name = "supports-hyperlinks" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2c0a1e5168041f5f3ff68ff7d95dcb9c8749df29f6e7e89ada40dd4c9de404ee" + +[[package]] +name = "supports-unicode" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7401a30af6cb5818bb64852270bb722533397edcfc7344954a38f420819ece2" + [[package]] name = "syn" version = "2.0.61" @@ -313,18 +500,71 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "terminal_size" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +dependencies = [ + "rustix", + "windows-sys 0.48.0", +] + [[package]] name = "termtree" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" +[[package]] +name = "textwrap" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9" +dependencies = [ + "smawk", + "unicode-linebreak", + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-linebreak" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" + +[[package]] +name = "unicode-width" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" + [[package]] name = "utf8parse" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 1e8c232..4694274 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ clap = { version = "4.5.4", features = ["derive"] } colored = "2.1.0" regex = "1.10.6" lazy_static = "1.5.0" +miette = { version = "7.2.0", features = ["fancy"] } [dev-dependencies] assert_cmd = "2.0.14" From ae1f126761535b8516ce9c4498f5fc47a565393e Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:59:52 +1000 Subject: [PATCH 02/17] Parser progress --- src/lexer/cursor.rs | 5 +++ src/lexer/mod.rs | 77 ++++++++++++++++++++++++++++++++++++++++++++- src/main.rs | 8 ++++- src/parser.rs | 33 +++++++++++++------ 4 files changed, 112 insertions(+), 11 deletions(-) diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs index f44c614..83166f0 100644 --- a/src/lexer/cursor.rs +++ b/src/lexer/cursor.rs @@ -20,6 +20,11 @@ impl<'a> Cursor<'a> { } } + pub fn get_next(&self, len: usize) -> &'a str { + println!("{}", &self.chars[self.curr_pt..(self.curr_pt + len)]); + &self.chars[self.curr_pt..(self.curr_pt + len)] + } + /// File is finished parsing pub fn is_eof(&self) -> bool { self.len_remaining == 0 diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 6604850..b05e007 100644 --- a/src/lexer/mod.rs 
+++ b/src/lexer/mod.rs @@ -20,9 +20,56 @@ pub enum LiteralKind { Str, } +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum InstrKind { + Add, + And, + Branch, + Jump, + JumpSub, + JumpSubReg, + Load, + LoadInd, + LoadReg, + LoadAddr, + Not, + Return, + Interrupt, + Store, + StoreInd, +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum DirecKind { + Orig, + Stringz, + Blkw, + Fill, + Alias, + Macro, + End, + Export, + Import, +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum TrapKind { + /// Get a character from standard input + Getc, + /// Output a single character + Out, + /// Print string + Puts, + In, + Putsp, + Halt, + Trap, +} + #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum TokenKind { Ident, + Instr(InstrKind), Lit(LiteralKind), Comment, Direc, @@ -79,8 +126,36 @@ impl Cursor<'_> { for (kind, re) in PATTERNS.iter() { if let Some(tok) = re.find(self.at_curr_pt()) { + // Parse into precise definition + let mut kind = *kind; + kind = match kind { + TokenKind::Ident => match self.get_next(tok.len()).to_lowercase().as_str() { + "add" => TokenKind::Instr(InstrKind::Add), + "and" => TokenKind::Instr(InstrKind::And), + "br" | "brn" | "brz" | "brp" | "brnz" | "brnzp" | "brnp" | "brzp" => { + TokenKind::Instr(InstrKind::Branch) + } + "jmp" => TokenKind::Instr(InstrKind::Jump), + "jsr" => TokenKind::Instr(InstrKind::JumpSub), + "jsrr" => TokenKind::Instr(InstrKind::JumpSubReg), + "ld" => TokenKind::Instr(InstrKind::Load), + "ldi" => TokenKind::Instr(InstrKind::LoadInd), + "ldr" => TokenKind::Instr(InstrKind::LoadReg), + "lea" => TokenKind::Instr(InstrKind::LoadAddr), + "not" => TokenKind::Instr(InstrKind::Not), + "ret" => TokenKind::Instr(InstrKind::Return), + "rti" => TokenKind::Instr(InstrKind::Interrupt), + "st" => TokenKind::Instr(InstrKind::Store), + "sti" => TokenKind::Instr(InstrKind::StoreInd), + _ => TokenKind::Ident, + }, + TokenKind::Direc => { + todo!() + } + _ => kind, + }; let token = Token { - kind: *kind, + kind, span: Span::new(Idx(self.curr_pt() as u32), tok.len() as u16), }; self.advance(tok.len()); diff --git a/src/main.rs b/src/main.rs index 9600e74..36873be 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,6 +5,8 @@ use std::fs; use clap::{Parser, Subcommand}; use colored::Colorize; use lexer::{tokenize, TokenKind}; +use miette::Result; +use parser::AsmParser; mod lexer; mod ops; @@ -59,7 +61,7 @@ enum Command { }, } -fn main() { +fn main() -> Result<()> { let args = Args::parse(); if let Some(command) = args.command { @@ -70,6 +72,10 @@ fn main() { for tok in tokenize(&file).filter(|tok| tok.kind != TokenKind::Junk) { println!("{:?} {}", tok, &file[tok.span.as_range()]); } + + let mut parse = AsmParser::from(file.as_str()); + parse.parse()?; + Ok(()) } Command::Clean { name } => todo!(), Command::Watch { name } => todo!(), diff --git a/src/parser.rs b/src/parser.rs index e865b62..b949a9e 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,30 +1,45 @@ -use crate::lexer::{cursor::Cursor, TokenKind}; +use std::error::Error; + +use miette::{miette, Result}; + +use crate::lexer::{cursor::Cursor, Token, TokenKind}; /// Transforms token stream into 'AST' -pub struct Parser<'source> { +pub struct AsmParser<'source> { /// Reference to the source file src: &'source str, /// Used to parse tokens cur: Cursor<'source>, } -impl<'a> From<&'a str> for Parser<'a> { +impl<'a> From<&'a str> for AsmParser<'a> { fn from(value: &'a str) -> Self { - Parser { + AsmParser { src: value, cur: Cursor::new(value), } } } -impl<'source> Parser<'source> { - pub fn 
parse(&self) { +impl<'source> AsmParser<'source> { + pub fn parse(&mut self) -> Result<()> { // First, check that there is an .orig directive with an appropriate value. - todo!() + let orig = self.expect(TokenKind::Direc)?; + let addr = self.expect(TokenKind::Lit(crate::lexer::LiteralKind::Hex)); + + Ok(()) } - pub fn expect(kind: TokenKind) { - todo!() + pub fn expect(&mut self, kind: TokenKind) -> Result<Token> { + let tok = self.cur.advance_token(); + if tok.kind == kind { + return Ok(tok); + } + Err(miette!( + "ParseError: expected token of type {:?}, found {:?}", + kind, + tok + )) } pub fn parse_direc(&self) { From 221486a8c8d7ea80e1c30f7d53a80f408db89fd5 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:16:36 +1000 Subject: [PATCH 03/17] Add indexmap dependency --- Cargo.lock | 39 +++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 ++ 2 files changed, 41 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index ed76f0c..dde14b2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -131,6 +131,12 @@ dependencies = [ "serde", ] +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cc" version = "1.1.10" @@ -211,6 +217,12 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.9" @@ -221,6 +233,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "gimli" version = "0.29.0" @@ -233,12 +254,28 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "indexmap" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "is_ci" version = "1.2.0" @@ -258,7 +295,9 @@ dependencies = [ "assert_cmd", "clap", "colored", + "fxhash", "glob", + "indexmap", "lazy_static", "miette", "regex", diff --git a/Cargo.toml b/Cargo.toml index 4694274..294e1e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,8 @@ colored = "2.1.0" regex = "1.10.6" lazy_static = "1.5.0" miette = { version = "7.2.0", features = ["fancy"] } +indexmap = "2.4.0" +fxhash = "0.2.1" [dev-dependencies] assert_cmd = "2.0.14" From 5ada5680a51e937b4e06e6840cf4cc021e7acc7a Mon Sep 17 00:00:00 2001 From: Artemis Rosman 
<73006620+rozukke@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:17:16 +1000 Subject: [PATCH 04/17] Redo lexer without regex --- src/lexer/cursor.rs | 52 ++++++----- src/lexer/mod.rs | 218 ++++++++++++++++++-------- src/main.rs | 6 +- 3 files changed, 124 insertions(+), 152 deletions(-) diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs index 83166f0..7a56ee4 100644 --- a/src/lexer/cursor.rs +++ b/src/lexer/cursor.rs @@ -2,52 +2,60 @@ // Heavily inspired and referenced from `rustc_lexer` and adapted to suit the project. // See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html +use std::str::Chars; + /// Peekable iterator over a char sequence. pub struct Cursor<'a> { len_remaining: usize, - /// Index that the cursor is pointing to in the source - curr_pt: usize, /// Iterator over chars in a &str - chars: &'a str, + chars: Chars<'a>, } +pub(crate) const NULL_CHAR: char = '\0'; + impl<'a> Cursor<'a> { pub fn new(input: &'a str) -> Cursor<'a> { Cursor { len_remaining: input.len(), - curr_pt: 0, - chars: input, + chars: input.chars(), } } - pub fn get_next(&self, len: usize) -> &'a str { - println!("{}", &self.chars[self.curr_pt..(self.curr_pt + len)]); - &self.chars[self.curr_pt..(self.curr_pt + len)] + pub fn as_str(&self) -> &'a str { + self.chars.as_str() + } + + /// Returns next character without consuming it. + pub fn first(&self) -> char { + self.chars.clone().next().unwrap_or(NULL_CHAR) } /// File is finished parsing pub fn is_eof(&self) -> bool { - self.len_remaining == 0 + self.chars.as_str().is_empty() } - /// Return slice of input starting at the current point of the cursor - pub fn at_curr_pt(&self) -> &'a str { - &self.chars[self.curr_pt..] + /// Advance by one character + pub fn bump(&mut self) -> Option<char> { + let c = self.chars.next()?; + Some(c) } - /// Move cursor ahead in the input by given amount - pub fn advance(&mut self, amt: usize) { - self.curr_pt += amt; - self.len_remaining -= amt; + /// Return consumed tokens + /// Basic counter that is reset after each token. 
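+ /// (e.g. three chars into lexing `add` this reads 3, until `reset_pos` is called)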
+ pub(crate) fn pos_in_token(&self) -> u32 { + (self.len_remaining - self.chars.as_str().len()) as u32 } - /// Advance by one character - pub fn bump(&mut self) { - self.advance(1) + /// Resets the number of consumed chars + pub(crate) fn reset_pos(&mut self) { + self.len_remaining = self.chars.as_str().len(); } - /// Returns current cursor position - pub fn curr_pt(&self) -> usize { - self.curr_pt + /// Consume until given function returns false + pub(crate) fn take_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + while predicate(self.first()) && !self.is_eof() { + self.bump(); + } } } diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index b05e007..36d1ecc 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -7,85 +7,45 @@ use crate::symbol::Register; pub mod cursor; +/// A 'light' token that only carries basic and easily derivable info #[derive(Debug)] -pub struct Token { - pub kind: TokenKind, - pub span: Span, +pub struct LToken { + pub kind: LTokenKind, + pub len: u32, +} + +impl LToken { + pub fn new(kind: LTokenKind, len: u32) -> Self { + LToken { kind, len } + } } #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum LiteralKind { Hex, Dec, - Str, -} - -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum InstrKind { - Add, - And, - Branch, - Jump, - JumpSub, - JumpSubReg, - Load, - LoadInd, - LoadReg, - LoadAddr, - Not, - Return, - Interrupt, - Store, - StoreInd, + Str { terminated: bool }, } #[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum DirecKind { - Orig, - Stringz, - Blkw, - Fill, - Alias, - Macro, - End, - Export, - Import, -} - -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum TrapKind { - /// Get a character from standard input - Getc, - /// Output a single character - Out, - /// Print string - Puts, - In, - Putsp, - Halt, - Trap, -} - -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum TokenKind { +pub enum LTokenKind { Ident, - Instr(InstrKind), Lit(LiteralKind), Comment, Direc, Reg, - /// Commas and whitespace - Junk, + /// Also includes commas + Whitespace, Unknown, Eof, } /// Not actually used in parsing, more for debug purposes. -pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ { +pub fn tokenize(input: &str) -> impl Iterator<Item = LToken> + '_ { let mut cursor = Cursor::new(input); std::iter::from_fn(move || { let token = cursor.advance_token(); - if token.kind != TokenKind::Eof { + if token.kind != LTokenKind::Eof { Some(token) } else { None @@ -93,80 +53,84 @@ pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ { }) } -lazy_static! { - // Order is important since some patterns are subpatterns of others. - // Do NOT rearrange without a good hard think. - static ref PATTERNS: Vec<(TokenKind, Regex)> = vec![ - (TokenKind::Junk, Regex::new(r"^[,\s]+").unwrap()), - ( - TokenKind::Lit(LiteralKind::Hex), - Regex::new(r"^(0x|x)[0-9a-fA-F]+\b").unwrap(), - ), - ( - TokenKind::Lit(LiteralKind::Dec), - Regex::new(r"^#[0-9]+\b").unwrap(), - ), - (TokenKind::Reg, Regex::new(r"^[rR][0-8]\b").unwrap()), - // Includes instructions, branches, and labels. - (TokenKind::Ident, Regex::new(r"^[a-zA-Z_]\w*\b").unwrap()), - (TokenKind::Comment, Regex::new(r"^;[^\n]*").unwrap()), - (TokenKind::Direc, Regex::new(r"^\.[a-zA-Z_]*\b").unwrap()), - (TokenKind::Lit(LiteralKind::Str), Regex::new(r#"^"([^"\\]|\\.)*""#).unwrap()) - ]; +/// Test if a character is considered to be whitespace. 
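+/// Note that commas also count, so `ADD R0, R0, #1` needs no separate separator handling.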
+pub(crate) fn is_whitespace(c: char) -> bool { + // Commas are essentially whitespace in LC3 + matches!(c, ' ' | '\n' | '\t' | '\r' | ',') } -impl Cursor<'_> { - pub fn advance_token(&mut self) -> Token { - if self.is_eof() { - return Token { - kind: TokenKind::Eof, - span: Span::default(), - }; - } +pub(crate) fn is_id(c: char) -> bool { + // Non-prefixed numerical literals are considered identifiers. + // This is because line numbers can be used as labels. + matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') +} - for (kind, re) in PATTERNS.iter() { - if let Some(tok) = re.find(self.at_curr_pt()) { - // Parse into precise definition - let mut kind = *kind; - kind = match kind { - TokenKind::Ident => match self.get_next(tok.len()).to_lowercase().as_str() { - "add" => TokenKind::Instr(InstrKind::Add), - "and" => TokenKind::Instr(InstrKind::And), - "br" | "brn" | "brz" | "brp" | "brnz" | "brnzp" | "brnp" | "brzp" => { - TokenKind::Instr(InstrKind::Branch) - } - "jmp" => TokenKind::Instr(InstrKind::Jump), - "jsr" => TokenKind::Instr(InstrKind::JumpSub), - "jsrr" => TokenKind::Instr(InstrKind::JumpSubReg), - "ld" => TokenKind::Instr(InstrKind::Load), - "ldi" => TokenKind::Instr(InstrKind::LoadInd), - "ldr" => TokenKind::Instr(InstrKind::LoadReg), - "lea" => TokenKind::Instr(InstrKind::LoadAddr), - "not" => TokenKind::Instr(InstrKind::Not), - "ret" => TokenKind::Instr(InstrKind::Return), - "rti" => TokenKind::Instr(InstrKind::Interrupt), - "st" => TokenKind::Instr(InstrKind::Store), - "sti" => TokenKind::Instr(InstrKind::StoreInd), - _ => TokenKind::Ident, - }, - TokenKind::Direc => { - todo!() - } - _ => kind, - }; - let token = Token { - kind, - span: Span::new(Idx(self.curr_pt() as u32), tok.len() as u16), - }; - self.advance(tok.len()); - return token; - } - } +pub(crate) fn is_num(c: char) -> bool { + matches!(c, '0'..='9') +} - self.bump(); - Token { - kind: TokenKind::Unknown, - span: Span::new(Idx((self.curr_pt() - 1) as u32), 1u16), - } +pub(crate) fn is_hex(c: char) -> bool { + matches!(c, 'a'..='f' | 'A'..='F' | '0'..='9') +} + +impl Cursor<'_> { + pub fn advance_token(&mut self) -> LToken { + let first_char = match self.bump() { + Some(c) => c, + None => return LToken::new(LTokenKind::Eof, 0), + }; + let token_kind = match first_char { + ';' => { + self.take_while(|c| c != '\n'); + LTokenKind::Comment + } + c if is_whitespace(c) => { + self.take_while(is_whitespace); + LTokenKind::Whitespace + } + // Hex literals + 'x' | 'X' => { + self.take_while(is_hex); + LTokenKind::Lit(LiteralKind::Hex) + } + '0' => match self.first() { + 'x' | 'X' => { + self.take_while(is_hex); + LTokenKind::Lit(LiteralKind::Hex) + } + _ => { + self.take_while(is_id); + LTokenKind::Ident + } + }, + // Identifiers should be checked after everything else that overlaps. + c if is_id(c) => { + self.take_while(is_id); + LTokenKind::Ident + } + // Decimal literal + '#' => { + if self.first() == '-' { + self.bump(); + } + self.take_while(is_num); + LTokenKind::Lit(LiteralKind::Dec) + } + // Directive + '.' 
=> { + self.take_while(is_id); + LTokenKind::Direc + } + // String literal + // TODO: Allow for escaped characters and the terminated thing + '"' => { + self.take_while(|c| c != '"'); + LTokenKind::Lit(LiteralKind::Str { terminated: true }) + } + _ => LTokenKind::Unknown, + }; + let res = LToken::new(token_kind, self.pos_in_token()); + self.reset_pos(); + res } } diff --git a/src/main.rs b/src/main.rs index 36873be..3787be5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,7 +4,7 @@ use std::fs; use clap::{Parser, Subcommand}; use colored::Colorize; -use lexer::{tokenize, TokenKind}; +use lexer::{tokenize, LTokenKind}; use miette::Result; use parser::AsmParser; @@ -69,8 +69,8 @@ fn main() -> Result<()> { Command::Run { os, name } => todo!(), Command::Compile { name, dest } => { let file = fs::read_to_string(name).unwrap(); - for tok in tokenize(&file).filter(|tok| tok.kind != TokenKind::Junk) { - println!("{:?} {}", tok, &file[tok.span.as_range()]); + for tok in tokenize(&file).filter(|tok| tok.kind != LTokenKind::Whitespace) { + println!("{:?}", tok); } let mut parse = AsmParser::from(file.as_str()); From 0a9f55185a1b0702cc8be7fabdab2c51278fc67b Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:18:01 +1000 Subject: [PATCH 05/17] Progress --- src/ops.rs | 4 ++-- src/parser.rs | 34 ++++++++++++++++++++++++++++++---- src/symbol.rs | 6 ++++++ 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/src/ops.rs b/src/ops.rs index 9cf5eae..fd2df66 100644 --- a/src/ops.rs +++ b/src/ops.rs @@ -1,5 +1,5 @@ use crate::{ - lexer::Token, + lexer::LToken, symbol::{ByteOffs, Flag, Label, Register}, }; @@ -75,7 +75,7 @@ pub enum Op { pc_offset9: u16, }, Dir { - args: Option<Vec<Token>>, + args: Option<Vec<LToken>>, }, } diff --git a/src/parser.rs b/src/parser.rs index b949a9e..846d6b3 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -2,7 +2,7 @@ use std::error::Error; use miette::{miette, Result}; -use crate::lexer::{cursor::Cursor, Token, TokenKind}; +use crate::lexer::{cursor::Cursor, LToken, LTokenKind}; /// Transforms token stream into 'AST' pub struct AsmParser<'source> { @@ -24,13 +24,39 @@ impl<'a> From<&'a str> for AsmParser<'a> { impl<'source> AsmParser<'source> { pub fn parse(&mut self) -> Result<()> { // First, check that there is an .orig directive with an appropriate value. - let orig = self.expect(TokenKind::Direc)?; - let addr = self.expect(TokenKind::Lit(crate::lexer::LiteralKind::Hex)); + // Should emit error with a label to the first line stating "Expected memory init" + // Should be in a function that is also used to init the memory - the question is + // whether it should remain as a full directive or as a value that gets emitted afterwards. + let orig = self.expect(LTokenKind::Direc)?; + // Need ability to expect an enum without specifying a subcase (maybe ()?) + let addr = self.expect(LTokenKind::Lit(crate::lexer::LiteralKind::Hex)); + + // Following this, the structure is always: + // [label] + // -> <inst> [args] + // OR + // <label> + // -> <direc> [args] + // OR + // [label] + // ->* <direc> <args> + // OR + // <trap> [arg] + // or: (sometimes opt label) num directives (opt argument) + // so should generally build to this structure. This means, however, that the complexity + // is not super high as there are really only two medium complexity subcases to parse. 
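+ // Roughly: line ::= [label] (<instr> | <direc> | <trap>) [operands] - a sketch of the cases above, not a final grammar.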
+ // + // TODO: Split into LexToken and Token, to simplify the lexer and have a postprocessing + // step that can then put it into a Token format that is then easily transformed into + // the 'AST'. + // + // In order to do this, there needs to be peeking functionality on the token stream so + // that it can e.g. see if there is a label present at the start of a line. Ok(()) } - pub fn expect(&mut self, kind: TokenKind) -> Result<Token> { + pub fn expect(&mut self, kind: LTokenKind) -> Result<LToken> { let tok = self.cur.advance_token(); if tok.kind == kind { return Ok(tok); } diff --git a/src/symbol.rs b/src/symbol.rs index e49dac7..71b987b 100644 --- a/src/symbol.rs +++ b/src/symbol.rs @@ -1,3 +1,9 @@ +use fxhash::FxBuildHasher; +use indexmap::IndexMap; + +// Symbol table of symbol -> memory address (line number) +type FxMap<K, V> = IndexMap<K, V, FxBuildHasher>; + /// Represents the CPU registers. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] pub enum Register { From 82a72d1dc070cb290fd340408e80607aa494dd42 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Thu, 15 Aug 2024 16:05:10 +1000 Subject: [PATCH 06/17] Add indexmap dependency --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 294e1e8..c0d0490 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ colored = "2.1.0" regex = "1.10.6" lazy_static = "1.5.0" miette = { version = "7.2.0", features = ["fancy"] } -indexmap = "2.4.0" +indexmap = { version = "2.4.0", features = ["std"] } fxhash = "0.2.1" [dev-dependencies] assert_cmd = "2.0.14" From 3018c48d5850f22667c435f8f629c04e873b6e63 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Thu, 15 Aug 2024 16:05:47 +1000 Subject: [PATCH 07/17] Progress on token conversion --- src/lexer/cursor.rs | 4 ++-- src/lexer/mod.rs | 15 ++++----------- src/main.rs | 1 - src/ops.rs | 6 ------ src/parser.rs | 44 +++++++++++++++++++++++++++++++------- src/span.rs | 22 ---------------------- src/symbol.rs | 30 ++++++++++++++++++++++++++++++ 7 files changed, 73 insertions(+), 49 deletions(-) delete mode 100644 src/span.rs diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs index 7a56ee4..885d31c 100644 --- a/src/lexer/cursor.rs +++ b/src/lexer/cursor.rs @@ -1,6 +1,6 @@ //! Taken from the lexer in https://github.com/rozukke/mimi -// Heavily inspired and referenced from `rustc_lexer` and adapted to suit the project. -// See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html +//! Heavily inspired and referenced from `rustc_lexer` and adapted to suit the project. +//! See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 36d1ecc..231e219 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -59,20 +59,13 @@ pub(crate) fn is_whitespace(c: char) -> bool { matches!(c, ' ' | '\n' | '\t' | '\r' | ',') } +/// Test if a character is considered an LC3 identifier character. pub(crate) fn is_id(c: char) -> bool { // Non-prefixed numerical literals are considered identifiers. // This is because line numbers can be used as labels. 
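+ // (e.g. a bare `42` lexes as an identifier here; only `#42` or `x42` are numeric literals)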
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') } -pub(crate) fn is_num(c: char) -> bool { - matches!(c, '0'..='9') -} - -pub(crate) fn is_hex(c: char) -> bool { - matches!(c, 'a'..='f' | 'A'..='F' | '0'..='9') -} - impl Cursor<'_> { pub fn advance_token(&mut self) -> LToken { let first_char = match self.bump() { @@ -90,12 +83,12 @@ impl Cursor<'_> { } // Hex literals 'x' | 'X' => { - self.take_while(is_hex); + self.take_while(|c| char::is_ascii_hexdigit(&c)); LTokenKind::Lit(LiteralKind::Hex) } '0' => match self.first() { 'x' | 'X' => { - self.take_while(is_hex); + self.take_while(|c| char::is_ascii_hexdigit(&c)); LTokenKind::Lit(LiteralKind::Hex) } _ => { @@ -113,7 +106,7 @@ impl Cursor<'_> { if self.first() == '-' { self.bump(); } - self.take_while(is_num); + self.take_while(|c| char::is_ascii_digit(&c)); LTokenKind::Lit(LiteralKind::Dec) } // Directive diff --git a/src/main.rs b/src/main.rs index 3787be5..6284a30 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,7 +12,6 @@ mod lexer; mod ops; mod parser; mod runtime; -mod span; mod symbol; /// Lace is a complete & convenient assembler toolchain for the LC3 assembly language. diff --git a/src/ops.rs b/src/ops.rs index fd2df66..6baa6d7 100644 --- a/src/ops.rs +++ b/src/ops.rs @@ -3,12 +3,6 @@ use crate::{ symbol::{ByteOffs, Flag, Label, Register}, }; -pub struct Stmt { - line: u32, - label: Label, - op: Op, -} - /// Basically the entire 'AST' when it comes to LC3. /// TODO: Convert to labels instead of offsets at this stage. #[allow(clippy::upper_case_acronyms)] diff --git a/src/parser.rs b/src/parser.rs index 846d6b3..772ad28 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,27 +1,57 @@ -use std::error::Error; +use std::{error::Error, io::Cursor}; use miette::{miette, Result}; -use crate::lexer::{cursor::Cursor, LToken, LTokenKind}; +use crate::{ + lexer::{tokenize, LToken, LTokenKind, LiteralKind}, + symbol::{DirKind, InstrKind, Register, Span, Symbol, TrapKind}, +}; + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub struct Token { + kind: TokenKind, + span: Span, +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum TokenKind { + /// `r0-r7 | R0-R7` + Reg(Register), + /// `LOOP_START`, `123`, `coolname` + Label(Symbol), + /// `.orig`, `.Stringz`, `.BLKW` + Dir(DirKind), + /// `PUTS`, `Trap`, `putc` + Trap(TrapKind), + /// `"hi\n"`, `0x3AB5F`, `#-1` + Lit(LiteralKind), + /// `add`, `JMP`, `Ret` + Inst(InstrKind), +} + +pub fn proc_tokens<'a>(src: &'a str) -> Vec<Token> { + todo!() +} /// Transforms token stream into 'AST' -pub struct AsmParser<'source> { +pub struct AsmParser<'a> { /// Reference to the source file - src: &'source str, + src: &'a Vec<Token>, /// Used to parse tokens - cur: Cursor<'source>, + cur: Cursor<'a>, } impl<'a> From<&'a str> for AsmParser<'a> { fn from(value: &'a str) -> Self { + let toks: Vec<LToken> = tokenize(value).collect(); AsmParser { - src: value, + src: toks, cur: Cursor::new(value), } } } -impl<'source> AsmParser<'source> { +impl<'a> AsmParser<'a> { pub fn parse(&mut self) -> Result<()> { // First, check that there is an .orig directive with an appropriate value. // Should emit error with a label to the first line stating "Expected memory init" diff --git a/src/span.rs b/src/span.rs deleted file mode 100644 index e821c2c..0000000 --- a/src/span.rs +++ /dev/null @@ -1,22 +0,0 @@ -/// Position relative to start of source. -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] -pub struct Idx(pub u32); - -/// Holds a view into a source. 
-#[derive(Clone, Copy, PartialEq, Eq, Default, Hash, Debug)] -pub struct Span { - start: Idx, - len: u16, -} - -impl Span { - pub fn new(start: Idx, len: u16) -> Self { - Span { start, len } - } - - pub fn as_range(&self) -> std::ops::Range<usize> { - let start = self.start.0 as usize; - let end = start + self.len as usize; - start..end - } -} diff --git a/src/symbol.rs b/src/symbol.rs index 71b987b..d4956f0 100644 --- a/src/symbol.rs +++ b/src/symbol.rs @@ -4,6 +4,21 @@ use indexmap::IndexMap; // Symbol table of symbol -> memory address (line number) type FxMap<K, V> = IndexMap<K, V, FxBuildHasher>; +thread_local! { + static SYMBOL_TABLE: FxMap<String, u16> = IndexMap::with_hasher(FxBuildHasher::default()); +} + +/// Reference to symbol table index +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct Symbol(u16); + +/// Location within source +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub struct Span { + offs: ByteOffs, + len: usize, +} + /// Represents the CPU registers. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] pub enum Register { @@ -37,6 +52,21 @@ pub enum Flag { Nzp, } +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum InstrKind { + Add, +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum TrapKind { + Trap(u16), +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum DirKind { + Orig, +} + /// Newtype representing an address inside the LC3 memory. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] pub struct Addr(u16); From 67ca7536d91781c914a9001ea4b76481537156cb Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Sat, 17 Aug 2024 03:03:10 +1000 Subject: [PATCH 08/17] Add initial parser function spec --- src/parser.rs | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 772ad28..ef64a30 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,9 +1,9 @@ -use std::{error::Error, io::Cursor}; +use std::error::Error; use miette::{miette, Result}; use crate::{ - lexer::{tokenize, LToken, LTokenKind, LiteralKind}, + lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind}, symbol::{DirKind, InstrKind, Register, Span, Symbol, TrapKind}, }; @@ -29,24 +29,33 @@ pub enum TokenKind { Inst(InstrKind), } -pub fn proc_tokens<'a>(src: &'a str) -> Vec<Token> { +pub fn proc_tokens(src: &str) -> Vec<Token> { + // Get reference to global symbol table + // Iterate through, +1 to symbol count per inst + // +len(str) for every string literal + // +number of lines for BLKW (need to process cringe inconsistent literals) + // Also need to do matching to process register and instruction tokens into the correct contents + let toks: Vec<LToken> = tokenize(src).collect(); todo!() } /// Transforms token stream into 'AST' pub struct AsmParser<'a> { /// Reference to the source file - src: &'a Vec<Token>, + src: &'a str, + /// List of processed tokens + tok: Vec<Token>, /// Used to parse tokens cur: Cursor<'a>, } impl<'a> From<&'a str> for AsmParser<'a> { - fn from(value: &'a str) -> Self { - let toks: Vec<LToken> = tokenize(value).collect(); + fn from(src: &'a str) -> Self { + let tok: Vec<Token> = proc_tokens(src); AsmParser { - src: toks, - cur: Cursor::new(value), + src, + tok, + cur: Cursor::new(src), } } } From 596f9ff5ccdd76989a20ad33dbd3866ae4b35861 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Sat, 17 Aug 2024 03:03:27 +1000 Subject: [PATCH 09/17] Add 
function to look ahead n times --- src/lexer/cursor.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs index 885d31c..a7589ce 100644 --- a/src/lexer/cursor.rs +++ b/src/lexer/cursor.rs @@ -1,4 +1,3 @@ -//! Taken from the lexer in https://github.com/rozukke/mimi //! Heavily inspired and referenced from `rustc_lexer` and adapted to suit the project. //! See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html @@ -58,4 +57,8 @@ impl<'a> Cursor<'a> { self.bump(); } } + + pub(crate) fn take_n(&self, n: usize) -> String { + self.chars.clone().take(n).collect() + } } From 5e2d469257a0d537bedceced2d0ec51473489d2d Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Sat, 17 Aug 2024 03:03:41 +1000 Subject: [PATCH 10/17] Check for .end --- src/lexer/mod.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 231e219..f38cc34 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -2,7 +2,6 @@ use lazy_static::lazy_static; use regex::Regex; use crate::lexer::cursor::Cursor; -use crate::span::{Idx, Span}; use crate::symbol::Register; pub mod cursor; @@ -111,8 +110,13 @@ impl Cursor<'_> { } // Directive '.' => { + let check = self.take_n(3).to_ascii_lowercase(); self.take_while(is_id); - LTokenKind::Direc + // Need to check for .end directive to avoid unnecessary parsing and errors + match (self.pos_in_token(), check.as_str()) { + (3, "end") => LTokenKind::Eof, + _ => LTokenKind::Direc, + } } // String literal // TODO: Allow for escaped characters and the terminated thing '"' => { From 22a5e7b8135919ef44f0dcc8212427138ead8b35 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Sat, 17 Aug 2024 17:52:04 +1000 Subject: [PATCH 11/17] Parser progress --- src/lexer/cursor.rs | 4 +++ src/parser.rs | 81 +++++++++++++++++++++++++++++++++++++++------ src/symbol.rs | 25 ++++++++++++-- 3 files changed, 97 insertions(+), 13 deletions(-) diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs index a7589ce..e07b37f 100644 --- a/src/lexer/cursor.rs +++ b/src/lexer/cursor.rs @@ -61,4 +61,8 @@ impl<'a> Cursor<'a> { pub(crate) fn take_n(&self, n: usize) -> String { self.chars.clone().take(n).collect() } + + pub(crate) fn remaining(&self) -> usize { + self.chars.as_str().len() + } } diff --git a/src/parser.rs b/src/parser.rs index ef64a30..f956740 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,12 +1,16 @@ -use std::error::Error; +use std::{borrow::BorrowMut, error::Error}; use miette::{miette, Result}; use crate::{ lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind}, - symbol::{DirKind, InstrKind, Register, Span, Symbol, TrapKind}, + symbol::{ + with_symbol_table, ByteOffs, DirKind, InstrKind, Register, Span, Symbol, TrapKind, + SYMBOL_TABLE, + }, }; +/// Token with full span info and proper types #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub struct Token { kind: TokenKind, span: Span, } @@ -29,14 +33,69 @@ pub enum TokenKind { Inst(InstrKind), } -pub fn proc_tokens(src: &str) -> Vec<Token> { - // Get reference to global symbol table - // Iterate through, +1 to symbol count per inst - // +len(str) for every string literal - // +number of lines for BLKW (need to process cringe inconsistent literals) - // Also need to do matching to process register and instruction tokens into the correct contents - let toks: Vec<LToken> = tokenize(src).collect(); - todo!() -} +/// Used to parse symbols and 
process exact instructions +pub struct StrParser<'a> { + src: &'a str, + cur: Cursor<'a>, + pos: usize, + line_num: usize, +} + +impl<'a> StrParser<'a> { + pub fn new(src: &'a str) -> Self { + StrParser { + src, + cur: Cursor::new(src), + pos: 0, + line_num: 1, + } + } + + fn get_next(&self, n: usize) -> &str { + &self.src[self.pos..=(self.pos + n)] + } + + pub fn proc_tokens(&mut self) -> Vec<Token> { + // Iterate through, +1 to symbol count per inst + // +len(str) for every string literal + // +number of lines for BLKW (need to process cringe inconsistent literals) + // Also need to do matching to process register and instruction tokens into the correct contents + let mut toks_final: Vec<Token> = Vec::new(); + let mut line_num = 1; + loop { + let tok = self.cur.advance_token(); + if let Some(tok_final) = match tok.kind { + // Add identifier to symbol table at with correct line number + LTokenKind::Ident => { + // Process possibility of it being a trap + todo!(); + // Add to symbol table as identifier + let idx = with_symbol_table(|sym| { + let tok_text = self.get_next(tok.len as usize); + sym.get_index_of(tok_text) + .unwrap_or(sym.insert_full(String::from(tok_text), line_num).0) + }); + Some(Token { + kind: TokenKind::Label(Symbol::from(idx)), + span: Span::new(ByteOffs(self.pos), tok.len as usize), + }) + } + // Create literal of correct value + LTokenKind::Lit(_) => todo!(), + // Match on directive, check next value for number of lines skipped + LTokenKind::Direc => todo!(), + // TODO: Add registers to lexer + LTokenKind::Reg => todo!(), + LTokenKind::Whitespace | LTokenKind::Comment => None, + // TODO: Should return list of errors eventually + LTokenKind::Unknown => todo!(), + LTokenKind::Eof => break, + } { + toks_final.push(tok_final); + } + } + toks_final + } } /// Transforms token stream into 'AST' @@ -51,7 +110,7 @@ pub struct AsmParser<'a> { impl<'a> From<&'a str> for AsmParser<'a> { fn from(src: &'a str) -> Self { - let tok: Vec<Token> = proc_tokens(src); + let tok: Vec<Token> = StrParser::new(src).proc_tokens(); AsmParser { src, tok, diff --git a/src/symbol.rs b/src/symbol.rs index d4956f0..8adf1b8 100644 --- a/src/symbol.rs +++ b/src/symbol.rs @@ -1,3 +1,5 @@ +use std::cell::RefCell; + use fxhash::FxBuildHasher; use indexmap::IndexMap; @@ -5,12 +7,25 @@ use indexmap::IndexMap; type FxMap<K, V> = IndexMap<K, V, FxBuildHasher>; thread_local! { - static SYMBOL_TABLE: FxMap<String, u16> = IndexMap::with_hasher(FxBuildHasher::default()); + pub static SYMBOL_TABLE: RefCell<FxMap<String, u16>> = RefCell::new(IndexMap::with_hasher(FxBuildHasher::default())); +} + +pub fn with_symbol_table<R, F>(f: F) -> R +where + F: FnOnce(&mut FxMap<String, u16>) -> R, +{ + SYMBOL_TABLE.with_borrow_mut(f) } /// Reference to symbol table index #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] -pub struct Symbol(u16); +pub struct Symbol(usize); + +impl From<usize> for Symbol { + fn from(value: usize) -> Self { + Symbol { 0: value } + } +} /// Location within source #[derive(Clone, Copy, PartialEq, Eq, Debug)] @@ -19,6 +34,12 @@ pub struct Span { len: usize, } +impl Span { + pub fn new(offs: ByteOffs, len: usize) -> Self { + Span { offs, len } + } +} + /// Represents the CPU registers. 
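+/// (R0-R7; LC3 encodes a register operand in 3 bits)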
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] pub enum Register { From b0dedfcf0357429749d2348a27073a8ae57994d7 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Mon, 19 Aug 2024 17:24:29 +1000 Subject: [PATCH 12/17] Parser progress --- src/lexer/cursor.rs | 1 + src/ops.rs | 4 +-- src/parser.rs | 82 ++++++++++++++++++++++++++++++++++++--------- src/symbol.rs | 28 +++++++++++++--- 4 files changed, 92 insertions(+), 23 deletions(-) diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs index e07b37f..db97b3e 100644 --- a/src/lexer/cursor.rs +++ b/src/lexer/cursor.rs @@ -3,6 +3,7 @@ use std::str::Chars; +#[derive(Clone)] /// Peekable iterator over a char sequence. pub struct Cursor<'a> { len_remaining: usize, diff --git a/src/ops.rs b/src/ops.rs index 6baa6d7..5ba4560 100644 --- a/src/ops.rs +++ b/src/ops.rs @@ -1,6 +1,6 @@ use crate::{ lexer::LToken, - symbol::{ByteOffs, Flag, Label, Register}, + symbol::{Flag, Label, LineOffs, Register}, }; /// Basically the entire 'AST' when it comes to LC3. @@ -22,7 +22,7 @@ pub enum Op { /// Branch based on flag by adding ByteOffs to PC (program counter) BR { cc: Flag, - pc_offset9: ByteOffs, + pc_offset9: LineOffs, }, /// Set PC to BR to perform a jump on the next cycle JMP { diff --git a/src/parser.rs b/src/parser.rs index f956740..f66ceb2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -5,8 +5,8 @@ use miette::{miette, Result}; use crate::{ lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind}, symbol::{ - with_symbol_table, ByteOffs, DirKind, InstrKind, Register, Span, Symbol, TrapKind, - SYMBOL_TABLE, + with_symbol_table, DirKind, DirectiveKind, InstrKind, LineOffs, Register, Span, SrcOffset, + Symbol, TrapKind, SYMBOL_TABLE, }, }; @@ -55,47 +55,97 @@ impl<'a> StrParser<'a> { &self.src[self.pos..=(self.pos + n)] } + fn peek_next(&self) -> LToken { + self.cur.clone().advance_token() + } + pub fn proc_tokens(&mut self) -> Vec<Token> { // Iterate through, +1 to symbol count per inst // +len(str) for every string literal // +number of lines for BLKW (need to process cringe inconsistent literals) // Also need to do matching to process register and instruction tokens into the correct contents let mut toks_final: Vec<Token> = Vec::new(); - let mut line_num = 1; loop { let tok = self.cur.advance_token(); if let Some(tok_final) = match tok.kind { + LTokenKind::Eof => break, // Add identifier to symbol table at with correct line number LTokenKind::Ident => { // Process possibility of it being a trap - todo!(); - // Add to symbol table as identifier - let idx = with_symbol_table(|sym| { - let tok_text = self.get_next(tok.len as usize); - sym.get_index_of(tok_text) - .unwrap_or(sym.insert_full(String::from(tok_text), line_num).0) - }); - Some(Token { - kind: TokenKind::Label(Symbol::from(idx)), - span: Span::new(ByteOffs(self.pos), tok.len as usize), - }) + if let Some(trap) = StrParser::trap(self.get_next(tok.len as usize)) { + self.line_num += 1; + Some(Token { + kind: TokenKind::Trap(trap), + span: Span::new(SrcOffset(self.pos), tok.len as usize), + }) + } else { + // Add to symbol table as identifier + let idx = with_symbol_table(|sym| { + let tok_text = self.get_next(tok.len as usize); + sym.get_index_of(tok_text).unwrap_or( + sym.insert_full(String::from(tok_text), self.line_num as u16) + .0, + ) + }); + Some(Token { + kind: TokenKind::Label(Symbol::from(idx)), + span: Span::new(SrcOffset(self.pos), tok.len as usize), + }) + } } // Create literal of correct value 
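+ // e.g. `x3000` or `#-5` should become typed values here (still unimplemented)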
LTokenKind::Lit(_) => todo!(), // Match on directive, check next value for number of lines skipped - LTokenKind::Direc => todo!(), + LTokenKind::Direc => { + if let Some(direc) = StrParser::direc(self.get_next(tok.len as usize)) { + Some(Token { + kind: TokenKind::Dir(direc), + span: Span::new(SrcOffset(self.pos), tok.len as usize), + }) + } else { + // TODO: Error handling in a list + todo!() + } + } // TODO: Add registers to lexer LTokenKind::Reg => todo!(), LTokenKind::Whitespace | LTokenKind::Comment => None, // TODO: Should return list of errors eventually LTokenKind::Unknown => todo!(), - LTokenKind::Eof => break, } { toks_final.push(tok_final); + self.pos += tok.len as usize; } } toks_final } + + fn trap(s: &str) -> Option<TrapKind> { + match s.to_ascii_lowercase().as_str() { + "getc" => Some(TrapKind::Getc), + "out" => Some(TrapKind::Out), + "puts" => Some(TrapKind::Puts), + "in" => Some(TrapKind::In), + "putsp" => Some(TrapKind::Putsp), + "halt" => Some(TrapKind::Halt), + "trap" => Some(TrapKind::Generic), + _ => None, + } + } + pub fn direc(s: &str) -> Option<DirectiveKind> { + match s.to_ascii_lowercase().as_str() { + ".alias" => Some(DirectiveKind::Alias), + ".macro" => Some(DirectiveKind::Macro), + ".orig" => Some(DirectiveKind::Orig), + ".end" => Some(DirectiveKind::End), + ".stringz" => Some(DirectiveKind::Stringz), + ".blkw" => Some(DirectiveKind::Blkw), + ".fill" => Some(DirectiveKind::Fill), + ".export" => Some(DirectiveKind::Export), + ".import" => Some(DirectiveKind::Import), + _ => None, + } + } } /// Transforms token stream into 'AST' diff --git a/src/symbol.rs b/src/symbol.rs index 8adf1b8..e81cbfd 100644 --- a/src/symbol.rs +++ b/src/symbol.rs @@ -30,12 +30,12 @@ impl From<usize> for Symbol { /// Location within source #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub struct Span { - offs: ByteOffs, + offs: SrcOffset, len: usize, } impl Span { - pub fn new(offs: ByteOffs, len: usize) -> Self { + pub fn new(offs: SrcOffset, len: usize) -> Self { Span { offs, len } } } @@ -80,12 +80,26 @@ pub enum InstrKind { #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum TrapKind { - Trap(u16), + Generic, + Halt, + Putsp, + In, + Puts, + Out, + Getc, } #[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum DirKind { +pub enum DirectiveKind { + Alias, + Macro, Orig, + End, + Stringz, + Blkw, + Fill, + Export, + Import, } /// Newtype representing an address inside the LC3 memory. @@ -94,9 +108,13 @@ pub struct Addr(u16); /// Newtype representing an offset from a particular address. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] -pub struct ByteOffs(u16); +pub struct LineOffs(u16); /// Label used to refer to specific memory addresses /// TODO: optimize later #[derive(Clone, PartialEq, Eq, Debug)] pub struct Label(String); + +/// Used to refer to offsets from the start of a source file. 
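+/// Stored as a byte index so that a `Span` can slice the source text directly.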
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct SrcOffset(pub usize); From ef80494238fc109488f6e94cd6c0ee7f93eb80b0 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:56:35 +1000 Subject: [PATCH 13/17] Add directive processing and registers to lexer --- src/lexer/mod.rs | 45 ++++++++++++++++++++++----------- src/parser.rs | 66 ++++++++++++++++++++++++++++++++---------------- src/symbol.rs | 2 +- 3 files changed, 75 insertions(+), 38 deletions(-) diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index f38cc34..7855d74 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -58,6 +58,11 @@ pub(crate) fn is_whitespace(c: char) -> bool { matches!(c, ' ' | '\n' | '\t' | '\r' | ',') } +pub(crate) fn is_reg_num(c: char) -> bool { + // Valid only between 0-7 + matches!(c, '0'..='7') +} + /// Test if a character is considered an LC3 identifier character. pub(crate) fn is_id(c: char) -> bool { // Non-prefixed numerical literals are considered identifiers. @@ -81,25 +86,25 @@ impl Cursor<'_> { LTokenKind::Whitespace } // Hex literals - 'x' | 'X' => { - self.take_while(|c| char::is_ascii_hexdigit(&c)); - LTokenKind::Lit(LiteralKind::Hex) - } + 'x' | 'X' => self.hex(), '0' => match self.first() { - 'x' | 'X' => { - self.take_while(|c| char::is_ascii_hexdigit(&c)); - LTokenKind::Lit(LiteralKind::Hex) - } - _ => { - self.take_while(is_id); - LTokenKind::Ident + 'x' | 'X' => self.hex(), + _ => self.ident(), + }, + 'r' | 'R' => match self.first() { + c if is_reg_num(c) => { + self.take_while(is_reg_num); + // Registers are 2 tokens long and followed by whitespace/comma + if self.pos_in_token() == 2 && is_whitespace(self.first()) { + LTokenKind::Reg + } else { + self.ident() + } } + _ => self.ident(), }, // Identifiers should be checked after everything else that overlaps. 
- c if is_id(c) => { - self.take_while(is_id); - LTokenKind::Ident - } + c if is_id(c) => self.ident(), // Decimal literal '#' => { if self.first() == '-' { @@ -130,4 +135,14 @@ impl Cursor<'_> { self.reset_pos(); res } + + fn ident(&mut self) -> LTokenKind { + self.take_while(|c| char::is_ascii_hexdigit(&c)); + LTokenKind::Ident + } + + fn hex(&mut self) -> LTokenKind { + self.take_while(|c| char::is_ascii_hexdigit(&c)); + LTokenKind::Lit(LiteralKind::Hex) + } } diff --git a/src/parser.rs b/src/parser.rs index f66ceb2..f87b4f1 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,12 +1,12 @@ -use std::{borrow::BorrowMut, error::Error}; +use std::{borrow::BorrowMut, error::Error, usize}; use miette::{miette, Result}; use crate::{ lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind}, symbol::{ - with_symbol_table, DirKind, DirectiveKind, InstrKind, LineOffs, Register, Span, SrcOffset, - Symbol, TrapKind, SYMBOL_TABLE, + with_symbol_table, DirKind, InstrKind, LineOffs, Register, Span, SrcOffset, Symbol, + TrapKind, SYMBOL_TABLE, }, }; @@ -51,19 +51,23 @@ impl<'a> StrParser<'a> { } } - fn get_next(&self, n: usize) -> &str { + fn get_next_chars(&self, n: usize) -> &str { &self.src[self.pos..=(self.pos + n)] } + // TODO: bad bad bad bad bad fn peek_next(&self) -> LToken { - self.cur.clone().advance_token() + let mut cur = self.cur.clone(); + let mut tok = cur.advance_token(); + if tok.kind != LTokenKind::Whitespace { + return tok; + } + cur.advance_token() } pub fn proc_tokens(&mut self) -> Vec<Token> { // Iterate through, +1 to symbol count per inst // +len(str) for every string literal - // +number of lines for BLKW (need to process cringe inconsistent literals) - // Also need to do matching to process register and instruction tokens into the correct contents let mut toks_final: Vec<Token> = Vec::new(); loop { let tok = self.cur.advance_token(); @@ -72,16 +76,17 @@ impl<'a> StrParser<'a> { // Add identifier to symbol table at with correct line number LTokenKind::Ident => { // Process possibility of it being a trap - if let Some(trap) = StrParser::trap(self.get_next(tok.len as usize)) { + if let Some(trap_kind) = StrParser::trap(self.get_next_chars(tok.len as usize)) + { self.line_num += 1; Some(Token { - kind: TokenKind::Trap(trap), + kind: TokenKind::Trap(trap_kind), span: Span::new(SrcOffset(self.pos), tok.len as usize), }) } else { // Add to symbol table as identifier let idx = with_symbol_table(|sym| { - let tok_text = self.get_next(tok.len as usize); + let tok_text = self.get_next_chars(tok.len as usize); sym.get_index_of(tok_text).unwrap_or( sym.insert_full(String::from(tok_text), self.line_num as u16) .0, @@ -97,9 +102,25 @@ impl<'a> StrParser<'a> { LTokenKind::Lit(_) => todo!(), // Match on directive, check next value for number of lines skipped LTokenKind::Direc => { - if let Some(direc) = StrParser::direc(self.get_next(tok.len as usize)) { + if let Some(dir_kind) = StrParser::direc(self.get_next_chars(tok.len as usize)) + { + self.line_num += match dir_kind { + // Blkw should increment line count by the following int literal + // TODO: Check if not int literal + DirKind::Blkw => self + .get_next_chars(self.peek_next().len as usize) + .parse::<usize>() + .unwrap(), + // Stringz should increment line count by the number of characters + // in the string literal + null byte + DirKind::Stringz => { + // TODO: Check if not str literal + (self.peek_next().len - 2) as usize + } + _ => 1, + }; Some(Token { - kind: TokenKind::Dir(direc), + kind: TokenKind::Dir(dir_kind), span: 
                        })
                    } else {
                        // TODO: Error handling in a list
                        todo!()
                    }
                }
                // TODO: Add registers to lexer
                LTokenKind::Reg => todo!(),
                LTokenKind::Whitespace | LTokenKind::Comment => None,
                // TODO: Should return list of errors eventually
                LTokenKind::Unknown => todo!(),
            } {
                toks_final.push(tok_final);
                self.pos += tok.len as usize;
            }
        }
        toks_final
    }
 
     fn trap(s: &str) -> Option<TrapKind> {
         match s.to_ascii_lowercase().as_str() {
@@ -132,17 +153,18 @@ impl<'a> StrParser<'a> {
             _ => None,
         }
     }
-    pub fn direc(s: &str) -> Option<DirectiveKind> {
+
+    pub fn direc(s: &str) -> Option<DirKind> {
         match s.to_ascii_lowercase().as_str() {
-            ".alias" => Some(DirectiveKind::Alias),
-            ".macro" => Some(DirectiveKind::Macro),
-            ".orig" => Some(DirectiveKind::Orig),
-            ".end" => Some(DirectiveKind::End),
-            ".stringz" => Some(DirectiveKind::Stringz),
-            ".blkw" => Some(DirectiveKind::Blkw),
-            ".fill" => Some(DirectiveKind::Fill),
-            ".export" => Some(DirectiveKind::Export),
-            ".import" => Some(DirectiveKind::Import),
+            ".alias" => Some(DirKind::Alias),
+            ".macro" => Some(DirKind::Macro),
+            ".orig" => Some(DirKind::Orig),
+            ".end" => Some(DirKind::End),
+            ".stringz" => Some(DirKind::Stringz),
+            ".blkw" => Some(DirKind::Blkw),
+            ".fill" => Some(DirKind::Fill),
+            ".export" => Some(DirKind::Export),
+            ".import" => Some(DirKind::Import),
             _ => None,
         }
     }
diff --git a/src/symbol.rs b/src/symbol.rs
index e81cbfd..29247dd 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -90,7 +90,7 @@ pub enum TrapKind {
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum DirectiveKind {
+pub enum DirKind {
     Alias,
     Macro,
     Orig,

From f08f81facffdfa4537da8ff618516f808511fa55 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 18:40:37 +1000
Subject: [PATCH 14/17] Redo lexer

---
 flake.nix           |   1 +
 scratch/test.asm    |   7 +
 src/lexer/cursor.rs |  21 ++-
 src/lexer/mod.rs    | 203 +++++++++++++++++--------
 src/main.rs         |  39 +++--
 src/ops.rs          |   4 +-
 src/parser.rs       | 353 ++++++++++++++++++++------------------------
 src/symbol.rs       |  49 +++++-
 8 files changed, 396 insertions(+), 281 deletions(-)
 create mode 100644 scratch/test.asm

diff --git a/flake.nix b/flake.nix
index 7da3ee9..74b1de3 100644
--- a/flake.nix
+++ b/flake.nix
@@ -55,6 +55,7 @@
         name = "rust-dev";
         buildInputs = with pkgs; [
           _rustToolchain
+          rust-analyzer
         ];
 
         RUST_SRC_PATH = "${_rustToolchain}/lib/rustlib/src/rust/library";
diff --git a/scratch/test.asm b/scratch/test.asm
new file mode 100644
index 0000000..2352f14
--- /dev/null
+++ b/scratch/test.asm
@@ -0,0 +1,7 @@
+ahhh .orig x3000
+add R0, R0, #2; holllly shittt no wayyy
+add R0, R1, #-32568; waow
+add r1, r3, r4 r5 0x40
+ret
+labelthing .stringz "woaw omg \"epic\""
+    .stringz "okayyy"
diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
index db97b3e..11a6dc4 100644
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@@ -1,14 +1,16 @@
 //! Heavily inspired and referenced from `rustc_lexer` and adapted to suit the project.
 //! See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html
 
-use std::str::Chars;
+use std::{ops::Range, str::Chars};
 
 #[derive(Clone)]
 /// Peekable iterator over a char sequence.
 pub struct Cursor<'a> {
     len_remaining: usize,
+    orig_size: usize,
     /// Iterator over chars in a &str
     chars: Chars<'a>,
+    input: &'a str,
 }
 
 pub(crate) const NULL_CHAR: char = '\0';
 
 impl<'a> Cursor<'a> {
     pub fn new(input: &'a str) -> Cursor<'a> {
         Cursor {
             len_remaining: input.len(),
+            orig_size: input.len(),
             chars: input.chars(),
+            input,
         }
     }
 
@@ -41,10 +45,9 @@ impl<'a> Cursor<'a> {
         Some(c)
     }
 
-    /// Return consumed tokens
-    /// Basic counter that is reset after each token.
-    pub(crate) fn pos_in_token(&self) -> u32 {
-        (self.len_remaining - self.chars.as_str().len()) as u32
+    /// Return the number of chars consumed for the current token
+    pub(crate) fn pos_in_token(&self) -> usize {
+        self.len_remaining - self.chars.as_str().len()
     }
 
     /// Resets the number of consumed chars
@@ -66,4 +69,12 @@ impl<'a> Cursor<'a> {
     pub(crate) fn remaining(&self) -> usize {
         self.chars.as_str().len()
     }
+
+    pub(crate) fn abs_pos(&self) -> usize {
+        self.orig_size - self.len_remaining + self.pos_in_token()
+    }
+
+    pub(crate) fn get_range(&self, range: Range<usize>) -> &str {
+        &self.input[range]
+    }
 }
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 7855d74..5f8ac77 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -1,53 +1,62 @@
-use lazy_static::lazy_static;
-use regex::Regex;
+use std::str::FromStr;
+
+use miette::{Result, bail, miette, LabeledSpan, Severity};
 
 use crate::lexer::cursor::Cursor;
-use crate::symbol::Register;
+use crate::symbol::{DirKind, InstrKind, Register, Span, SrcOffset, TrapKind};
 
 pub mod cursor;
 
-/// A 'light' token that only carries basic and easily derivable info
+/// A 'light' token that carries basic info and span
 #[derive(Debug)]
-pub struct LToken {
-    pub kind: LTokenKind,
-    pub len: u32,
+pub struct Token {
+    pub kind: TokenKind,
+    pub span: Span,
 }
 
-impl LToken {
-    pub fn new(kind: LTokenKind, len: u32) -> Self {
-        LToken { kind, len }
+impl Token {
+    pub fn new(kind: TokenKind, span: Span) -> Self {
+        Token { kind, span }
     }
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum LiteralKind {
-    Hex,
-    Dec,
-    Str { terminated: bool },
+    Hex(u16),
+    Dec(i16),
+    Str,
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum LTokenKind {
-    Ident,
+pub enum TokenKind {
+    Label,
+    Instr(InstrKind),
+    Trap(TrapKind),
     Lit(LiteralKind),
-    Comment,
-    Direc,
-    Reg,
+    Dir(DirKind),
+    Reg(Register),
     /// Also includes commas
     Whitespace,
     Unknown,
+    Comment,
     Eof,
 }
 
 /// Not actually used in parsing, more for debug purposes.
-pub fn tokenize(input: &str) -> impl Iterator<Item = LToken> + '_ {
+pub fn tokenize(input: &str) -> impl Iterator<Item = Result<Token>> + '_ {
     let mut cursor = Cursor::new(input);
     std::iter::from_fn(move || {
-        let token = cursor.advance_token();
-        if token.kind != LTokenKind::Eof {
-            Some(token)
-        } else {
-            None
+        loop {
+            let token = cursor.advance_token();
+            if let Ok(inner) = &token {
+                if inner.kind == TokenKind::Whitespace {
+                    continue;
+                }
+                if inner.kind == TokenKind::Eof {
+                    return None;
+                }
+            }
+            return Some(token);
         }
     })
 }
@@ -71,24 +80,28 @@ pub(crate) fn is_id(c: char) -> bool {
 }
 
 impl Cursor<'_> {
-    pub fn advance_token(&mut self) -> LToken {
+    pub fn advance_token(&mut self) -> Result<Token> {
+        let start_pos = self.abs_pos();
         let first_char = match self.bump() {
             Some(c) => c,
-            None => return LToken::new(LTokenKind::Eof, 0),
+            None => return Ok(Token::new(TokenKind::Eof, Span::dummy())),
         };
         let token_kind = match first_char {
             ';' => {
                 self.take_while(|c| c != '\n');
-                LTokenKind::Comment
+                TokenKind::Comment
             }
             c if is_whitespace(c) => {
                 self.take_while(is_whitespace);
-                LTokenKind::Whitespace
+                TokenKind::Whitespace
             }
             // Hex literals
-            'x' | 'X' => self.hex(),
+            'x' | 'X' => self.hex()?,
             '0' => match self.first() {
-                'x' | 'X' => self.hex(),
+                'x' | 'X' => {
+                    self.bump();
+                    self.hex()?
+                },
                 _ => self.ident(),
             },
             'r' | 'R' => match self.first() {
                 c if is_reg_num(c) => {
                     self.take_while(is_reg_num);
                     // Registers are 2 chars long and followed by whitespace/comma
                     if self.pos_in_token() == 2 && is_whitespace(self.first()) {
-                        LTokenKind::Reg
+                        // Unwrap is safe as c is always valid.
+                        TokenKind::Reg(Register::from_str(&c.to_string()).unwrap())
                     } else {
                         self.ident()
                     }
@@ -106,43 +120,110 @@ impl Cursor<'_> {
             // Identifiers should be checked after everything else that overlaps.
             c if is_id(c) => self.ident(),
             // Decimal literal
-            '#' => {
-                if self.first() == '-' {
-                    self.bump();
-                }
-                self.take_while(|c| char::is_ascii_digit(&c));
-                LTokenKind::Lit(LiteralKind::Dec)
-            }
+            '#' => self.dec()?,
             // Directive
-            '.' => {
-                let check = self.take_n(3).to_ascii_lowercase();
-                self.take_while(is_id);
-                // Need to check for .end directive to avoid unnecessary parsing and errors
-                match (self.pos_in_token(), check.as_str()) {
-                    (3, "end") => LTokenKind::Eof,
-                    _ => LTokenKind::Direc,
-                }
-            }
+            // '.' => {
+            //     let check = self.take_n(3).to_ascii_lowercase();
+            //     self.take_while(is_id);
+            //     // Need to check for .end directive to avoid unnecessary parsing and errors
+            //     match (self.pos_in_token(), check.as_str()) {
+            //         (3, "end") => TokenKind::Eof,
+            //         _ => TokenKind::Dir,
+            //     }
+            // }
             // String literal
-            // TODO: Allow for escaped characters and the terminated thing
-            '"' => {
-                self.take_while(|c| c != '"');
-                LTokenKind::Lit(LiteralKind::Str { terminated: true })
-            }
-            _ => LTokenKind::Unknown,
+            '"' => self.string_literal()?,
+            _ => {
+                self.take_while(|c| !is_whitespace(c));
+                TokenKind::Unknown
+            },
         };
-        let res = LToken::new(token_kind, self.pos_in_token());
+        let res = Token::new(token_kind, Span::new(SrcOffset(start_pos), self.pos_in_token()));
         self.reset_pos();
-        res
+        Ok(res)
     }
 
-    fn ident(&mut self) -> LTokenKind {
-        self.take_while(|c| char::is_ascii_hexdigit(&c));
-        LTokenKind::Ident
+    fn ident(&mut self) -> TokenKind {
+        self.take_while(is_id);
+        TokenKind::Label
     }
 
-    fn hex(&mut self) -> LTokenKind {
-        self.take_while(|c| char::is_ascii_hexdigit(&c));
-        LTokenKind::Lit(LiteralKind::Hex)
+    fn hex(&mut self) -> Result<TokenKind> {
+        let start = self.abs_pos();
+        let prefix = self.pos_in_token();
+        self.take_while(|c| !is_whitespace(c));
+        let str_val = self.get_range(start..self.abs_pos());
+        let value = match u16::from_str_radix(str_val, 16) {
+            Ok(value) => value,
+            Err(e) => {
+                return Err(miette!(
+                    severity = Severity::Error,
+                    code = "parse::hex_lit",
+                    help = "only use characters 0-9 and A-F.",
+                    labels = vec![LabeledSpan::at(start - prefix..self.abs_pos(), "incorrect literal")],
+                    "Encountered an invalid hex literal: {e}",
+                ))
+            }
+        };
+
+        Ok(TokenKind::Lit(LiteralKind::Hex(value)))
+    }
+
+    fn dec(&mut self) -> Result<TokenKind> {
+        let start = self.abs_pos();
+        let prefix = self.pos_in_token();
+        // Check for negative sign
+        let is_negative = if self.first() == '-' {
+            self.bump(); // Skip the negative sign
+            true
+        } else {
+            false
+        };
+        // Take the numeric part
+        self.take_while(|c| char::is_ascii_digit(&c));
+        let str_val = self.get_range(start..self.abs_pos());
+
+        // Parse the string as an i16 to handle negative values
+        let value = match i16::from_str_radix(&str_val, 10) {
+            Ok(value) => value,
+            Err(e) => {
+                bail!(
+                    severity = Severity::Error,
+                    code = "parse::dec_lit",
+                    help = "LC3 supports 16 bits of space, from -32,768 to 32,767.",
+                    labels = vec![LabeledSpan::at(start - prefix..self.abs_pos(), "incorrect literal")],
+                    "Encountered an invalid decimal literal: {e}",
+                )
+            }
+        };
+
+        Ok(TokenKind::Lit(LiteralKind::Dec(value)))
+    }
+
+    fn string_literal(&mut self) -> Result<TokenKind> {
+        let start = self.abs_pos() - 1;
+        let mut terminated = false;
+        while let Some(c) = self.bump() {
+            if c == '\n' { break; }
+            if c == '"' {
+                terminated = true;
+                break;
+            }
+            // Skip escaped
+            if c == '\\' {
+                self.bump();
+            }
+        }
+        if !terminated {
+            bail!(
+                severity = Severity::Error,
+                code = "parse::str_lit",
+                help = "hint: make sure to close string literals with a \" character.",
+                labels = vec![LabeledSpan::at(start..self.abs_pos(), "incorrect literal")],
+                "Encountered an unterminated string literal.",
+
+            )
+        }
+        Ok(TokenKind::Lit(LiteralKind::Str))
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index 6284a30..0f3c1cb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,12 +1,14 @@
 #![allow(unused)] // Remove later
 
 use std::fs;
+use std::ops::RangeBounds;
+use std::path::PathBuf;
 
 use clap::{Parser, Subcommand};
 use colored::Colorize;
-use lexer::{tokenize, LTokenKind};
-use miette::Result;
-use parser::AsmParser;
+use lexer::tokenize;
+use lexer::TokenKind;
+use miette::{Result, IntoDiagnostic};
 
 mod lexer;
 mod ops;
@@ -34,46 +36,50 @@ enum Command {
         #[arg(short, long)]
         os: bool,
         /// .asm file to run
-        name: String,
+        name: PathBuf,
     },
     /// Create binary `.lc3` file to run later or view compiled data
     Compile {
         /// `.asm` file to compile
-        name: String,
+        name: PathBuf,
         /// Destination to output .lc3 file
         dest: Option<String>,
     },
     /// Remove compilation artifacts for specified source
     Clean {
         /// `.asm` file to try remove artifacts for
-        name: String,
+        name: PathBuf,
     },
     /// Place a watch on a `.asm` file to receive constant assembler updates
     Watch {
         /// `.asm` file to watch
-        name: String,
+        name: PathBuf,
     },
     /// Format `.asm` file to adhere to recommended style
     Fmt {
         /// `.asm` file to format
-        name: String,
+        name: PathBuf,
     },
 }
 
-fn main() -> Result<()> {
+fn main() -> miette::Result<()> {
     let args = Args::parse();
 
     if let Some(command) = args.command {
         match command {
             Command::Run { os, name } => todo!(),
             Command::Compile { name, dest } => {
-                let file = fs::read_to_string(name).unwrap();
-                for tok in tokenize(&file).filter(|tok| tok.kind != LTokenKind::Whitespace) {
-                    println!("{:?}", tok);
+                let file = fs::read_to_string(name).into_diagnostic()?;
+                for tok in tokenize(&file) {
+                    let ok = match tok {
+                        Ok(ok) => ok,
+                        Err(err) => {
+                            return Err(err.with_source_code(file.clone()));
+                        }
+                    };
+                    println!("{:?}", ok);
+                    println!("{:?}", &file[ok.span.range()]);
                 }
-
-                let mut parse = AsmParser::from(file.as_str());
-                parse.parse()?;
                 Ok(())
             }
             Command::Clean { name } => todo!(),
@@ -88,7 +94,8 @@ fn main() -> miette::Result<()> {
     }
 }
 
-const LOGO: &str = r#"  ..
+const LOGO: &str = r#"
+  ..
  x .d88"
   5888R
   '888R         u           .          .u
diff --git a/src/ops.rs b/src/ops.rs
index 5ba4560..eefe6a7 100644
--- a/src/ops.rs
+++ b/src/ops.rs
@@ -1,5 +1,5 @@
 use crate::{
-    lexer::LToken,
+    lexer::Token,
     symbol::{Flag, Label, LineOffs, Register},
 };
 
@@ -69,7 +69,7 @@ pub enum Op {
         pc_offset9: u16,
     },
     Dir {
-        args: Option<Vec<LToken>>,
+        args: Option<Vec<Token>>,
     },
 }
diff --git a/src/parser.rs b/src/parser.rs
index f87b4f1..90b7c20 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -3,36 +3,13 @@ use std::{borrow::BorrowMut, error::Error, usize};
 use miette::{miette, Result};
 
 use crate::{
-    lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind},
+    lexer::{cursor::Cursor, Token, TokenKind},
     symbol::{
-        with_symbol_table, DirKind, InstrKind, LineOffs, Register, Span, SrcOffset, Symbol,
-        TrapKind, SYMBOL_TABLE,
+        with_symbol_table, DirKind, Span, SrcOffset, Symbol,
+        TrapKind,
     },
 };
 
-/// Token with full span info and proper types
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub struct Token {
-    kind: TokenKind,
-    span: Span,
-}
-
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum TokenKind {
-    /// `r0-r7 | R0-R7`
-    Reg(Register),
-    /// `LOOP_START`, `123`, `coolname`
-    Label(Symbol),
-    /// `.orig`, `.Stringz`, `.BLKW`
-    Dir(DirKind),
-    /// `PUTS`, `Trap`, `putc`
-    Trap(TrapKind),
-    /// `"hi\n"`, `0x3AB5F`, `#-1`
-    Lit(LiteralKind),
-    /// `add`, `JMP`, `Ret`
-    Inst(InstrKind),
-}
-
 /// Used to parse symbols and process exact instructions
 pub struct StrParser<'a> {
     src: &'a str,
@@ -56,90 +33,90 @@ impl<'a> StrParser<'a> {
         }
     }
 
     // TODO: bad bad bad bad bad
-    fn peek_next(&self) -> LToken {
-        let mut cur = self.cur.clone();
-        let mut tok = cur.advance_token();
-        if tok.kind != LTokenKind::Whitespace {
-            return tok;
-        }
-        cur.advance_token()
-    }
-
-    pub fn proc_tokens(&mut self) -> Vec<Token> {
-        // Iterate through, +1 to symbol count per inst
-        // +len(str) for every string literal
-        let mut toks_final: Vec<Token> = Vec::new();
-        loop {
-            let tok = self.cur.advance_token();
-            if let Some(tok_final) = match tok.kind {
-                LTokenKind::Eof => break,
-                // Add identifier to symbol table with correct line number
-                LTokenKind::Ident => {
-                    // Process possibility of it being a trap
-                    if let Some(trap_kind) = StrParser::trap(self.get_next_chars(tok.len as usize))
-                    {
-                        self.line_num += 1;
-                        Some(Token {
-                            kind: TokenKind::Trap(trap_kind),
-                            span: Span::new(SrcOffset(self.pos), tok.len as usize),
-                        })
-                    } else {
-                        // Add to symbol table as identifier
-                        let idx = with_symbol_table(|sym| {
-                            let tok_text = self.get_next_chars(tok.len as usize);
-                            sym.get_index_of(tok_text).unwrap_or(
-                                sym.insert_full(String::from(tok_text), self.line_num as u16)
-                                    .0,
-                            )
-                        });
-                        Some(Token {
-                            kind: TokenKind::Label(Symbol::from(idx)),
-                            span: Span::new(SrcOffset(self.pos), tok.len as usize),
-                        })
-                    }
-                }
-                // Create literal of correct value
-                LTokenKind::Lit(_) => todo!(),
-                // Match on directive, check next value for number of lines skipped
-                LTokenKind::Direc => {
-                    if let Some(dir_kind) = StrParser::direc(self.get_next_chars(tok.len as usize))
-                    {
-                        self.line_num += match dir_kind {
-                            // Blkw should increment line count by the following int literal
-                            // TODO: Check if not int literal
-                            DirKind::Blkw => self
-                                .get_next_chars(self.peek_next().len as usize)
-                                .parse::<usize>()
-                                .unwrap(),
-                            // Stringz should increment line count by the number of characters
-                            // in the string literal + null byte
-                            DirKind::Stringz => {
-                                // TODO: Check if not str literal
-                                (self.peek_next().len - 2) as usize
-                            }
-                            _ => 1,
-                        };
-                        Some(Token {
-                            kind: TokenKind::Dir(dir_kind),
-                            span: Span::new(SrcOffset(self.pos), tok.len as usize),
-                        })
-                    } else {
-                        // TODO: Error handling in a list
-                        todo!()
-                    }
-                }
-                // TODO: Add registers to lexer
-                LTokenKind::Reg => todo!(),
-                LTokenKind::Whitespace | LTokenKind::Comment => None,
-                // TODO: Should return list of errors eventually
-                LTokenKind::Unknown => todo!(),
-            } {
-                toks_final.push(tok_final);
-                self.pos += tok.len as usize;
-            }
-        }
-        toks_final
-    }
+    // fn peek_next(&self) -> Token {
+    //     let mut cur = self.cur.clone();
+    //     let mut tok = cur.advance_token();
+    //     if tok.kind != TokenKind::Whitespace {
+    //         return tok;
+    //     }
+    //     cur.advance_token()
+    // }
+
+    // pub fn proc_tokens(&mut self) -> Vec<Token> {
+    //     // Iterate through, +1 to symbol count per inst
+    //     // +len(str) for every string literal
+    //     let mut toks_final: Vec<Token> = Vec::new();
+    //     loop {
+    //         let tok = self.cur.advance_token();
+    //         if let Some(tok_final) = match tok.kind {
+    //             TokenKind::Eof => break,
+    //             // Add identifier to symbol table with correct line number
+    //             TokenKind::Ident => {
+    //                 // Process possibility of it being a trap
+    //                 if let Some(trap_kind) = StrParser::trap(self.get_next_chars(tok.len as usize))
+    //                 {
+    //                     self.line_num += 1;
+    //                     Some(Token {
+    //                         kind: TokenKind::Trap(trap_kind),
+    //                         span: Span::new(SrcOffset(self.pos), tok.len as usize),
+    //                     })
+    //                 } else {
+    //                     // Add to symbol table as identifier
+    //                     let idx = with_symbol_table(|sym| {
+    //                         let tok_text = self.get_next_chars(tok.len as usize);
+    //                         sym.get_index_of(tok_text).unwrap_or(
+    //                             sym.insert_full(String::from(tok_text), self.line_num as u16)
+    //                                 .0,
+    //                         )
+    //                     });
+    //                     Some(Token {
+    //                         kind: TokenKind::Label(Symbol::from(idx)),
+    //                         span: Span::new(SrcOffset(self.pos), tok.len as usize),
+    //                     })
+    //                 }
+    //             }
+    //             // Create literal of correct value
+    //             TokenKind::Lit(_) => todo!(),
+    //             // Match on directive, check next value for number of lines skipped
+    //             TokenKind::Direc => {
+    //                 if let Some(dir_kind) = StrParser::direc(self.get_next_chars(tok.len as usize))
+    //                 {
+    //                     self.line_num += match dir_kind {
+    //                         // Blkw should increment line count by the following int literal
+    //                         // TODO: Check if not int literal
+    //                         DirKind::Blkw => self
+    //                             .get_next_chars(self.peek_next().len as usize)
+    //                             .parse::<usize>()
+    //                             .unwrap(),
+    //                         // Stringz should increment line count by the number of characters
+    //                         // in the string literal + null byte
+    //                         DirKind::Stringz => {
+    //                             // TODO: Check if not str literal
+    //                             (self.peek_next().len - 2) as usize
+    //                         }
+    //                         _ => 1,
+    //                     };
+    //                     Some(Token {
+    //                         kind: TokenKind::Dir(dir_kind),
+    //                         span: Span::new(SrcOffset(self.pos), tok.len as usize),
+    //                     })
+    //                 } else {
+    //                     // TODO: Error handling in a list
+    //                     todo!()
+    //                 }
+    //             }
+    //             // TODO: Add registers to lexer
+    //             TokenKind::Reg => todo!(),
+    //             TokenKind::Whitespace | TokenKind::Comment => None,
+    //             // TODO: Should return list of errors eventually
+    //             TokenKind::Unknown => todo!(),
+    //         } {
+    //             toks_final.push(tok_final);
+    //             self.pos += tok.len as usize;
+    //         }
+    //     }
+    //     toks_final
+    // }
 
     fn trap(s: &str) -> Option<TrapKind> {
         match s.to_ascii_lowercase().as_str() {
@@ -156,93 +133,89 @@ impl<'a> StrParser<'a> {
             _ => None,
         }
     }
 
     pub fn direc(s: &str) -> Option<DirKind> {
         match s.to_ascii_lowercase().as_str() {
-            ".alias" => Some(DirKind::Alias),
-            ".macro" => Some(DirKind::Macro),
             ".orig" => Some(DirKind::Orig),
             ".end" => Some(DirKind::End),
             ".stringz" => Some(DirKind::Stringz),
             ".blkw" => Some(DirKind::Blkw),
             ".fill" => Some(DirKind::Fill),
-            ".export" => Some(DirKind::Export),
-            ".import" => Some(DirKind::Import),
             _ => None,
         }
     }
 }
 
-/// Transforms token stream into 'AST'
-pub struct AsmParser<'a> {
-    /// Reference to the source file
-    src: &'a str,
-    /// List of processed tokens
-    tok: Vec<Token>,
-    /// Used to parse tokens
-    cur: Cursor<'a>,
-}
-
-impl<'a> From<&'a str> for AsmParser<'a> {
-    fn from(src: &'a str) -> Self {
-        let tok: Vec<Token> = StrParser::new(src).proc_tokens();
-        AsmParser {
-            src,
-            tok,
-            cur: Cursor::new(src),
-        }
-    }
-}
-
-impl<'a> AsmParser<'a> {
-    pub fn parse(&mut self) -> Result<()> {
-        // First, check that there is an .orig directive with an appropriate value.
-        // Should emit error with a label to the first line stating "Expected memory init"
-        // Should be in a function that is also used to init the memory - the question is
-        // whether it should remain as a full directive or as a value that gets emitted afterwards.
-        let orig = self.expect(LTokenKind::Direc)?;
-        // Need ability to expect an enum without specifying a subcase (maybe ()?)
-        let addr = self.expect(LTokenKind::Lit(crate::lexer::LiteralKind::Hex));
-
-        // Following this, the structure is always:
-        // [label]
-        // -> <inst> [args]
-        // OR
-        // <label>
-        // -> <direc> [args]
-        // OR
-        // [label]
-        // ->* <direc> <args>
-        // OR
-        // <trap> [arg]
-        // or: (sometimes opt label) num directives (opt argument)
-        // so should generally build to this structure. This means, however, that the complexity
-        // is not suuper high as there are really only two medium complexity subcases to parse.
-        //
-        // TODO: Split into LexToken and Token, to simplify the lexer and have a postprocessing
-        // step that can then put it into a Token format that is then easily transformed into
-        // the 'AST'.
-        //
-        // In order to do this, there needs to be peeking functionality on the token stream so
-        // that it can e.g. see if there is a label present at the start of a line.
-
-        Ok(())
-    }
-
-    pub fn expect(&mut self, kind: LTokenKind) -> Result<LToken> {
-        let tok = self.cur.advance_token();
-        if tok.kind == kind {
-            return Ok(tok);
-        }
-        Err(miette!(
-            "ParseError: expected token of type {:?}, found {:?}",
-            kind,
-            tok
-        ))
-    }
-
-    pub fn parse_direc(&self) {
-        todo!()
-    }
-
-    pub fn parse_op(&self) {
-        todo!()
-    }
-}
+// /// Transforms token stream into 'AST'
+// pub struct AsmParser<'a> {
+//     /// Reference to the source file
+//     src: &'a str,
+//     /// List of processed tokens
+//     tok: Vec<Token>,
+//     /// Used to parse tokens
+//     cur: Cursor<'a>,
+// }

+// impl<'a> From<&'a str> for AsmParser<'a> {
+//     fn from(src: &'a str) -> Self {
+//         let tok: Vec<Token> = StrParser::new(src).proc_tokens();
+//         AsmParser {
+//             src,
+//             tok,
+//             cur: Cursor::new(src),
+//         }
+//     }
+// }

+// impl<'a> AsmParser<'a> {
+//     pub fn parse(&mut self) -> Result<()> {
+//         // First, check that there is an .orig directive with an appropriate value.
+//         // Should emit error with a label to the first line stating "Expected memory init"
+//         // Should be in a function that is also used to init the memory - the question is
+//         // whether it should remain as a full directive or as a value that gets emitted afterwards.
+//         let orig = self.expect(LTokenKind::Direc)?;
+//         // Need ability to expect an enum without specifying a subcase (maybe ()?)
+//         let addr = self.expect(LTokenKind::Lit(crate::lexer::LiteralKind::Hex));

+//         // Following this, the structure is always:
+//         // [label]
+//         // -> <inst> [args]
+//         // OR
+//         // <label>
+//         // -> <direc> [args]
+//         // OR
+//         // [label]
+//         // ->* <direc> <args>
+//         // OR
+//         // <trap> [arg]
+//         // or: (sometimes opt label) num directives (opt argument)
+//         // so should generally build to this structure. This means, however, that the complexity
+//         // is not suuper high as there are really only two medium complexity subcases to parse.
+//         //
+//         // TODO: Split into LexToken and Token, to simplify the lexer and have a postprocessing
+//         // step that can then put it into a Token format that is then easily transformed into
+//         // the 'AST'.
+//         //
+//         // In order to do this, there needs to be peeking functionality on the token stream so
+//         // that it can e.g. see if there is a label present at the start of a line.

+//         Ok(())
+//     }

+//     pub fn expect(&mut self, kind: LTokenKind) -> Result<LToken> {
+//         let tok = self.cur.advance_token();
+//         if tok.kind == kind {
+//             return Ok(tok);
+//         }
+//         Err(miette!(
+//             "ParseError: expected token of type {:?}, found {:?}",
+//             kind,
+//             tok
+//         ))
+//     }

+//     pub fn parse_direc(&self) {
+//         todo!()
+//     }

+//     pub fn parse_op(&self) {
+//         todo!()
+//     }
+// }
diff --git a/src/symbol.rs b/src/symbol.rs
index 29247dd..c9c97e3 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -1,4 +1,4 @@
-use std::cell::RefCell;
+use std::{cell::RefCell, ops::{Bound, Range, RangeBounds}, slice::SliceIndex, str::FromStr};
 
 use fxhash::FxBuildHasher;
 use indexmap::IndexMap;
@@ -31,19 +31,39 @@ impl From<usize> for Symbol {
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub struct Span {
     offs: SrcOffset,
+    end: SrcOffset,
     len: usize,
 }
 
 impl Span {
     pub fn new(offs: SrcOffset, len: usize) -> Self {
-        Span { offs, len }
+        Span { offs, len, end: SrcOffset(offs.0 + len) }
+    }
+
+    pub fn dummy() -> Self {
+        Span { offs: SrcOffset(0), len: 0, end: SrcOffset(0) }
+    }
+
+    pub fn range(&self) -> Range<usize> {
+        self.offs.0..self.end.0
     }
 }
 
+impl RangeBounds<usize> for Span {
+    fn start_bound(&self) -> Bound<&usize> {
+        Bound::Included(&self.offs.0)
+    }
+
+    fn end_bound(&self) -> Bound<&usize> {
+        Bound::Excluded(&self.end.0)
+    }
+}
+
 /// Represents the CPU registers.
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
 pub enum Register {
-    R0,
+    R0 = 0,
     R1,
     R2,
     R3,
@@ -54,6 +74,25 @@ pub enum Register {
     R7,
 }
 
+impl FromStr for Register {
+    type Err = ();
+
+    // Does not fail in this codebase.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "0" => Ok(Register::R0),
+            "1" => Ok(Register::R1),
+            "2" => Ok(Register::R2),
+            "3" => Ok(Register::R3),
+            "4" => Ok(Register::R4),
+            "5" => Ok(Register::R5),
+            "6" => Ok(Register::R6),
+            "7" => Ok(Register::R7),
+            _ => Err(()),
+        }
+    }
+}
+
 /// Set by a subset of instructions, representing whether the result was negative, zero, or positive.
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum Flag {
@@ -91,15 +130,11 @@ pub enum TrapKind {
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum DirKind {
-    Alias,
-    Macro,
     Orig,
     End,
     Stringz,
     Blkw,
     Fill,
-    Export,
-    Import,
 }
 
 /// Newtype representing an address inside the LC3 memory.
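A quick orientation on the API as it stands after this patch: `tokenize` yields `Result<Token>` with whitespace already skipped, lex errors carry labeled spans, and `Span::range()` indexes straight back into the source string. The following is a minimal consumer sketch, not part of the series — `dump_tokens` is a hypothetical helper, but `tokenize`, `Token`, `Span::range`, and miette's `with_source_code` are used exactly as defined above:

    use miette::Result;

    use crate::lexer::tokenize;

    /// Lex a source string and print each token kind next to its lexeme.
    fn dump_tokens(src: &str) -> Result<()> {
        for tok in tokenize(src) {
            // Lex errors carry labeled spans; attach the source so miette
            // can render the offending line when the error is reported.
            let tok = tok.map_err(|err| err.with_source_code(src.to_string()))?;
            // Span::range() is the half-open byte range of the lexeme in `src`.
            println!("{:?} {:?}", tok.kind, &src[tok.span.range()]);
        }
        Ok(())
    }

This is essentially what the `Compile` arm in src/main.rs does, minus the CLI plumbing.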
From f07f7338a4aeabfe5afa93119502c02ca909b427 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 20:33:47 +1000
Subject: [PATCH 15/17] Lexer instructions

---
 src/lexer/mod.rs | 104 ++++++++++++++++++++++++++++++++++++++++-----
 src/symbol.rs    |  15 +++++++
 2 files changed, 111 insertions(+), 8 deletions(-)

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 5f8ac77..b064803 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -3,7 +3,7 @@ use std::str::FromStr;
 use miette::{Result, bail, miette, LabeledSpan, Severity};
 
 use crate::lexer::cursor::Cursor;
-use crate::symbol::{DirKind, InstrKind, Register, Span, SrcOffset, TrapKind};
+use crate::symbol::{DirKind, Flag, InstrKind, Register, Span, SrcOffset, TrapKind};
 
 pub mod cursor;
 
@@ -143,11 +143,6 @@ impl Cursor<'_> {
         Ok(res)
     }
 
-    fn ident(&mut self) -> TokenKind {
-        self.take_while(is_id);
-        TokenKind::Label
-    }
-
     fn hex(&mut self) -> Result<TokenKind> {
         let start = self.abs_pos();
         let prefix = self.pos_in_token();
@@ -221,9 +216,102 @@ impl Cursor<'_> {
                 help = "hint: make sure to close string literals with a \" character.",
                 labels = vec![LabeledSpan::at(start..self.abs_pos(), "incorrect literal")],
                 "Encountered an unterminated string literal.",
-
-            )
+            )
     }
         Ok(TokenKind::Lit(LiteralKind::Str))
     }
+
+    fn directive(&mut self) -> Result<TokenKind> {
+        // Account for starting .
+        let start = self.abs_pos() - 1;
+        self.take_while(is_id);
+        let dir = self.get_range(start..self.abs_pos()).to_ascii_lowercase();
+    }
+
+    fn ident(&mut self) -> Result<TokenKind> {
+        let mut token_kind = TokenKind::Label;
+        let ident_start = self.abs_pos();
+        self.take_while(is_id);
+        let ident = self.get_range(ident_start..self.abs_pos()).to_ascii_lowercase();
+
+        // This actually needs to be in its own function :/
+        if ident.starts_with('.') {
+            token_kind = self.check_directive(&ident[1..]);
+            if token_kind == TokenKind::Unknown {
+                bail!(
+                    severity = Severity::Error,
+                    code = "parse::dir",
+                    help = "hint: check the list of available directives in the documentation.",
+                    labels = vec![LabeledSpan::at(ident_start..self.abs_pos(), "invalid directive")],
+                    "Encountered an invalid directive.",
+                )
+            }
+        } else {
+            token_kind = self.check_instruction(&ident);
+
+            // If not an instruction, check if it's a trap
+            if token_kind == TokenKind::Label {
+                token_kind = self.check_trap(&ident);
+            }
+        }
+
+        Ok(token_kind)
+    }
+
+    fn check_directive(&self, dir_str: &str) -> TokenKind {
+        match dir_str {
+            "orig" => TokenKind::Dir(DirKind::Orig),
+            "end" => TokenKind::Dir(DirKind::End),
+            "stringz" => TokenKind::Dir(DirKind::Stringz),
+            "blkw" => TokenKind::Dir(DirKind::Blkw),
+            "fill" => TokenKind::Dir(DirKind::Fill),
+            // Not a directive
+            _ => TokenKind::Unknown,
+        }
+    }
+
+    // Should learn how to write macros tbh :)
+    fn check_instruction(&self, ident: &str) -> TokenKind {
+        match ident {
+            "add" => TokenKind::Instr(InstrKind::Add),
+            "and" => TokenKind::Instr(InstrKind::And),
+            "br" => TokenKind::Instr(InstrKind::Br(Flag::Nzp)),
+            "brnzp" => TokenKind::Instr(InstrKind::Br(Flag::Nzp)),
+            "brnz" => TokenKind::Instr(InstrKind::Br(Flag::Nz)),
+            "brzp" => TokenKind::Instr(InstrKind::Br(Flag::Zp)),
+            "brnp" => TokenKind::Instr(InstrKind::Br(Flag::Np)),
+            "brn" => TokenKind::Instr(InstrKind::Br(Flag::N)),
+            "brz" => TokenKind::Instr(InstrKind::Br(Flag::Z)),
+            "brp" => TokenKind::Instr(InstrKind::Br(Flag::P)),
+            "jmp" => TokenKind::Instr(InstrKind::Jmp),
+            "jsr" => TokenKind::Instr(InstrKind::Jsr),
+            "jsrr" => TokenKind::Instr(InstrKind::Jsrr),
+            "ld" => TokenKind::Instr(InstrKind::Ld),
+            "ldi" => TokenKind::Instr(InstrKind::Ldi),
+            "ldr" => TokenKind::Instr(InstrKind::Ldr),
+            "lea" => TokenKind::Instr(InstrKind::Lea),
+            "not" => TokenKind::Instr(InstrKind::Not),
+            "ret" => TokenKind::Instr(InstrKind::Ret),
+            "rti" => TokenKind::Instr(InstrKind::Rti),
+            "st" => TokenKind::Instr(InstrKind::St),
+            "sti" => TokenKind::Instr(InstrKind::Sti),
+            "str" => TokenKind::Instr(InstrKind::Str),
+            // Not an instruction
+            _ => TokenKind::Label,
+        }
+    }
+
+    fn check_trap(&self, ident: &str) -> TokenKind {
+        match ident {
+            "getc" => TokenKind::Trap(TrapKind::Getc),
+            "out" => TokenKind::Trap(TrapKind::Out),
+            "puts" => TokenKind::Trap(TrapKind::Puts),
+            "in" => TokenKind::Trap(TrapKind::In),
+            "putsp" => TokenKind::Trap(TrapKind::Putsp),
+            "halt" => TokenKind::Trap(TrapKind::Halt),
+            "trap" => TokenKind::Trap(TrapKind::Generic),
+            // Not a trap
+            _ => TokenKind::Label,
+        }
+    }
 }
diff --git a/src/symbol.rs b/src/symbol.rs
index c9c97e3..14959af 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -115,6 +115,21 @@ pub enum Flag {
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum InstrKind {
     Add,
+    And,
+    Br(Flag),
+    Jmp,
+    Jsr,
+    Jsrr,
+    Ld,
+    Ldi,
+    Ldr,
+    Lea,
+    Not,
+    Ret,
+    Rti,
+    St,
+    Sti,
+    Str,
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]

From 8295432206a110179a702f3c0da6fb3ab70625f1 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Thu, 29 Aug 2024 11:23:22 +1000
Subject: [PATCH 16/17] Finish lexer

---
 src/lexer/mod.rs | 168 +++++++++++++++++++++++++++++++++++------------
 src/main.rs      |   4 +-
 2 files changed, 128 insertions(+), 44 deletions(-)

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index b064803..7f1d4f4 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -7,10 +7,12 @@ use crate::symbol::{DirKind, Flag, InstrKind, Register, Span, SrcOffset, TrapKin
 
 pub mod cursor;
 
-/// A 'light' token that carries basic info and span
+/// Carries all literal info alongside span location inside source code.
 #[derive(Debug)]
 pub struct Token {
+    /// Lexed token kind, with literal values contained as part of the enum.
     pub kind: TokenKind,
+    /// Span pointing at the location of the token in the source.
     pub span: Span,
 }
 
@@ -22,8 +24,11 @@ impl Token {
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum LiteralKind {
+    /// 0x3000, xFFFF, x123
     Hex(u16),
+    /// #-1, #32456
     Dec(i16),
+    /// "str with \" escaped chars"
     Str,
 }
 
@@ -87,10 +92,12 @@ impl Cursor<'_> {
             None => return Ok(Token::new(TokenKind::Eof, Span::dummy())),
         };
         let token_kind = match first_char {
+            // Comment
             ';' => {
                 self.take_while(|c| c != '\n');
                 TokenKind::Comment
             }
+            // Whitespace
             c if is_whitespace(c) => {
                 self.take_while(is_whitespace);
                 TokenKind::Whitespace
@@ -104,6 +111,7 @@ impl Cursor<'_> {
                 },
                 _ => self.ident(),
             },
+            // Register literal
             'r' | 'R' => match self.first() {
                 c if is_reg_num(c) => {
                     self.take_while(is_reg_num);
@@ -122,17 +130,10 @@ impl Cursor<'_> {
             // Decimal literal
             '#' => self.dec()?,
             // Directive
-            // '.' => {
-            //     let check = self.take_n(3).to_ascii_lowercase();
-            //     self.take_while(is_id);
-            //     // Need to check for .end directive to avoid unnecessary parsing and errors
-            //     match (self.pos_in_token(), check.as_str()) {
-            //         (3, "end") => TokenKind::Eof,
-            //         _ => TokenKind::Dir,
-            //     }
-            // }
+            '.' => self.dir()?,
             // String literal
-            '"' => self.string_literal()?,
+            '"' => self.str()?,
+            // Unknown starting characters
             _ => {
                 self.take_while(|c| !is_whitespace(c));
                 TokenKind::Unknown
             },
@@ -195,7 +196,7 @@ impl Cursor<'_> {
         Ok(TokenKind::Lit(LiteralKind::Dec(value)))
     }
 
-    fn string_literal(&mut self) -> Result<TokenKind> {
+    fn str(&mut self) -> Result<TokenKind> {
         let start = self.abs_pos() - 1;
         let mut terminated = false;
         while let Some(c) = self.bump() {
@@ -222,56 +223,55 @@ impl Cursor<'_> {
                 labels = vec![LabeledSpan::at(start..self.abs_pos(), "incorrect literal")],
                 "Encountered an unterminated string literal.",
-                )
+            )
         }
         Ok(TokenKind::Lit(LiteralKind::Str))
     }
 
-    fn directive(&mut self) -> Result<TokenKind> {
+    fn dir(&mut self) -> Result<TokenKind> {
         // Account for starting .
         let start = self.abs_pos() - 1;
         self.take_while(is_id);
         let dir = self.get_range(start..self.abs_pos()).to_ascii_lowercase();
+
+        if let Some(token_kind) = self.check_directive(&dir) {
+            Ok(token_kind)
+        } else {
+            bail!(
+                severity = Severity::Error,
+                code = "parse::dir",
+                help = "hint: check the list of available directives in the documentation.",
+                labels = vec![LabeledSpan::at(start..self.abs_pos(), "invalid directive")],
+                "Encountered an invalid directive.",
+            )
+        }
     }
 
-    fn ident(&mut self) -> Result<TokenKind> {
+    fn ident(&mut self) -> TokenKind {
         let mut token_kind = TokenKind::Label;
-        let ident_start = self.abs_pos();
+        let ident_start = self.abs_pos() - 1;
         self.take_while(is_id);
         let ident = self.get_range(ident_start..self.abs_pos()).to_ascii_lowercase();
 
-        // This actually needs to be in its own function :/
-        if ident.starts_with('.') {
-            token_kind = self.check_directive(&ident[1..]);
-            if token_kind == TokenKind::Unknown {
-                bail!(
-                    severity = Severity::Error,
-                    code = "parse::dir",
-                    help = "hint: check the list of available directives in the documentation.",
-                    labels = vec![LabeledSpan::at(ident_start..self.abs_pos(), "invalid directive")],
-                    "Encountered an invalid directive.",
-                )
-            }
-        } else {
-            token_kind = self.check_instruction(&ident);
-
-            // If not an instruction, check if it's a trap
-            if token_kind == TokenKind::Label {
-                token_kind = self.check_trap(&ident);
-            }
+        token_kind = self.check_instruction(&ident);
+        // If not an instruction, check if it's a trap
+        if token_kind == TokenKind::Label {
+            token_kind = self.check_trap(&ident);
         }
 
-        Ok(token_kind)
+        token_kind
     }
 
-    fn check_directive(&self, dir_str: &str) -> TokenKind {
+    /// Expects lowercase
+    fn check_directive(&self, dir_str: &str) -> Option<TokenKind> {
         match dir_str {
-            "orig" => TokenKind::Dir(DirKind::Orig),
-            "end" => TokenKind::Dir(DirKind::End),
-            "stringz" => TokenKind::Dir(DirKind::Stringz),
-            "blkw" => TokenKind::Dir(DirKind::Blkw),
-            "fill" => TokenKind::Dir(DirKind::Fill),
+            ".orig" => Some(TokenKind::Dir(DirKind::Orig)),
+            ".end" => Some(TokenKind::Dir(DirKind::End)),
+            ".stringz" => Some(TokenKind::Dir(DirKind::Stringz)),
+            ".blkw" => Some(TokenKind::Dir(DirKind::Blkw)),
+            ".fill" => Some(TokenKind::Dir(DirKind::Fill)),
             // Not a directive
-            _ => TokenKind::Unknown,
+            _ => None,
         }
     }
 
     // Should learn how to write macros tbh :)
+    /// Expects lowercase
     fn check_instruction(&self, ident: &str) -> TokenKind {
         match ident {
             "add" => TokenKind::Instr(InstrKind::Add),
@@ -301,6 +301,7 @@ impl Cursor<'_> {
         }
     }
 
+    /// Expects lowercase
     fn check_trap(&self, ident: &str) -> TokenKind {
         match ident {
             "getc" => TokenKind::Trap(TrapKind::Getc),
@@ -315,3 +316,86 @@ impl Cursor<'_> {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::lexer::{LiteralKind, TokenKind};
+
+    use super::cursor::Cursor;
+
+    // HEX LIT TESTS
+
+    #[test]
+    fn hex_correct_value() {
+        let mut lex = Cursor::new("0x1234");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Hex(0x1234)))
+    }
+
+    #[test]
+    fn hex_too_large() {
+        let mut lex = Cursor::new("xFFFF x10000");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Hex(0xFFFF)));
+        // Skip whitespace token
+        lex.advance_token().unwrap();
+        assert!(lex.advance_token().is_err());
+    }
+
+    #[test]
+    fn hex_leading_0() {
+        let mut lex = Cursor::new("0x3000");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Hex(0x3000)))
+    }
+
+    // DEC LIT TESTS
+
+    #[test]
+    fn dec_correct_value() {
+        let mut lex = Cursor::new("#32412");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Dec(32412)))
+    }
+
+    #[test]
+    fn dec_negative_value() {
+        let mut lex = Cursor::new("#-300");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Dec(-300)))
+    }
+
+    #[test]
+    fn dec_too_small() {
+        let mut lex = Cursor::new("#-32768 #-32769");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Dec(-32768)));
+        // Skip whitespace token
+        lex.advance_token().unwrap();
+        assert!(lex.advance_token().is_err());
+    }
+
+    #[test]
+    fn dec_too_large() {
+        let mut lex = Cursor::new("#32767 #32768");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Dec(32767)));
+        // Skip whitespace token
+        lex.advance_token().unwrap();
+        assert!(lex.advance_token().is_err());
+    }
+
+    // STR LIT TESTS
+
+    #[test]
+    fn str_unterminated() {
+        let mut lex = Cursor::new(r#""unterminated"#);
+        assert!(lex.advance_token().is_err())
+    }
+
+    #[test]
+    fn str_escaped() {
+        let mut lex = Cursor::new(r#""there is an escaped \" in this str\n""#);
+        assert!(lex.advance_token().is_ok())
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 0f3c1cb..3c4dfd0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -77,8 +77,8 @@ fn main() -> miette::Result<()> {
                             return Err(err.with_source_code(file.clone()));
                         }
                     };
-                    println!("{:?}", ok);
-                    println!("{:?}", &file[ok.span.range()]);
+                    print!("{:?} ", ok.kind);
+                    println!("{}", &file[ok.span.range()]);
                 }
                 Ok(())
             }

From f92b81fefb0e07b84bf7fb21009965327ea13927 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Thu, 29 Aug 2024 11:26:18 +1000
Subject: [PATCH 17/17] Remove accidental file

---
 scratch/test.asm | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 scratch/test.asm

diff --git a/scratch/test.asm b/scratch/test.asm
deleted file mode 100644
index 2352f14..0000000
--- a/scratch/test.asm
+++ /dev/null
@@ -1,7 +0,0 @@
-ahhh .orig x3000
-add R0, R0, #2; holllly shittt no wayyy
-add R0, R1, #-32568; waow
-add r1, r3, r4 r5 0x40
-ret
-labelthing .stringz "woaw omg \"epic\""
-    .stringz "okayyy"
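The test module in patch 16 pins down the literal paths; the register, directive, and instruction arms go through the same `advance_token` entry point and can be covered in the same style. A few follow-up cases that the lexer as finished here should already pass — sketches only, not part of the series, written against the patch-16 API with `Register` and `InstrKind` taken from src/symbol.rs:

    use crate::symbol::{InstrKind, Register};

    #[test]
    fn reg_followed_by_comma() {
        // Commas count as whitespace, so "R3," lexes as a register.
        let mut lex = Cursor::new("R3,");
        let res = lex.advance_token().unwrap();
        assert!(res.kind == TokenKind::Reg(Register::R3));
    }

    #[test]
    fn reg_prefix_label() {
        // "R3abc" fails the two-chars-then-whitespace register check
        // and falls back to being lexed as a label.
        let mut lex = Cursor::new("R3abc");
        let res = lex.advance_token().unwrap();
        assert!(res.kind == TokenKind::Label);
    }

    #[test]
    fn dir_invalid() {
        // Unknown directives are reported through the dir() bail path.
        let mut lex = Cursor::new(".notadirective");
        assert!(lex.advance_token().is_err());
    }

    #[test]
    fn instr_case_insensitive() {
        // Mnemonics are lowercased before matching, so mixed case still lexes.
        let mut lex = Cursor::new("ReT");
        let res = lex.advance_token().unwrap();
        assert!(res.kind == TokenKind::Instr(InstrKind::Ret));
    }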