From b67cff849cf6808037a8dac8a1f882be07cca6e7 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Tue, 13 Aug 2024 12:59:34 +1000
Subject: [PATCH 01/17] Add miette dep

---
 Cargo.lock | 240 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 Cargo.toml |   1 +
 2 files changed, 241 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 362b36e..ed76f0c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,21 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "addr2line"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
+dependencies = [
+ "gimli",
+]
+
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
 [[package]]
 name = "aho-corasick"
 version = "1.1.3"
@@ -75,6 +90,36 @@ dependencies = [
  "wait-timeout",
 ]
 
+[[package]]
+name = "backtrace"
+version = "0.3.73"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a"
+dependencies = [
+ "addr2line",
+ "cc",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+]
+
+[[package]]
+name = "backtrace-ext"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "537beee3be4a18fb023b570f80e3ae28003db9167a751266b259926e25539d50"
+dependencies = [
+ "backtrace",
+]
+
+[[package]]
+name = "bitflags"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
+
 [[package]]
 name = "bstr"
 version = "1.9.1"
@@ -86,6 +131,18 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "cc"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
 [[package]]
 name = "clap"
 version = "4.5.4"
@@ -154,6 +211,22 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
 
+[[package]]
+name = "errno"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "gimli"
+version = "0.29.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
+
 [[package]]
 name = "glob"
 version = "0.3.1"
@@ -166,6 +239,12 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
+[[package]]
+name = "is_ci"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7655c9839580ee829dfacba1d1278c2b7883e50a277ff7541299489d6bdfdc45"
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.0"
@@ -181,6 +260,7 @@ dependencies = [
  "colored",
  "glob",
  "lazy_static",
+ "miette",
  "regex",
 ]
 
@@ -196,12 +276,73 @@ version = "0.2.154"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346"
 
+[[package]]
+name = "linux-raw-sys"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
+
 [[package]]
 name = "memchr"
 version = "2.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
 
+[[package]]
+name = "miette"
+version = "7.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4edc8853320c2a0dab800fbda86253c8938f6ea88510dc92c5f1ed20e794afc1"
+dependencies = [
+ "backtrace",
+ "backtrace-ext",
+ "cfg-if",
+ "miette-derive",
+ "owo-colors",
+ "supports-color",
+ "supports-hyperlinks",
+ "supports-unicode",
+ "terminal_size",
+ "textwrap",
+ "thiserror",
+ "unicode-width",
+]
+
+[[package]]
+name = "miette-derive"
+version = "7.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dcf09caffaac8068c346b6df2a7fc27a177fd20b39421a39ce0a211bde679a6c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "miniz_oxide"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
+dependencies = [
+ "adler",
+]
+
+[[package]]
+name = "object"
+version = "0.36.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "owo-colors"
+version = "4.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "caff54706df99d2a78a5a4e3455ff45448d81ef1bb63c22cd14052ca0e993a3f"
+
 [[package]]
 name = "predicates"
 version = "3.1.0"
@@ -276,6 +417,25 @@ version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
 
+[[package]]
+name = "rustc-demangle"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
+
+[[package]]
+name = "rustix"
+version = "0.38.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "serde"
 version = "1.0.201"
@@ -296,12 +456,39 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "smawk"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c388c1b5e93756d0c740965c41e8822f866621d41acbdf6336a6a168f8840c"
+
 [[package]]
 name = "strsim"
 version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
+[[package]]
+name = "supports-color"
+version = "3.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9829b314621dfc575df4e409e79f9d6a66a3bd707ab73f23cb4aa3a854ac854f"
+dependencies = [
+ "is_ci",
+]
+
+[[package]]
+name = "supports-hyperlinks"
+version = "3.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c0a1e5168041f5f3ff68ff7d95dcb9c8749df29f6e7e89ada40dd4c9de404ee"
+
+[[package]]
+name = "supports-unicode"
+version = "3.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7401a30af6cb5818bb64852270bb722533397edcfc7344954a38f420819ece2"
+
 [[package]]
 name = "syn"
 version = "2.0.61"
@@ -313,18 +500,71 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "terminal_size"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
+dependencies = [
+ "rustix",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "termtree"
 version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
 
+[[package]]
+name = "textwrap"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9"
+dependencies = [
+ "smawk",
+ "unicode-linebreak",
+ "unicode-width",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.63"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.63"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
 
+[[package]]
+name = "unicode-linebreak"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f"
+
+[[package]]
+name = "unicode-width"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
+
 [[package]]
 name = "utf8parse"
 version = "0.2.1"
diff --git a/Cargo.toml b/Cargo.toml
index 1e8c232..4694274 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,7 @@ clap = { version = "4.5.4", features = ["derive"] }
 colored = "2.1.0"
 regex = "1.10.6"
 lazy_static = "1.5.0"
+miette = { version = "7.2.0", features = ["fancy"] }
 
 [dev-dependencies]
 assert_cmd = "2.0.14"
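
A minimal sketch of what the new dependency buys (illustrative code, not part of this series): with the "fancy" feature enabled, a main that returns miette::Result renders ad-hoc diagnostics as graphical reports.

    use miette::{miette, Result};

    fn check_orig(line: &str) -> Result<()> {
        if !line.trim_start().starts_with(".orig") {
            // miette! builds an ad-hoc diagnostic; the fancy handler renders it
            return Err(miette!("expected .orig directive, found {:?}", line));
        }
        Ok(())
    }

    fn main() -> Result<()> {
        check_orig(".orig x3000")
    }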

From ae1f126761535b8516ce9c4498f5fc47a565393e Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Tue, 13 Aug 2024 12:59:52 +1000
Subject: [PATCH 02/17] Parser progress

---
 src/lexer/cursor.rs |  5 +++
 src/lexer/mod.rs    | 77 ++++++++++++++++++++++++++++++++++++++++++++-
 src/main.rs         |  8 ++++-
 src/parser.rs       | 33 +++++++++++++------
 4 files changed, 112 insertions(+), 11 deletions(-)

diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
index f44c614..83166f0 100644
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@@ -20,6 +20,11 @@ impl<'a> Cursor<'a> {
         }
     }
 
+    pub fn get_next(&self, len: usize) -> &'a str {
+        println!("{}", &self.chars[self.curr_pt..(self.curr_pt + len)]);
+        &self.chars[self.curr_pt..(self.curr_pt + len)]
+    }
+
     /// File is finished parsing
     pub fn is_eof(&self) -> bool {
         self.len_remaining == 0
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 6604850..b05e007 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -20,9 +20,56 @@ pub enum LiteralKind {
     Str,
 }
 
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum InstrKind {
+    Add,
+    And,
+    Branch,
+    Jump,
+    JumpSub,
+    JumpSubReg,
+    Load,
+    LoadInd,
+    LoadReg,
+    LoadAddr,
+    Not,
+    Return,
+    Interrupt,
+    Store,
+    StoreInd,
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum DirecKind {
+    Orig,
+    Stringz,
+    Blkw,
+    Fill,
+    Alias,
+    Macro,
+    End,
+    Export,
+    Import,
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum TrapKind {
+    /// Get a character from standard input
+    Getc,
+    /// Output a single character
+    Out,
+    /// Print string
+    Puts,
+    In,
+    Putsp,
+    Halt,
+    Trap,
+}
+
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum TokenKind {
     Ident,
+    Instr(InstrKind),
     Lit(LiteralKind),
     Comment,
     Direc,
@@ -79,8 +126,36 @@ impl Cursor<'_> {
 
         for (kind, re) in PATTERNS.iter() {
             if let Some(tok) = re.find(self.at_curr_pt()) {
+                // Parse into precise definition
+                let mut kind = *kind;
+                kind = match kind {
+                    TokenKind::Ident => match self.get_next(tok.len()).to_lowercase().as_str() {
+                        "add" => TokenKind::Instr(InstrKind::Add),
+                        "and" => TokenKind::Instr(InstrKind::And),
+                        "br" | "brn" | "brz" | "brp" | "brnz" | "brnzp" | "brnp" | "brzp" => {
+                            TokenKind::Instr(InstrKind::Branch)
+                        }
+                        "jmp" => TokenKind::Instr(InstrKind::Jump),
+                        "jsr" => TokenKind::Instr(InstrKind::JumpSub),
+                        "jsrr" => TokenKind::Instr(InstrKind::JumpSubReg),
+                        "ld" => TokenKind::Instr(InstrKind::Load),
+                        "ldi" => TokenKind::Instr(InstrKind::LoadInd),
+                        "ldr" => TokenKind::Instr(InstrKind::LoadReg),
+                        "lea" => TokenKind::Instr(InstrKind::LoadAddr),
+                        "not" => TokenKind::Instr(InstrKind::Not),
+                        "ret" => TokenKind::Instr(InstrKind::Return),
+                        "rti" => TokenKind::Instr(InstrKind::Interrupt),
+                        "st" => TokenKind::Instr(InstrKind::Store),
+                        "sti" => TokenKind::Instr(InstrKind::StoreInd),
+                        _ => TokenKind::Ident,
+                    },
+                    TokenKind::Direc => {
+                        todo!()
+                    }
+                    _ => kind,
+                };
                 let token = Token {
-                    kind: *kind,
+                    kind,
                     span: Span::new(Idx(self.curr_pt() as u32), tok.len() as u16),
                 };
                 self.advance(tok.len());
diff --git a/src/main.rs b/src/main.rs
index 9600e74..36873be 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,6 +5,8 @@ use std::fs;
 use clap::{Parser, Subcommand};
 use colored::Colorize;
 use lexer::{tokenize, TokenKind};
+use miette::Result;
+use parser::AsmParser;
 
 mod lexer;
 mod ops;
@@ -59,7 +61,7 @@ enum Command {
     },
 }
 
-fn main() {
+fn main() -> Result<()> {
     let args = Args::parse();
 
     if let Some(command) = args.command {
@@ -70,6 +72,10 @@ fn main() {
                 for tok in tokenize(&file).filter(|tok| tok.kind != TokenKind::Junk) {
                     println!("{:?} {}", tok, &file[tok.span.as_range()]);
                 }
+
+                let mut parse = AsmParser::from(file.as_str());
+                parse.parse()?;
+                Ok(())
             }
             Command::Clean { name } => todo!(),
             Command::Watch { name } => todo!(),
diff --git a/src/parser.rs b/src/parser.rs
index e865b62..b949a9e 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,30 +1,45 @@
-use crate::lexer::{cursor::Cursor, TokenKind};
+use std::error::Error;
+
+use miette::{miette, Result};
+
+use crate::lexer::{cursor::Cursor, Token, TokenKind};
 
 /// Transforms token stream into 'AST'
-pub struct Parser<'source> {
+pub struct AsmParser<'source> {
     /// Reference to the source file
     src: &'source str,
     /// Used to parse tokens
     cur: Cursor<'source>,
 }
 
-impl<'a> From<&'a str> for Parser<'a> {
+impl<'a> From<&'a str> for AsmParser<'a> {
     fn from(value: &'a str) -> Self {
-        Parser {
+        AsmParser {
             src: value,
             cur: Cursor::new(value),
         }
     }
 }
 
-impl<'source> Parser<'source> {
-    pub fn parse(&self) {
+impl<'source> AsmParser<'source> {
+    pub fn parse(&mut self) -> Result<()> {
         // First, check that there is an .orig directive with an appropriate value.
-        todo!()
+        let orig = self.expect(TokenKind::Direc)?;
+        let addr = self.expect(TokenKind::Lit(crate::lexer::LiteralKind::Hex));
+
+        Ok(())
     }
 
-    pub fn expect(kind: TokenKind) {
-        todo!()
+    pub fn expect(&mut self, kind: TokenKind) -> Result<Token> {
+        let tok = self.cur.advance_token();
+        if tok.kind == kind {
+            return Ok(tok);
+        }
+        Err(miette!(
+            "ParseError: expected token of type {:?}, found {:?}",
+            kind,
+            tok
+        ))
     }
 
     pub fn parse_direc(&self) {
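
The expect helper above is the core token-expectation primitive of this parser; a hypothetical caller (assuming the types from this patch, not code from the series) would chain it like so:

    // Hypothetical caller, not part of this patch
    fn parse_preamble(p: &mut AsmParser) -> miette::Result<()> {
        p.expect(TokenKind::Direc)?;                  // .orig
        p.expect(TokenKind::Lit(LiteralKind::Hex))?;  // x3000
        Ok(())
    }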

From 221486a8c8d7ea80e1c30f7d53a80f408db89fd5 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 12:16:36 +1000
Subject: [PATCH 03/17] Add indexmap dependency

---
 Cargo.lock | 39 +++++++++++++++++++++++++++++++++++++++
 Cargo.toml |  2 ++
 2 files changed, 41 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index ed76f0c..dde14b2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -131,6 +131,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
 [[package]]
 name = "cc"
 version = "1.1.10"
@@ -211,6 +217,12 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
 
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
 [[package]]
 name = "errno"
 version = "0.3.9"
@@ -221,6 +233,15 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "fxhash"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "gimli"
 version = "0.29.0"
@@ -233,12 +254,28 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
 [[package]]
 name = "heck"
 version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
+[[package]]
+name = "indexmap"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
 [[package]]
 name = "is_ci"
 version = "1.2.0"
@@ -258,7 +295,9 @@ dependencies = [
  "assert_cmd",
  "clap",
  "colored",
+ "fxhash",
  "glob",
+ "indexmap",
  "lazy_static",
  "miette",
  "regex",
diff --git a/Cargo.toml b/Cargo.toml
index 4694274..294e1e8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,8 @@ colored = "2.1.0"
 regex = "1.10.6"
 lazy_static = "1.5.0"
 miette = { version = "7.2.0", features = ["fancy"] }
+indexmap = "2.4.0"
+fxhash = "0.2.1"
 
 [dev-dependencies]
 assert_cmd = "2.0.14"
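
indexmap plus fxhash is the usual recipe for an insertion-ordered map with a fast non-cryptographic hash, which is how the pair ends up backing the symbol table later in this series; a minimal sketch:

    use fxhash::FxBuildHasher;
    use indexmap::IndexMap;

    type FxMap<K, V> = IndexMap<K, V, FxBuildHasher>;

    fn main() {
        let mut symbols: FxMap<String, u16> =
            IndexMap::with_hasher(FxBuildHasher::default());
        // insert_full returns (index, replaced); the index doubles as a stable symbol id
        let (idx, _) = symbols.insert_full("LOOP_START".to_string(), 0x3000);
        assert_eq!(symbols.get_index_of("LOOP_START"), Some(idx));
    }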

From 5ada5680a51e937b4e06e6840cf4cc021e7acc7a Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 12:17:16 +1000
Subject: [PATCH 04/17] Redo lexer without regex

---
 src/lexer/cursor.rs |  52 ++++++-----
 src/lexer/mod.rs    | 218 ++++++++++++++++++--------------------------
 src/main.rs         |   6 +-
 3 files changed, 124 insertions(+), 152 deletions(-)

diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
index 83166f0..7a56ee4 100644
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@@ -2,52 +2,60 @@
 // Heavily inspired and referenced from `rustc_lexer` and adapted to suit the project.
 // See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html
 
+use std::str::Chars;
+
 /// Peekable iterator over a char sequence.
 pub struct Cursor<'a> {
     len_remaining: usize,
-    /// Index that the cursor is pointing to in the source
-    curr_pt: usize,
     /// Iterator over chars in a &str
-    chars: &'a str,
+    chars: Chars<'a>,
 }
 
+pub(crate) const NULL_CHAR: char = '\0';
+
 impl<'a> Cursor<'a> {
     pub fn new(input: &'a str) -> Cursor<'a> {
         Cursor {
             len_remaining: input.len(),
-            curr_pt: 0,
-            chars: input,
+            chars: input.chars(),
         }
     }
 
-    pub fn get_next(&self, len: usize) -> &'a str {
-        println!("{}", &self.chars[self.curr_pt..(self.curr_pt + len)]);
-        &self.chars[self.curr_pt..(self.curr_pt + len)]
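+    /// Remaining source as a string slice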
+    pub fn as_str(&self) -> &'a str {
+        self.chars.as_str()
+    }
+
+    /// Returns next character without consuming it.
+    pub fn first(&self) -> char {
+        self.chars.clone().next().unwrap_or(NULL_CHAR)
     }
 
     /// File is finished parsing
     pub fn is_eof(&self) -> bool {
-        self.len_remaining == 0
+        self.chars.as_str().is_empty()
     }
 
-    /// Return slice of input starting at the current point of the cursor
-    pub fn at_curr_pt(&self) -> &'a str {
-        &self.chars[self.curr_pt..]
+    /// Advance by one character, returning it (`None` at EOF)
+    pub fn bump(&mut self) -> Option<char> {
+        let c = self.chars.next()?;
+        Some(c)
     }
 
-    /// Move cursor ahead in the input by given amount
-    pub fn advance(&mut self, amt: usize) {
-        self.curr_pt += amt;
-        self.len_remaining -= amt;
+    /// Returns the number of chars consumed for the current token.
+    /// Backed by a basic counter that is reset after each token.
+    pub(crate) fn pos_in_token(&self) -> u32 {
+        (self.len_remaining - self.chars.as_str().len()) as u32
     }
 
-    /// Advance by one character
-    pub fn bump(&mut self) {
-        self.advance(1)
+    /// Resets the number of consumed chars
+    pub(crate) fn reset_pos(&mut self) {
+        self.len_remaining = self.chars.as_str().len();
     }
 
-    /// Returns current cursor position
-    pub fn curr_pt(&self) -> usize {
-        self.curr_pt
+    /// Consume characters while the predicate returns true
+    pub(crate) fn take_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        while predicate(self.first()) && !self.is_eof() {
+            self.bump();
+        }
     }
 }
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index b05e007..36d1ecc 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -7,85 +7,45 @@ use crate::symbol::Register;
 
 pub mod cursor;
 
+/// A 'light' token that only carries basic and easily derivable info
 #[derive(Debug)]
-pub struct Token {
-    pub kind: TokenKind,
-    pub span: Span,
+pub struct LToken {
+    pub kind: LTokenKind,
+    pub len: u32,
+}
+
+impl LToken {
+    pub fn new(kind: LTokenKind, len: u32) -> Self {
+        LToken { kind, len }
+    }
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum LiteralKind {
     Hex,
     Dec,
-    Str,
-}
-
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum InstrKind {
-    Add,
-    And,
-    Branch,
-    Jump,
-    JumpSub,
-    JumpSubReg,
-    Load,
-    LoadInd,
-    LoadReg,
-    LoadAddr,
-    Not,
-    Return,
-    Interrupt,
-    Store,
-    StoreInd,
+    Str { terminated: bool },
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum DirecKind {
-    Orig,
-    Stringz,
-    Blkw,
-    Fill,
-    Alias,
-    Macro,
-    End,
-    Export,
-    Import,
-}
-
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum TrapKind {
-    /// Get a character from standard input
-    Getc,
-    /// Output a single character
-    Out,
-    /// Print string
-    Puts,
-    In,
-    Putsp,
-    Halt,
-    Trap,
-}
-
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum TokenKind {
+pub enum LTokenKind {
     Ident,
-    Instr(InstrKind),
     Lit(LiteralKind),
     Comment,
     Direc,
     Reg,
-    /// Commas and whitespace
-    Junk,
+    /// Also includes commas
+    Whitespace,
     Unknown,
     Eof,
 }
 
 /// Not actually used in parsing, more for debug purposes.
-pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
+pub fn tokenize(input: &str) -> impl Iterator<Item = LToken> + '_ {
     let mut cursor = Cursor::new(input);
     std::iter::from_fn(move || {
         let token = cursor.advance_token();
-        if token.kind != TokenKind::Eof {
+        if token.kind != LTokenKind::Eof {
             Some(token)
         } else {
             None
@@ -93,80 +53,84 @@ pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     })
 }
 
-lazy_static! {
-    // Order is important since some patterns are subpatterns of others.
-    // Do NOT rearrange without a good hard think.
-    static ref PATTERNS: Vec<(TokenKind, Regex)> = vec![
-        (TokenKind::Junk, Regex::new(r"^[,\s]+").unwrap()),
-        (
-            TokenKind::Lit(LiteralKind::Hex),
-            Regex::new(r"^(0x|x)[0-9a-fA-F]+\b").unwrap(),
-        ),
-        (
-            TokenKind::Lit(LiteralKind::Dec),
-            Regex::new(r"^#[0-9]+\b").unwrap(),
-        ),
-        (TokenKind::Reg, Regex::new(r"^[rR][0-8]\b").unwrap()),
-        // Includes instructions, branches, and labels.
-        (TokenKind::Ident, Regex::new(r"^[a-zA-Z_]\w*\b").unwrap()),
-        (TokenKind::Comment, Regex::new(r"^;[^\n]*").unwrap()),
-        (TokenKind::Direc, Regex::new(r"^\.[a-zA-Z_]*\b").unwrap()),
-        (TokenKind::Lit(LiteralKind::Str), Regex::new(r#"^"([^"\\]|\\.)*""#).unwrap())
-    ];
+/// Test if a character is considered to be whitespace.
+pub(crate) fn is_whitespace(c: char) -> bool {
+    // Commas are essentially whitespace in LC3
+    matches!(c, ' ' | '\n' | '\t' | '\r' | ',')
 }
 
-impl Cursor<'_> {
-    pub fn advance_token(&mut self) -> Token {
-        if self.is_eof() {
-            return Token {
-                kind: TokenKind::Eof,
-                span: Span::default(),
-            };
-        }
+pub(crate) fn is_id(c: char) -> bool {
+    // Non-prefixed numerical literals are considered identifiers.
+    // This is because line numbers can be used as labels.
+    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
+}
 
-        for (kind, re) in PATTERNS.iter() {
-            if let Some(tok) = re.find(self.at_curr_pt()) {
-                // Parse into precise definition
-                let mut kind = *kind;
-                kind = match kind {
-                    TokenKind::Ident => match self.get_next(tok.len()).to_lowercase().as_str() {
-                        "add" => TokenKind::Instr(InstrKind::Add),
-                        "and" => TokenKind::Instr(InstrKind::And),
-                        "br" | "brn" | "brz" | "brp" | "brnz" | "brnzp" | "brnp" | "brzp" => {
-                            TokenKind::Instr(InstrKind::Branch)
-                        }
-                        "jmp" => TokenKind::Instr(InstrKind::Jump),
-                        "jsr" => TokenKind::Instr(InstrKind::JumpSub),
-                        "jsrr" => TokenKind::Instr(InstrKind::JumpSubReg),
-                        "ld" => TokenKind::Instr(InstrKind::Load),
-                        "ldi" => TokenKind::Instr(InstrKind::LoadInd),
-                        "ldr" => TokenKind::Instr(InstrKind::LoadReg),
-                        "lea" => TokenKind::Instr(InstrKind::LoadAddr),
-                        "not" => TokenKind::Instr(InstrKind::Not),
-                        "ret" => TokenKind::Instr(InstrKind::Return),
-                        "rti" => TokenKind::Instr(InstrKind::Interrupt),
-                        "st" => TokenKind::Instr(InstrKind::Store),
-                        "sti" => TokenKind::Instr(InstrKind::StoreInd),
-                        _ => TokenKind::Ident,
-                    },
-                    TokenKind::Direc => {
-                        todo!()
-                    }
-                    _ => kind,
-                };
-                let token = Token {
-                    kind,
-                    span: Span::new(Idx(self.curr_pt() as u32), tok.len() as u16),
-                };
-                self.advance(tok.len());
-                return token;
-            }
-        }
+pub(crate) fn is_num(c: char) -> bool {
+    matches!(c, '0'..='9')
+}
 
-        self.bump();
-        Token {
-            kind: TokenKind::Unknown,
-            span: Span::new(Idx((self.curr_pt() - 1) as u32), 1u16),
-        }
+pub(crate) fn is_hex(c: char) -> bool {
+    matches!(c, 'a'..='f' | 'A'..='F' | '0'..='9')
+}
+
+impl Cursor<'_> {
+    pub fn advance_token(&mut self) -> LToken {
+        let first_char = match self.bump() {
+            Some(c) => c,
+            None => return LToken::new(LTokenKind::Eof, 0),
+        };
+        let token_kind = match first_char {
+            ';' => {
+                self.take_while(|c| c != '\n');
+                LTokenKind::Comment
+            }
+            c if is_whitespace(c) => {
+                self.take_while(is_whitespace);
+                LTokenKind::Whitespace
+            }
+            // Hex literals
+            'x' | 'X' => {
+                self.take_while(is_hex);
+                LTokenKind::Lit(LiteralKind::Hex)
+            }
+            '0' => match self.first() {
+                'x' | 'X' => {
+                    self.bump(); // consume the 'x'/'X' after the leading '0'
+                    self.take_while(is_hex);
+                    LTokenKind::Lit(LiteralKind::Hex)
+                }
+                _ => {
+                    self.take_while(is_id);
+                    LTokenKind::Ident
+                }
+            },
+            // Identifiers should be checked after everything else that overlaps.
+            c if is_id(c) => {
+                self.take_while(is_id);
+                LTokenKind::Ident
+            }
+            // Decimal literal
+            '#' => {
+                if self.first() == '-' {
+                    self.bump();
+                }
+                self.take_while(is_num);
+                LTokenKind::Lit(LiteralKind::Dec)
+            }
+            // Directive
+            '.' => {
+                self.take_while(is_id);
+                LTokenKind::Direc
+            }
+            // String literal
+            // TODO: Allow for escaped characters and the terminated thing
+            '"' => {
+                self.take_while(|c| c != '"');
+                LTokenKind::Lit(LiteralKind::Str { terminated: true })
+            }
+            _ => LTokenKind::Unknown,
+        };
+        let res = LToken::new(token_kind, self.pos_in_token());
+        self.reset_pos();
+        res
     }
 }
diff --git a/src/main.rs b/src/main.rs
index 36873be..3787be5 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -4,7 +4,7 @@ use std::fs;
 
 use clap::{Parser, Subcommand};
 use colored::Colorize;
-use lexer::{tokenize, TokenKind};
+use lexer::{tokenize, LTokenKind};
 use miette::Result;
 use parser::AsmParser;
 
@@ -69,8 +69,8 @@ fn main() -> Result<()> {
             Command::Run { os, name } => todo!(),
             Command::Compile { name, dest } => {
                 let file = fs::read_to_string(name).unwrap();
-                for tok in tokenize(&file).filter(|tok| tok.kind != TokenKind::Junk) {
-                    println!("{:?} {}", tok, &file[tok.span.as_range()]);
+                for tok in tokenize(&file).filter(|tok| tok.kind != LTokenKind::Whitespace) {
+                    println!("{:?}", tok);
                 }
 
                 let mut parse = AsmParser::from(file.as_str());
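
Since LToken now carries only a kind and a length, callers recover token text by keeping a running offset; a small sketch of driving the new lexer (assumed usage, not from this patch):

    // Recover token text from cumulative lengths
    let src = "ADD R0, R0, #1 ; increment";
    let mut start = 0usize;
    for tok in tokenize(src) {
        let end = start + tok.len as usize;
        println!("{:?} {:?}", tok.kind, &src[start..end]);
        start = end;
    }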

From 0a9f55185a1b0702cc8be7fabdab2c51278fc67b Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 12:18:01 +1000
Subject: [PATCH 05/17] Progress

---
 src/ops.rs    |  4 ++--
 src/parser.rs | 34 ++++++++++++++++++++++++++++++----
 src/symbol.rs |  6 ++++++
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/src/ops.rs b/src/ops.rs
index 9cf5eae..fd2df66 100644
--- a/src/ops.rs
+++ b/src/ops.rs
@@ -1,5 +1,5 @@
 use crate::{
-    lexer::Token,
+    lexer::LToken,
     symbol::{ByteOffs, Flag, Label, Register},
 };
 
@@ -75,7 +75,7 @@ pub enum Op {
         pc_offset9: u16,
     },
     Dir {
-        args: Option<Vec<Token>>,
+        args: Option<Vec<LToken>>,
     },
 }
 
diff --git a/src/parser.rs b/src/parser.rs
index b949a9e..846d6b3 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,7 +2,7 @@ use std::error::Error;
 
 use miette::{miette, Result};
 
-use crate::lexer::{cursor::Cursor, Token, TokenKind};
+use crate::lexer::{cursor::Cursor, LToken, LTokenKind};
 
 /// Transforms token stream into 'AST'
 pub struct AsmParser<'source> {
@@ -24,13 +24,39 @@ impl<'a> From<&'a str> for AsmParser<'a> {
 impl<'source> AsmParser<'source> {
     pub fn parse(&mut self) -> Result<()> {
         // First, check that there is an .orig directive with an appropriate value.
-        let orig = self.expect(TokenKind::Direc)?;
-        let addr = self.expect(TokenKind::Lit(crate::lexer::LiteralKind::Hex));
+        // Should emit error with a label to the first line stating "Expected memory init"
+        // Should be in a function that is also used to init the memory - the question is
+        // whether it should remain as a full directive or as a value that gets emitted afterwards.
+        let orig = self.expect(LTokenKind::Direc)?;
+        // Need ability to expect an enum without specifying a subcase (maybe ()?)
+        let addr = self.expect(LTokenKind::Lit(crate::lexer::LiteralKind::Hex));
+
+        // Following this, the structure is always:
+        // [label]
+        // ->   <inst> [args]
+        // OR
+        // <label>
+        // ->   <direc> [args]
+        // OR
+        // [label]
+        // ->*   <direc> <args>
+        // OR
+        // <trap> [arg]
+        // or: (sometimes opt label) num directives (opt argument)
+        // so should generally build to this structure. This means, however, that the complexity
+        // is not super high as there are really only two medium-complexity subcases to parse.
+        //
+        // TODO: Split into LexToken and Token, to simplify the lexer and have a postprocessing
+        // step that can then put it into a Token format that is then easily transformed into
+        // the 'AST'.
+        //
+        // In order to do this, there needs to be peeking functionality on the token stream so
+        // that it can e.g. see if there is a label present at the start of a line.
 
         Ok(())
     }
 
-    pub fn expect(&mut self, kind: TokenKind) -> Result<Token> {
+    pub fn expect(&mut self, kind: LTokenKind) -> Result<LToken> {
         let tok = self.cur.advance_token();
         if tok.kind == kind {
             return Ok(tok);
diff --git a/src/symbol.rs b/src/symbol.rs
index e49dac7..71b987b 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -1,3 +1,9 @@
+use fxhash::FxBuildHasher;
+use indexmap::IndexMap;
+
+// Symbol table of symbol -> memory address (line number)
+type FxMap<K, V> = IndexMap<K, V, FxBuildHasher>;
+
 /// Represents the CPU registers.
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
 pub enum Register {
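
The comment block added to parse sketches the per-line grammar; purely as illustration (none of these names exist yet in the series), the statement type it implies might eventually look like:

    // Illustrative only: one possible shape for parsed lines
    enum Stmt {
        Instr { label: Option<Symbol>, op: Op },
        Direc { label: Option<Symbol>, kind: DirKind, args: Vec<Token> },
        Trap { kind: TrapKind, arg: Option<Token> },
    }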

From 82a72d1dc070cb290fd340408e80607aa494dd42 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 16:05:10 +1000
Subject: [PATCH 06/17] Enable indexmap std feature

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 294e1e8..c0d0490 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,7 @@ colored = "2.1.0"
 regex = "1.10.6"
 lazy_static = "1.5.0"
 miette = { version = "7.2.0", features = ["fancy"] }
-indexmap = "2.4.0"
+indexmap = { version = "2.4.0", features = ["std"] }
 fxhash = "0.2.1"
 
 [dev-dependencies]

From 3018c48d5850f22667c435f8f629c04e873b6e63 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 16:05:47 +1000
Subject: [PATCH 07/17] Progress on token conversion

---
 src/lexer/cursor.rs |  4 ++--
 src/lexer/mod.rs    | 15 ++++-----------
 src/main.rs         |  1 -
 src/ops.rs          |  6 ------
 src/parser.rs       | 44 +++++++++++++++++++++++++++++++++++++-------
 src/span.rs         | 22 ----------------------
 src/symbol.rs       | 30 ++++++++++++++++++++++++++++++
 7 files changed, 73 insertions(+), 49 deletions(-)
 delete mode 100644 src/span.rs

diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
index 7a56ee4..885d31c 100644
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@@ -1,6 +1,6 @@
 //! Taken from the lexer in https://github.com/rozukke/mimi
-// Heavily inspired and referenced from `rustc_lexer` and adapted to suit the project.
-// See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html
+//! Heavily inspired and referenced from `rustc_lexer` and adapted to suit the project.
+//! See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html
 
 use std::str::Chars;
 
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 36d1ecc..231e219 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -59,20 +59,13 @@ pub(crate) fn is_whitespace(c: char) -> bool {
     matches!(c, ' ' | '\n' | '\t' | '\r' | ',')
 }
 
+/// Test if a character is considered an LC3 identifier character.
 pub(crate) fn is_id(c: char) -> bool {
     // Non-prefixed numerical literals are considered identifiers.
     // This is because line numbers can be used as labels.
     matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
 }
 
-pub(crate) fn is_num(c: char) -> bool {
-    matches!(c, '0'..='9')
-}
-
-pub(crate) fn is_hex(c: char) -> bool {
-    matches!(c, 'a'..='f' | 'A'..='F' | '0'..='9')
-}
-
 impl Cursor<'_> {
     pub fn advance_token(&mut self) -> LToken {
         let first_char = match self.bump() {
@@ -90,12 +83,12 @@ impl Cursor<'_> {
             }
             // Hex literals
             'x' | 'X' => {
-                self.take_while(is_hex);
+                self.take_while(|c| char::is_ascii_hexdigit(&c));
                 LTokenKind::Lit(LiteralKind::Hex)
             }
             '0' => match self.first() {
                 'x' | 'X' => {
                     self.bump(); // consume the 'x'/'X' after the leading '0'
-                    self.take_while(is_hex);
+                    self.take_while(|c| char::is_ascii_hexdigit(&c));
                     LTokenKind::Lit(LiteralKind::Hex)
                 }
                 _ => {
@@ -113,7 +106,7 @@ impl Cursor<'_> {
                 if self.first() == '-' {
                     self.bump();
                 }
-                self.take_while(is_num);
+                self.take_while(|c| char::is_ascii_digit(&c));
                 LTokenKind::Lit(LiteralKind::Dec)
             }
             // Directive
diff --git a/src/main.rs b/src/main.rs
index 3787be5..6284a30 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -12,7 +12,6 @@ mod lexer;
 mod ops;
 mod parser;
 mod runtime;
-mod span;
 mod symbol;
 
 /// Lace is a complete & convenient assembler toolchain for the LC3 assembly language.
diff --git a/src/ops.rs b/src/ops.rs
index fd2df66..6baa6d7 100644
--- a/src/ops.rs
+++ b/src/ops.rs
@@ -3,12 +3,6 @@ use crate::{
     symbol::{ByteOffs, Flag, Label, Register},
 };
 
-pub struct Stmt {
-    line: u32,
-    label: Label,
-    op: Op,
-}
-
 /// Basically the entire 'AST' when it comes to LC3.
 /// TODO: Convert to labels instead of offsets at this stage.
 #[allow(clippy::upper_case_acronyms)]
diff --git a/src/parser.rs b/src/parser.rs
index 846d6b3..772ad28 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,27 +1,57 @@
-use std::error::Error;
+use std::{error::Error, io::Cursor};
 
 use miette::{miette, Result};
 
-use crate::lexer::{cursor::Cursor, LToken, LTokenKind};
+use crate::{
+    lexer::{tokenize, LToken, LTokenKind, LiteralKind},
+    symbol::{DirKind, InstrKind, Register, Span, Symbol, TrapKind},
+};
+
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub struct Token {
+    kind: TokenKind,
+    span: Span,
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum TokenKind {
+    /// `r0-r7 | R0-R7`
+    Reg(Register),
+    /// `LOOP_START`, `123`, `coolname`
+    Label(Symbol),
+    /// `.orig`, `.Stringz`, `.BLKW`
+    Dir(DirKind),
+    /// `PUTS`, `Trap`, `putc`
+    Trap(TrapKind),
+    /// `"hi\n"`, `0x3AB5F`, `#-1`
+    Lit(LiteralKind),
+    /// `add`, `JMP`, `Ret`
+    Inst(InstrKind),
+}
+
+pub fn proc_tokens<'a>(src: &'a str) -> Vec<Token> {
+    todo!()
+}
 
 /// Transforms token stream into 'AST'
-pub struct AsmParser<'source> {
+pub struct AsmParser<'a> {
     /// Reference to the source file
-    src: &'source str,
+    src: &'a Vec<Token>,
     /// Used to parse tokens
-    cur: Cursor<'source>,
+    cur: Cursor<'a>,
 }
 
 impl<'a> From<&'a str> for AsmParser<'a> {
     fn from(value: &'a str) -> Self {
+        let toks: Vec<LToken> = tokenize(value).collect();
         AsmParser {
-            src: value,
+            src: toks,
             cur: Cursor::new(value),
         }
     }
 }
 
-impl<'source> AsmParser<'source> {
+impl<'a> AsmParser<'a> {
     pub fn parse(&mut self) -> Result<()> {
         // First, check that there is an .orig directive with an appropriate value.
         // Should emit error with a label to the first line stating "Expected memory init"
diff --git a/src/span.rs b/src/span.rs
deleted file mode 100644
index e821c2c..0000000
--- a/src/span.rs
+++ /dev/null
@@ -1,22 +0,0 @@
-/// Position relative to start of source.
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)]
-pub struct Idx(pub u32);
-
-/// Holds a view into a source.
-#[derive(Clone, Copy, PartialEq, Eq, Default, Hash, Debug)]
-pub struct Span {
-    start: Idx,
-    len: u16,
-}
-
-impl Span {
-    pub fn new(start: Idx, len: u16) -> Self {
-        Span { start, len }
-    }
-
-    pub fn as_range(&self) -> std::ops::Range<usize> {
-        let start = self.start.0 as usize;
-        let end = start + self.len as usize;
-        start..end
-    }
-}
diff --git a/src/symbol.rs b/src/symbol.rs
index 71b987b..d4956f0 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -4,6 +4,21 @@ use indexmap::IndexMap;
 // Symbol table of symbol -> memory address (line number)
 type FxMap<K, V> = IndexMap<K, V, FxBuildHasher>;
 
+thread_local! {
+    static SYMBOL_TABLE: FxMap<String, u16> = IndexMap::with_hasher(FxBuildHasher::default());
+}
+
+/// Reference to symbol table index
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
+pub struct Symbol(u16);
+
+/// Location within source
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub struct Span {
+    offs: ByteOffs,
+    len: usize,
+}
+
 /// Represents the CPU registers.
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
 pub enum Register {
@@ -37,6 +52,21 @@ pub enum Flag {
     Nzp,
 }
 
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum InstrKind {
+    Add,
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum TrapKind {
+    Trap(u16),
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum DirKind {
+    Orig,
+}
+
 /// Newtype representing an address inside the LC3 memory.
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
 pub struct Addr(u16);

From 67ca7536d91781c914a9001ea4b76481537156cb Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Sat, 17 Aug 2024 03:03:10 +1000
Subject: [PATCH 08/17] Add initial parser function spec

---
 src/parser.rs | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/parser.rs b/src/parser.rs
index 772ad28..ef64a30 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,9 +1,9 @@
-use std::{error::Error, io::Cursor};
+use std::error::Error;
 
 use miette::{miette, Result};
 
 use crate::{
-    lexer::{tokenize, LToken, LTokenKind, LiteralKind},
+    lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind},
     symbol::{DirKind, InstrKind, Register, Span, Symbol, TrapKind},
 };
 
@@ -29,24 +29,33 @@ pub enum TokenKind {
     Inst(InstrKind),
 }
 
-pub fn proc_tokens<'a>(src: &'a str) -> Vec<Token> {
+pub fn proc_tokens(src: &str) -> Vec<Token> {
+    // Get reference to global symbol table
+    // Iterate through, +1 to symbol count per inst
+    // +len(str) for every string literal
+    // +number of lines for BLKW (need to process cringe inconsistent literals)
+    // Also need to do matching to process register and instruction tokens into the correct contents
+    let toks: Vec<LToken> = tokenize(src).collect();
     todo!()
 }
 
 /// Transforms token stream into 'AST'
 pub struct AsmParser<'a> {
     /// Reference to the source file
-    src: &'a Vec<Token>,
+    src: &'a str,
+    /// List of processed tokens
+    tok: Vec<Token>,
     /// Used to parse tokens
     cur: Cursor<'a>,
 }
 
 impl<'a> From<&'a str> for AsmParser<'a> {
-    fn from(value: &'a str) -> Self {
-        let toks: Vec<LToken> = tokenize(value).collect();
+    fn from(src: &'a str) -> Self {
+        let tok: Vec<Token> = proc_tokens(src);
         AsmParser {
-            src: toks,
-            cur: Cursor::new(value),
+            src,
+            tok,
+            cur: Cursor::new(src),
         }
     }
 }

From 596f9ff5ccdd76989a20ad33dbd3866ae4b35861 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Sat, 17 Aug 2024 03:03:27 +1000
Subject: [PATCH 09/17] Add function to look ahead n times

---
 src/lexer/cursor.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
index 885d31c..a7589ce 100644
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@@ -1,4 +1,3 @@
-//! Taken from the lexer in https://github.com/rozukke/mimi
 //! Heavily inspired and referenced from `rustc_lexer` and adapted to suit the project.
 //! See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html
 
@@ -58,4 +57,8 @@ impl<'a> Cursor<'a> {
             self.bump();
         }
     }
+
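+    /// Peek at the next `n` chars without consuming them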
+    pub(crate) fn take_n(&self, n: usize) -> String {
+        self.chars.clone().take(n).collect()
+    }
 }
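
Cloning a Chars iterator is cheap (it copies a pair of pointers), which is what makes take_n a zero-consumption lookahead; the same trick in isolation:

    // Standalone illustration of clone-based lookahead
    let s = ".end";
    let mut chars = s.chars();
    chars.next(); // consume '.'
    let peeked: String = chars.clone().take(3).collect();
    assert_eq!(peeked, "end");
    assert_eq!(chars.as_str(), "end"); // original iterator is untouched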

From 5e2d469257a0d537bedceced2d0ec51473489d2d Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Sat, 17 Aug 2024 03:03:41 +1000
Subject: [PATCH 10/17] Check for .end

---
 src/lexer/mod.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 231e219..f38cc34 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -2,7 +2,6 @@ use lazy_static::lazy_static;
 use regex::Regex;
 
 use crate::lexer::cursor::Cursor;
-use crate::span::{Idx, Span};
 use crate::symbol::Register;
 
 pub mod cursor;
@@ -111,8 +110,13 @@ impl Cursor<'_> {
             }
             // Directive
             '.' => {
+                let check = self.take_n(3).to_ascii_lowercase();
                 self.take_while(is_id);
-                LTokenKind::Direc
+                // Need to check for .end directive to avoid unnecessary parsing and errors
+                match (self.pos_in_token(), check.as_str()) {
+                    (4, "end") => LTokenKind::Eof, // '.' + "end" => 4 chars consumed
+                    _ => LTokenKind::Direc,
+                }
             }
             // String literal
             // TODO: Allow for escaped characters and the terminated thing

From 22a5e7b8135919ef44f0dcc8212427138ead8b35 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Sat, 17 Aug 2024 17:52:04 +1000
Subject: [PATCH 11/17] Parser progress

---
 src/lexer/cursor.rs |  4 +++
 src/parser.rs       | 81 +++++++++++++++++++++++++++++++++++++++------
 src/symbol.rs       | 25 ++++++++++++--
 3 files changed, 97 insertions(+), 13 deletions(-)

diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
index a7589ce..e07b37f 100644
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@@ -61,4 +61,8 @@ impl<'a> Cursor<'a> {
     pub(crate) fn take_n(&self, n: usize) -> String {
         self.chars.clone().take(n).collect()
     }
+
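+    /// Number of bytes left in the underlying source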
+    pub(crate) fn remaining(&self) -> usize {
+        self.chars.as_str().len()
+    }
 }
diff --git a/src/parser.rs b/src/parser.rs
index ef64a30..f956740 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,12 +1,16 @@
-use std::error::Error;
+use std::{borrow::BorrowMut, error::Error};
 
 use miette::{miette, Result};
 
 use crate::{
     lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind},
-    symbol::{DirKind, InstrKind, Register, Span, Symbol, TrapKind},
+    symbol::{
+        with_symbol_table, ByteOffs, DirKind, InstrKind, Register, Span, Symbol, TrapKind,
+        SYMBOL_TABLE,
+    },
 };
 
+/// Token with full span info and proper types
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub struct Token {
     kind: TokenKind,
@@ -29,14 +33,69 @@ pub enum TokenKind {
     Inst(InstrKind),
 }
 
-pub fn proc_tokens(src: &str) -> Vec<Token> {
-    // Get reference to global symbol table
-    // Iterate through, +1 to symbol count per inst
-    // +len(str) for every string literal
-    // +number of lines for BLKW (need to process cringe inconsistent literals)
-    // Also need to do matching to process register and instruction tokens into the correct contents
-    let toks: Vec<LToken> = tokenize(src).collect();
-    todo!()
+/// Used to parse symbols and process exact instructions
+pub struct StrParser<'a> {
+    src: &'a str,
+    cur: Cursor<'a>,
+    pos: usize,
+    line_num: usize,
+}
+
+impl<'a> StrParser<'a> {
+    pub fn new(src: &'a str) -> Self {
+        StrParser {
+            src,
+            cur: Cursor::new(src),
+            pos: 0,
+            line_num: 1,
+        }
+    }
+
+    fn get_next(&self, n: usize) -> &str {
+        &self.src[self.pos..(self.pos + n)]
+    }
+
+    pub fn proc_tokens(&mut self) -> Vec<Token> {
+        // Iterate through, +1 to symbol count per inst
+        // +len(str) for every string literal
+        // +number of lines for BLKW (need to process cringe inconsistent literals)
+        // Also need to do matching to process register and instruction tokens into the correct contents
+        let mut toks_final: Vec<Token> = Vec::new();
+        let mut line_num = 1;
+        loop {
+            let tok = self.cur.advance_token();
+            if let Some(tok_final) = match tok.kind {
+                // Add identifier to symbol table at with correct line number
+                LTokenKind::Ident => {
+                    // Process possibility of it being a trap
+                    todo!();
+                    // Add to symbol table as identifier
+                    let idx = with_symbol_table(|sym| {
+                        let tok_text = self.get_next(tok.len as usize);
+                        sym.get_index_of(tok_text)
+                            .unwrap_or_else(|| sym.insert_full(String::from(tok_text), line_num).0)
+                    });
+                    Some(Token {
+                        kind: TokenKind::Label(Symbol::from(idx)),
+                        span: Span::new(ByteOffs(self.pos), tok.len as usize),
+                    })
+                }
+                // Create literal of correct value
+                LTokenKind::Lit(_) => todo!(),
+                // Match on directive, check next value for number of lines skipped
+                LTokenKind::Direc => todo!(),
+                // TODO: Add registers to lexer
+                LTokenKind::Reg => todo!(),
+                LTokenKind::Whitespace | LTokenKind::Comment => None,
+                // TODO: Should return list of errors eventually
+                LTokenKind::Unknown => todo!(),
+                LTokenKind::Eof => break,
+            } {
+                toks_final.push(tok_final);
+            }
+        }
+        toks_final
+    }
 }
 
 /// Transforms token stream into 'AST'
@@ -51,7 +110,7 @@ pub struct AsmParser<'a> {
 
 impl<'a> From<&'a str> for AsmParser<'a> {
     fn from(src: &'a str) -> Self {
-        let tok: Vec<Token> = proc_tokens(src);
+        let tok: Vec<Token> = StrParser::new(src).proc_tokens();
         AsmParser {
             src,
             tok,
diff --git a/src/symbol.rs b/src/symbol.rs
index d4956f0..8adf1b8 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -1,3 +1,5 @@
+use std::cell::RefCell;
+
 use fxhash::FxBuildHasher;
 use indexmap::IndexMap;
 
@@ -5,12 +7,25 @@ use indexmap::IndexMap;
 type FxMap<K, V> = IndexMap<K, V, FxBuildHasher>;
 
 thread_local! {
-    static SYMBOL_TABLE: FxMap<String, u16> = IndexMap::with_hasher(FxBuildHasher::default());
+    pub static SYMBOL_TABLE: RefCell<FxMap<String, u16>> = RefCell::new(IndexMap::with_hasher(FxBuildHasher::default()));
+}
+
+pub fn with_symbol_table<R, F>(f: F) -> R
+where
+    F: FnOnce(&mut FxMap<String, u16>) -> R,
+{
+    SYMBOL_TABLE.with_borrow_mut(f)
 }
 
 /// Reference to symbol table index
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
-pub struct Symbol(u16);
+pub struct Symbol(usize);
+
+impl From<usize> for Symbol {
+    fn from(value: usize) -> Self {
+        Symbol(value)
+    }
+}
 
 /// Location within source
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
@@ -19,6 +34,12 @@ pub struct Span {
     len: usize,
 }
 
+impl Span {
+    pub fn new(offs: ByteOffs, len: usize) -> Self {
+        Span { offs, len }
+    }
+}
+
 /// Represents the CPU registers.
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
 pub enum Register {
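
With the RefCell wrapper in place, all table access funnels through with_symbol_table; the intern-or-lookup pattern used by the parser, in miniature:

    // Intern-or-lookup against the thread-local table (sketch)
    let idx = with_symbol_table(|sym| {
        sym.get_index_of("LOOP_START")
            .unwrap_or_else(|| sym.insert_full("LOOP_START".to_string(), 12).0)
    });
    let label = Symbol::from(idx);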

From b0dedfcf0357429749d2348a27073a8ae57994d7 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Mon, 19 Aug 2024 17:24:29 +1000
Subject: [PATCH 12/17] Parser progress

---
 src/lexer/cursor.rs |  1 +
 src/ops.rs          |  4 +--
 src/parser.rs       | 82 ++++++++++++++++++++++++++++++++++++---------
 src/symbol.rs       | 28 +++++++++++++---
 4 files changed, 92 insertions(+), 23 deletions(-)

diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
index e07b37f..db97b3e 100644
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@@ -3,6 +3,7 @@
 
 use std::str::Chars;
 
 /// Peekable iterator over a char sequence.
+#[derive(Clone)]
 pub struct Cursor<'a> {
     len_remaining: usize,
diff --git a/src/ops.rs b/src/ops.rs
index 6baa6d7..5ba4560 100644
--- a/src/ops.rs
+++ b/src/ops.rs
@@ -1,6 +1,6 @@
 use crate::{
     lexer::LToken,
-    symbol::{ByteOffs, Flag, Label, Register},
+    symbol::{Flag, Label, LineOffs, Register},
 };
 
 /// Basically the entire 'AST' when it comes to LC3.
@@ -22,7 +22,7 @@ pub enum Op {
     /// Branch based on flag by adding ByteOffs to PC (program counter)
     BR {
         cc: Flag,
-        pc_offset9: ByteOffs,
+        pc_offset9: LineOffs,
     },
     /// Set PC to BR to perform a jump on the next cycle
     JMP {
diff --git a/src/parser.rs b/src/parser.rs
index f956740..f66ceb2 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -5,8 +5,8 @@ use miette::{miette, Result};
 use crate::{
     lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind},
     symbol::{
-        with_symbol_table, ByteOffs, DirKind, InstrKind, Register, Span, Symbol, TrapKind,
-        SYMBOL_TABLE,
+        with_symbol_table, DirKind, DirectiveKind, InstrKind, LineOffs, Register, Span, SrcOffset,
+        Symbol, TrapKind, SYMBOL_TABLE,
     },
 };
 
@@ -55,47 +55,97 @@ impl<'a> StrParser<'a> {
         &self.src[self.pos..(self.pos + n)]
     }
 
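+    /// Look at the next token without consuming it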
+    fn peek_next(&self) -> LToken {
+        self.cur.clone().advance_token()
+    }
+
     pub fn proc_tokens(&mut self) -> Vec<Token> {
         // Iterate through, +1 to symbol count per inst
         // +len(str) for every string literal
         // +number of lines for BLKW (need to process cringe inconsistent literals)
         // Also need to do matching to process register and instruction tokens into the correct contents
         let mut toks_final: Vec<Token> = Vec::new();
-        let mut line_num = 1;
         loop {
             let tok = self.cur.advance_token();
             if let Some(tok_final) = match tok.kind {
+                LTokenKind::Eof => break,
                 // Add identifier to symbol table at with correct line number
                 LTokenKind::Ident => {
                     // Process possibility of it being a trap
-                    todo!();
-                    // Add to symbol table as identifier
-                    let idx = with_symbol_table(|sym| {
-                        let tok_text = self.get_next(tok.len as usize);
-                        sym.get_index_of(tok_text)
-                            .unwrap_or_else(|| sym.insert_full(String::from(tok_text), line_num).0)
-                    });
-                    Some(Token {
-                        kind: TokenKind::Label(Symbol::from(idx)),
-                        span: Span::new(ByteOffs(self.pos), tok.len as usize),
-                    })
+                    if let Some(trap) = StrParser::trap(self.get_next(tok.len as usize)) {
+                        self.line_num += 1;
+                        Some(Token {
+                            kind: TokenKind::Trap(trap),
+                            span: Span::new(SrcOffset(self.pos), tok.len as usize),
+                        })
+                    } else {
+                        // Add to symbol table as identifier
+                        let idx = with_symbol_table(|sym| {
+                            let tok_text = self.get_next(tok.len as usize);
+                            sym.get_index_of(tok_text).unwrap_or(
+                                sym.insert_full(String::from(tok_text), self.line_num as u16)
+                                    .0,
+                            )
+                        });
+                        Some(Token {
+                            kind: TokenKind::Label(Symbol::from(idx)),
+                            span: Span::new(SrcOffset(self.pos), tok.len as usize),
+                        })
+                    }
                 }
                 // Create literal of correct value
                 LTokenKind::Lit(_) => todo!(),
                 // Match on directive, check next value for number of lines skipped
-                LTokenKind::Direc => todo!(),
+                LTokenKind::Direc => {
+                    if let Some(direc) = StrParser::direc(self.get_next(tok.len as usize)) {
+                        Some(Token {
+                            kind: TokenKind::Dir(direc),
+                            span: Span::new(SrcOffset(self.pos), tok.len as usize),
+                        })
+                    } else {
+                        // TODO: Error handling in a list
+                        todo!()
+                    }
+                }
                 // TODO: Add registers to lexer
                 LTokenKind::Reg => todo!(),
                 LTokenKind::Whitespace | LTokenKind::Comment => None,
                 // TODO: Should return list of errors eventually
                 LTokenKind::Unknown => todo!(),
-                LTokenKind::Eof => break,
             } {
                 toks_final.push(tok_final);
+                self.pos += tok.len as usize;
             }
         }
         toks_final
     }
+
+    fn trap(s: &str) -> Option<TrapKind> {
+        match s.to_ascii_lowercase().as_str() {
+            "getc" => Some(TrapKind::Getc),
+            "out" => Some(TrapKind::Out),
+            "puts" => Some(TrapKind::Puts),
+            "in" => Some(TrapKind::In),
+            "putsp" => Some(TrapKind::Putsp),
+            "halt" => Some(TrapKind::Halt),
+            "trap" => Some(TrapKind::Generic),
+            _ => None,
+        }
+    }
+    pub fn direc(s: &str) -> Option<DirectiveKind> {
+        match s.to_ascii_lowercase().as_str() {
+            ".alias" => Some(DirectiveKind::Alias),
+            ".macro" => Some(DirectiveKind::Macro),
+            ".orig" => Some(DirectiveKind::Orig),
+            ".end" => Some(DirectiveKind::End),
+            ".stringz" => Some(DirectiveKind::Stringz),
+            ".blkw" => Some(DirectiveKind::Blkw),
+            ".fill" => Some(DirectiveKind::Fill),
+            ".export" => Some(DirectiveKind::Export),
+            ".import" => Some(DirectiveKind::Import),
+            _ => None,
+        }
+    }
 }
 
 /// Transforms token stream into 'AST'
diff --git a/src/symbol.rs b/src/symbol.rs
index 8adf1b8..e81cbfd 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -30,12 +30,12 @@ impl From<usize> for Symbol {
 /// Location within source
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub struct Span {
-    offs: ByteOffs,
+    offs: SrcOffset,
     len: usize,
 }
 
 impl Span {
-    pub fn new(offs: ByteOffs, len: usize) -> Self {
+    pub fn new(offs: SrcOffset, len: usize) -> Self {
         Span { offs, len }
     }
 }
@@ -80,12 +80,26 @@ pub enum InstrKind {
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum TrapKind {
-    Trap(u16),
+    Generic,
+    Halt,
+    Putsp,
+    In,
+    Puts,
+    Out,
+    Getc,
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum DirKind {
+pub enum DirectiveKind {
+    Alias,
+    Macro,
     Orig,
+    End,
+    Stringz,
+    Blkw,
+    Fill,
+    Export,
+    Import,
 }
 
 /// Newtype representing an address inside the LC3 memory.
@@ -94,9 +108,13 @@ pub struct Addr(u16);
 
 /// Newtype representing an offset from a particular address.
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
-pub struct ByteOffs(u16);
+pub struct LineOffs(u16);
 
 /// Label used to refer to specific memory addresses
 /// TODO: optimize later
 #[derive(Clone, PartialEq, Eq, Debug)]
 pub struct Label(String);
+
+/// Used to refer to offsets from the start of a source file.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
+pub struct SrcOffset(pub usize);

From ef80494238fc109488f6e94cd6c0ee7f93eb80b0 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Tue, 20 Aug 2024 12:56:35 +1000
Subject: [PATCH 13/17] Add directive processing and registers to lexer

---
 src/lexer/mod.rs | 45 ++++++++++++++++++++++-----------
 src/parser.rs    | 66 ++++++++++++++++++++++++++++++++----------------
 src/symbol.rs    |  2 +-
 3 files changed, 75 insertions(+), 38 deletions(-)

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index f38cc34..7855d74 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -58,6 +58,11 @@ pub(crate) fn is_whitespace(c: char) -> bool {
     matches!(c, ' ' | '\n' | '\t' | '\r' | ',')
 }
 
+pub(crate) fn is_reg_num(c: char) -> bool {
+    // Valid only between 0-7
+    matches!(c, '0'..='7')
+}
+
 /// Test if a character is considered an LC3 identifier character.
 pub(crate) fn is_id(c: char) -> bool {
     // Non-prefixed numerical literals are considered identifiers.
@@ -81,25 +86,25 @@ impl Cursor<'_> {
                 LTokenKind::Whitespace
             }
             // Hex literals
-            'x' | 'X' => {
-                self.take_while(|c| char::is_ascii_hexdigit(&c));
-                LTokenKind::Lit(LiteralKind::Hex)
-            }
+            'x' | 'X' => self.hex(),
             '0' => match self.first() {
-                'x' | 'X' => {
-                    self.take_while(|c| char::is_ascii_hexdigit(&c));
-                    LTokenKind::Lit(LiteralKind::Hex)
-                }
-                _ => {
-                    self.take_while(is_id);
-                    LTokenKind::Ident
+                'x' | 'X' => self.hex(),
+                _ => self.ident(),
+            },
+            'r' | 'R' => match self.first() {
+                c if is_reg_num(c) => {
+                    self.take_while(is_reg_num);
+                    // Registers are 2 characters long and followed by whitespace/comma
+                    if self.pos_in_token() == 2 && is_whitespace(self.first()) {
+                        LTokenKind::Reg
+                    } else {
+                        self.ident()
+                    }
                 }
+                _ => self.ident(),
             },
             // Identifiers should be checked after everything else that overlaps.
-            c if is_id(c) => {
-                self.take_while(is_id);
-                LTokenKind::Ident
-            }
+            c if is_id(c) => self.ident(),
             // Decimal literal
             '#' => {
                 if self.first() == '-' {
@@ -130,4 +135,14 @@ impl Cursor<'_> {
         self.reset_pos();
         res
     }
+
+    fn ident(&mut self) -> LTokenKind {
+        self.take_while(|c| char::is_ascii_hexdigit(&c));
+        LTokenKind::Ident
+    }
+
+    fn hex(&mut self) -> LTokenKind {
+        self.take_while(|c| char::is_ascii_hexdigit(&c));
+        LTokenKind::Lit(LiteralKind::Hex)
+    }
 }
diff --git a/src/parser.rs b/src/parser.rs
index f66ceb2..f87b4f1 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,12 +1,12 @@
-use std::{borrow::BorrowMut, error::Error};
+use std::{borrow::BorrowMut, error::Error, usize};
 
 use miette::{miette, Result};
 
 use crate::{
     lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind},
     symbol::{
-        with_symbol_table, DirKind, DirectiveKind, InstrKind, LineOffs, Register, Span, SrcOffset,
-        Symbol, TrapKind, SYMBOL_TABLE,
+        with_symbol_table, DirKind, InstrKind, LineOffs, Register, Span, SrcOffset, Symbol,
+        TrapKind, SYMBOL_TABLE,
     },
 };
 
@@ -51,19 +51,23 @@ impl<'a> StrParser<'a> {
         }
     }
 
-    fn get_next(&self, n: usize) -> &str {
+    fn get_next_chars(&self, n: usize) -> &str {
         &self.src[self.pos..=(self.pos + n)]
     }
 
+    // TODO: bad bad bad bad bad
     fn peek_next(&self) -> LToken {
-        self.cur.clone().advance_token()
+        let mut cur = self.cur.clone();
+        let mut tok = cur.advance_token();
+        if tok.kind != LTokenKind::Whitespace {
+            return tok;
+        }
+        cur.advance_token()
     }
 
     pub fn proc_tokens(&mut self) -> Vec<Token> {
         // Iterate through, +1 to symbol count per inst
         // +len(str) for every string literal
-        // +number of lines for BLKW (need to process cringe inconsistent literals)
-        // Also need to do matching to process register and instruction tokens into the correct contents
         let mut toks_final: Vec<Token> = Vec::new();
         loop {
             let tok = self.cur.advance_token();
@@ -72,16 +76,17 @@ impl<'a> StrParser<'a> {
                 // Add identifier to symbol table with the correct line number
                 LTokenKind::Ident => {
                     // Process possibility of it being a trap
-                    if let Some(trap) = StrParser::trap(self.get_next(tok.len as usize)) {
+                    if let Some(trap_kind) = StrParser::trap(self.get_next_chars(tok.len as usize))
+                    {
                         self.line_num += 1;
                         Some(Token {
-                            kind: TokenKind::Trap(trap),
+                            kind: TokenKind::Trap(trap_kind),
                             span: Span::new(SrcOffset(self.pos), tok.len as usize),
                         })
                     } else {
                         // Add to symbol table as identifier
                         let idx = with_symbol_table(|sym| {
-                            let tok_text = self.get_next(tok.len as usize);
+                            let tok_text = self.get_next_chars(tok.len as usize);
                             sym.get_index_of(tok_text).unwrap_or(
                                 sym.insert_full(String::from(tok_text), self.line_num as u16)
                                     .0,
@@ -97,9 +102,25 @@ impl<'a> StrParser<'a> {
                 LTokenKind::Lit(_) => todo!(),
                 // Match on directive, check next value for number of lines skipped
                 LTokenKind::Direc => {
-                    if let Some(direc) = StrParser::direc(self.get_next(tok.len as usize)) {
+                    if let Some(dir_kind) = StrParser::direc(self.get_next_chars(tok.len as usize))
+                    {
+                        self.line_num += match dir_kind {
+                            // Blkw should increment line count by the following int literal
+                            // TODO: Check if not int literal
+                            DirKind::Blkw => self
+                                .get_next_chars(self.peek_next().len as usize)
+                                .parse::<usize>()
+                                .unwrap(),
+                            // Stringz should increment line count by the number of characters
+                            // in the string literal + null byte
+                            DirKind::Stringz => {
+                                // TODO: Check if not str literal
+                                (self.peek_next().len - 2) as usize
+                            }
+                            _ => 1,
+                        };
                         Some(Token {
-                            kind: TokenKind::Dir(direc),
+                            kind: TokenKind::Dir(dir_kind),
                             span: Span::new(SrcOffset(self.pos), tok.len as usize),
                         })
                     } else {
@@ -132,17 +153,18 @@ impl<'a> StrParser<'a> {
             _ => None,
         }
     }
-    pub fn direc(s: &str) -> Option<DirectiveKind> {
+
+    pub fn direc(s: &str) -> Option<DirKind> {
         match s.to_ascii_lowercase().as_str() {
-            ".alias" => Some(DirectiveKind::Alias),
-            ".macro" => Some(DirectiveKind::Macro),
-            ".orig" => Some(DirectiveKind::Orig),
-            ".end" => Some(DirectiveKind::End),
-            ".stringz" => Some(DirectiveKind::Stringz),
-            ".blkw" => Some(DirectiveKind::Blkw),
-            ".fill" => Some(DirectiveKind::Fill),
-            ".export" => Some(DirectiveKind::Export),
-            ".import" => Some(DirectiveKind::Import),
+            ".alias" => Some(DirKind::Alias),
+            ".macro" => Some(DirKind::Macro),
+            ".orig" => Some(DirKind::Orig),
+            ".end" => Some(DirKind::End),
+            ".stringz" => Some(DirKind::Stringz),
+            ".blkw" => Some(DirKind::Blkw),
+            ".fill" => Some(DirKind::Fill),
+            ".export" => Some(DirKind::Export),
+            ".import" => Some(DirKind::Import),
             _ => None,
         }
     }
diff --git a/src/symbol.rs b/src/symbol.rs
index e81cbfd..29247dd 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -90,7 +90,7 @@ pub enum TrapKind {
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum DirectiveKind {
+pub enum DirKind {
     Alias,
     Macro,
     Orig,

From f08f81facffdfa4537da8ff618516f808511fa55 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 18:40:37 +1000
Subject: [PATCH 14/17] Redo lexer

---
 flake.nix           |   1 +
 scratch/test.asm    |   7 +
 src/lexer/cursor.rs |  21 ++-
 src/lexer/mod.rs    | 203 +++++++++++++++++--------
 src/main.rs         |  39 +++--
 src/ops.rs          |   4 +-
 src/parser.rs       | 353 ++++++++++++++++++++------------------------
 src/symbol.rs       |  49 +++++-
 8 files changed, 396 insertions(+), 281 deletions(-)
 create mode 100644 scratch/test.asm

diff --git a/flake.nix b/flake.nix
index 7da3ee9..74b1de3 100644
--- a/flake.nix
+++ b/flake.nix
@@ -55,6 +55,7 @@
           name = "rust-dev";
           buildInputs = with pkgs; [
             _rustToolchain
+            rust-analyzer
           ];
 
           RUST_SRC_PATH = "${_rustToolchain}/lib/rustlib/src/rust/library";
diff --git a/scratch/test.asm b/scratch/test.asm
new file mode 100644
index 0000000..2352f14
--- /dev/null
+++ b/scratch/test.asm
@@ -0,0 +1,7 @@
+ahhh .orig x3000
+add R0, R0, #2; holllly shittt no wayyy
+add R0, R1, #-32568; waow
+add r1, r3, r4 r5 0x40
+ret
+labelthing .stringz "woaw omg \"epic\""
+           .stringz "okayyy"
diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
index db97b3e..11a6dc4 100644
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@@ -1,14 +1,16 @@
 //! Heavily inspired by and referenced from `rustc_lexer`, adapted to suit the project.
 //! See https://doc.rust-lang.org/beta/nightly-rustc/src/rustc_lexer/cursor.rs.html
 
-use std::str::Chars;
+use std::{ops::Range, str::Chars};
 
 /// Peekable iterator over a char sequence.
 #[derive(Clone)]
 pub struct Cursor<'a> {
     len_remaining: usize,
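+    /// Length of the full input, used to derive absolute positions.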
+    orig_size: usize,
     /// Iterator over chars in a &str
     chars: Chars<'a>,
+    input: &'a str,
 }
 
 pub(crate) const NULL_CHAR: char = '\0';
@@ -17,7 +19,9 @@ impl<'a> Cursor<'a> {
     pub fn new(input: &'a str) -> Cursor<'a> {
         Cursor {
             len_remaining: input.len(),
+            orig_size: input.len(),
             chars: input.chars(),
+            input,
         }
     }
 
@@ -41,10 +45,9 @@ impl<'a> Cursor<'a> {
         Some(c)
     }
 
-    /// Return consumed tokens
-    /// Basic counter that is reset after each token.
-    pub(crate) fn pos_in_token(&self) -> u32 {
-        (self.len_remaining - self.chars.as_str().len()) as u32
+    /// Return the number of chars consumed for the current token
+    pub(crate) fn pos_in_token(&self) -> usize {
+        self.len_remaining - self.chars.as_str().len()
     }
 
     /// Resets the number of consumed chars
@@ -66,4 +69,12 @@ impl<'a> Cursor<'a> {
     pub(crate) fn remaining(&self) -> usize {
         self.chars.as_str().len()
     }
+
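+    /// Absolute byte offset of the cursor from the start of the input,
+    /// including chars consumed within the current token.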
+    pub(crate) fn abs_pos(&self) -> usize {
+        self.orig_size - self.len_remaining + self.pos_in_token()
+    }
+
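+    /// Slice the original input by an absolute byte range.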
+    pub(crate) fn get_range(&self, range: Range<usize>) -> &str {
+        &self.input[range]
+    }
 }
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 7855d74..5f8ac77 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -1,53 +1,62 @@
-use lazy_static::lazy_static;
-use regex::Regex;
+use std::str::FromStr;
+
+use miette::{Result, bail, miette, LabeledSpan, Severity};
 
 use crate::lexer::cursor::Cursor;
-use crate::symbol::Register;
+use crate::symbol::{DirKind, InstrKind, Register, Span, SrcOffset, TrapKind};
 
 pub mod cursor;
 
-/// A 'light' token that only carries basic and easily derivable info
+/// A 'light' token that carries basic info and span
 #[derive(Debug)]
-pub struct LToken {
-    pub kind: LTokenKind,
-    pub len: u32,
+pub struct Token {
+    pub kind: TokenKind,
+    pub span: Span,
 }
 
-impl LToken {
-    pub fn new(kind: LTokenKind, len: u32) -> Self {
-        LToken { kind, len }
+impl Token {
+    pub fn new(kind: TokenKind, span: Span) -> Self {
+        Token { kind, span }
     }
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum LiteralKind {
-    Hex,
-    Dec,
-    Str { terminated: bool },
+    Hex(u16),
+    Dec(i16),
+    Str,
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum LTokenKind {
-    Ident,
+pub enum TokenKind {
+    Label,
+    Instr(InstrKind),
+    Trap(TrapKind),
     Lit(LiteralKind),
-    Comment,
-    Direc,
-    Reg,
+    Dir(DirKind),
+    Reg(Register),
     /// Also includes commas
     Whitespace,
     Unknown,
+    Comment,
     Eof,
 }
 
 /// Not actually used in parsing, more for debug purposes.
-pub fn tokenize(input: &str) -> impl Iterator<Item = LToken> + '_ {
+pub fn tokenize(input: &str) -> impl Iterator<Item = Result<Token>> + '_ {
     let mut cursor = Cursor::new(input);
     std::iter::from_fn(move || {
-        let token = cursor.advance_token();
-        if token.kind != LTokenKind::Eof {
-            Some(token)
-        } else {
-            None
+        loop {
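+            // Skip whitespace tokens; pass everything else through (including lex errors) until EOF.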
+            let token = cursor.advance_token();
+            if let Ok(inner) = &token {
+                if inner.kind == TokenKind::Whitespace {
+                    continue;
+                }
+                if inner.kind == TokenKind::Eof {
+                    return None;
+                }
+            }
+            return Some(token);
         }
     })
 }
@@ -71,24 +80,28 @@ pub(crate) fn is_id(c: char) -> bool {
 }
 
 impl Cursor<'_> {
-    pub fn advance_token(&mut self) -> LToken {
+    pub fn advance_token(&mut self) -> Result<Token> {
+        let start_pos = self.abs_pos();
         let first_char = match self.bump() {
             Some(c) => c,
-            None => return LToken::new(LTokenKind::Eof, 0),
+            None => return Ok(Token::new(TokenKind::Eof, Span::dummy())),
         };
         let token_kind = match first_char {
             ';' => {
                 self.take_while(|c| c != '\n');
-                LTokenKind::Comment
+                TokenKind::Comment
             }
             c if is_whitespace(c) => {
                 self.take_while(is_whitespace);
-                LTokenKind::Whitespace
+                TokenKind::Whitespace
             }
             // Hex literals
-            'x' | 'X' => self.hex(),
+            'x' | 'X' => self.hex()?,
             '0' => match self.first() {
-                'x' | 'X' => self.hex(),
+                'x' | 'X' => {
+                    self.bump();
+                    self.hex()?
+                },
                 _ => self.ident(),
             },
             'r' | 'R' => match self.first() {
@@ -96,7 +109,8 @@ impl Cursor<'_> {
                     self.take_while(is_reg_num);
                     // Registers are 2 characters long and followed by whitespace/comma
                     if self.pos_in_token() == 2 && is_whitespace(self.first()) {
-                        LTokenKind::Reg
+                        // Unwrap is safe as c is always valid.
+                        TokenKind::Reg(Register::from_str(&c.to_string()).unwrap())
                     } else {
                         self.ident()
                     }
@@ -106,43 +120,110 @@ impl Cursor<'_> {
             // Identifiers should be checked after everything else that overlaps.
             c if is_id(c) => self.ident(),
             // Decimal literal
-            '#' => {
-                if self.first() == '-' {
-                    self.bump();
-                }
-                self.take_while(|c| char::is_ascii_digit(&c));
-                LTokenKind::Lit(LiteralKind::Dec)
-            }
+            '#' => self.dec()?,
             // Directive
-            '.' => {
-                let check = self.take_n(3).to_ascii_lowercase();
-                self.take_while(is_id);
-                // Need to check for .end directive to avoid unnecessary parsing and errors
-                match (self.pos_in_token(), check.as_str()) {
-                    (3, "end") => LTokenKind::Eof,
-                    _ => LTokenKind::Direc,
-                }
-            }
+            // '.' => {
+            //     let check = self.take_n(3).to_ascii_lowercase();
+            //     self.take_while(is_id);
+            //     // Need to check for .end directive to avoid unnecessary parsing and errors
+            //     match (self.pos_in_token(), check.as_str()) {
+            //         (3, "end") => TokenKind::Eof,
+            //         _ => TokenKind::Dir,
+            //     }
+            // }
             // String literal
-            // TODO: Allow for escaped characters and the terminated thing
-            '"' => {
-                self.take_while(|c| c != '"');
-                LTokenKind::Lit(LiteralKind::Str { terminated: true })
-            }
-            _ => LTokenKind::Unknown,
+            '"' => self.string_literal()?,
+            _ => {
+                self.take_while(|c| !is_whitespace(c));
+                TokenKind::Unknown
+            },
         };
-        let res = LToken::new(token_kind, self.pos_in_token());
+        let res = Token::new(token_kind, Span::new(SrcOffset(start_pos), self.pos_in_token()));
         self.reset_pos();
-        res
+        Ok(res)
     }
 
-    fn ident(&mut self) -> LTokenKind {
-        self.take_while(|c| char::is_ascii_hexdigit(&c));
-        LTokenKind::Ident
+    fn ident(&mut self) -> TokenKind {
+        self.take_while(is_id);
+        TokenKind::Label
     }
 
-    fn hex(&mut self) -> LTokenKind {
-        self.take_while(|c| char::is_ascii_hexdigit(&c));
-        LTokenKind::Lit(LiteralKind::Hex)
+    fn hex(&mut self) -> Result<TokenKind> {
+        let start = self.abs_pos();
+        let prefix = self.pos_in_token();
+        self.take_while(|c| !is_whitespace(c));
+        let str_val = self.get_range(start..self.abs_pos());
+        let value = match u16::from_str_radix(str_val, 16) {
+            Ok(value) => value,
+            Err(e) => {
+                return Err(miette!(
+                    severity = Severity::Error,
+                    code = "parse::hex_lit",
+                    help = "only use digits 0-9 and letters a-f (either case).",
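+                    // `start - prefix` widens the label to cover the already-consumed "x"/"0x" prefix.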
+                    labels = vec![LabeledSpan::at(start - prefix..self.abs_pos(), "incorrect literal")],
+                    "Encountered an invalid hex literal: {e}",
+                ))
+            }
+        };
+
+        Ok(TokenKind::Lit(LiteralKind::Hex(value)))
+    }
+
+    fn dec(&mut self) -> Result<TokenKind> {
+        let start = self.abs_pos();
+        let prefix = self.pos_in_token();
+        // Consume an optional leading '-'; it remains part of `str_val`,
+        // so the i16 parse below handles the sign.
+        if self.first() == '-' {
+            self.bump();
+        }
+        // Take the numeric part
+        self.take_while(|c| char::is_ascii_digit(&c));
+        let str_val = self.get_range(start..self.abs_pos());
+
+        // Parse the string as an i16 to handle negative values
+        let value = match i16::from_str_radix(&str_val, 10) {
+            Ok(value) => value,
+            Err(e) => {
+                bail!(
+                    severity = Severity::Error,
+                    code = "parse::dec_lit",
+                    help = "LC3 literals are 16-bit; decimal values range from -32,768 to 32,767.",
+                    labels = vec![LabeledSpan::at(start - prefix..self.abs_pos(), "incorrect literal")],
+                    "Encountered an invalid decimal literal: {e}",
+                )
+            }
+        };
+
+        Ok(TokenKind::Lit(LiteralKind::Dec(value)))
+    }
+
+    fn string_literal(&mut self) -> Result<TokenKind> {
+        let start = self.abs_pos() - 1;
+        let mut terminated = false;
+        while let Some(c) = self.bump() {
+            if c == '\n' { break; }
+            if c == '"' {
+                terminated = true;
+                break;
+            }
+            // Skip the escaped character so an escaped quote doesn't terminate the literal
+            if c == '\\' {
+                self.bump();
+            }
+        }
+        if !terminated {
+            bail!(
+                severity = Severity::Error,
+                code = "parse::str_lit",
+                help = "hint: make sure to close string literals with a \" character.",
+                labels = vec![LabeledSpan::at(start..self.abs_pos(), "incorrect literal")],
+                "Encountered an unterminated string literal.",
+
+        )
+        }
+        Ok(TokenKind::Lit(LiteralKind::Str))
     }
 }
diff --git a/src/main.rs b/src/main.rs
index 6284a30..0f3c1cb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,12 +1,14 @@
 #![allow(unused)] // Remove later
 
 use std::fs;
+use std::ops::RangeBounds;
+use std::path::PathBuf;
 
 use clap::{Parser, Subcommand};
 use colored::Colorize;
-use lexer::{tokenize, LTokenKind};
-use miette::Result;
-use parser::AsmParser;
+use lexer::tokenize;
+use lexer::TokenKind;
+use miette::{Result, IntoDiagnostic};
 
 mod lexer;
 mod ops;
@@ -34,46 +36,50 @@ enum Command {
         #[arg(short, long)]
         os: bool,
         /// .asm file to run
-        name: String,
+        name: PathBuf,
     },
     /// Create binary `.lc3` file to run later or view compiled data
     Compile {
         /// `.asm` file to compile
-        name: String,
+        name: PathBuf,
         /// Destination to output .lc3 file
         dest: Option<String>,
     },
     /// Remove compilation artifacts for specified source
     Clean {
         /// `.asm` file to try remove artifacts for
-        name: String,
+        name: PathBuf,
     },
     /// Place a watch on a `.asm` file to receive constant assembler updates
     Watch {
         /// `.asm` file to watch
-        name: String,
+        name: PathBuf,
     },
     /// Format `.asm` file to adhere to recommended style
     Fmt {
         /// `.asm` file to format
-        name: String,
+        name: PathBuf,
     },
 }
 
-fn main() -> Result<()> {
+fn main() -> miette::Result<()> {
     let args = Args::parse();
 
     if let Some(command) = args.command {
         match command {
             Command::Run { os, name } => todo!(),
             Command::Compile { name, dest } => {
-                let file = fs::read_to_string(name).unwrap();
-                for tok in tokenize(&file).filter(|tok| tok.kind != LTokenKind::Whitespace) {
-                    println!("{:?}", tok);
+                let file = fs::read_to_string(name).into_diagnostic()?;
+                for tok in tokenize(&file) {
+                    let ok = match tok {
+                        Ok(ok) => ok,
+                        Err(err) => {
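+                            // Attach the source text so miette can render the labeled span.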
+                            return Err(err.with_source_code(file.clone()));
+                        }
+                    };
+                    println!("{:?}", ok);
+                    println!("{:?}", &file[ok.span.range()]);
                 }
-
-                let mut parse = AsmParser::from(file.as_str());
-                parse.parse()?;
                 Ok(())
             }
             Command::Clean { name } => todo!(),
@@ -88,7 +94,8 @@ fn main() -> Result<()> {
     }
 }
 
-const LOGO: &str = r#"      ..                                  
+const LOGO: &str = r#"
+      ..                                  
 x .d88"                                   
  5888R                                    
  '888R         u           .        .u    
diff --git a/src/ops.rs b/src/ops.rs
index 5ba4560..eefe6a7 100644
--- a/src/ops.rs
+++ b/src/ops.rs
@@ -1,5 +1,5 @@
 use crate::{
-    lexer::LToken,
+    lexer::Token,
     symbol::{Flag, Label, LineOffs, Register},
 };
 
@@ -69,7 +69,7 @@ pub enum Op {
         pc_offset9: u16,
     },
     Dir {
-        args: Option<Vec<LToken>>,
+        args: Option<Vec<Token>>,
     },
 }
 
diff --git a/src/parser.rs b/src/parser.rs
index f87b4f1..90b7c20 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -3,36 +3,13 @@ use std::{borrow::BorrowMut, error::Error, usize};
 use miette::{miette, Result};
 
 use crate::{
-    lexer::{cursor::Cursor, tokenize, LToken, LTokenKind, LiteralKind},
+    lexer::{cursor::Cursor, Token, TokenKind},
     symbol::{
-        with_symbol_table, DirKind, InstrKind, LineOffs, Register, Span, SrcOffset, Symbol,
-        TrapKind, SYMBOL_TABLE,
+        with_symbol_table, DirKind, Span, SrcOffset, Symbol, TrapKind,
     },
 };
 
-/// Token with full span info and proper types
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub struct Token {
-    kind: TokenKind,
-    span: Span,
-}
-
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub enum TokenKind {
-    /// `r0-r7 | R0-R7`
-    Reg(Register),
-    /// `LOOP_START`, `123`, `coolname`
-    Label(Symbol),
-    /// `.orig`, `.Stringz`, `.BLKW`
-    Dir(DirKind),
-    /// `PUTS`, `Trap`, `putc`
-    Trap(TrapKind),
-    /// `"hi\n"`, `0x3AB5F`, `#-1`
-    Lit(LiteralKind),
-    /// `add`, `JMP`, `Ret`
-    Inst(InstrKind),
-}
-
 /// Used to parse symbols and process exact instructions
 pub struct StrParser<'a> {
     src: &'a str,
@@ -56,90 +33,90 @@ impl<'a> StrParser<'a> {
     }
 
     // TODO: bad bad bad bad bad
-    fn peek_next(&self) -> LToken {
-        let mut cur = self.cur.clone();
-        let mut tok = cur.advance_token();
-        if tok.kind != LTokenKind::Whitespace {
-            return tok;
-        }
-        cur.advance_token()
-    }
-
-    pub fn proc_tokens(&mut self) -> Vec<Token> {
-        // Iterate through, +1 to symbol count per inst
-        // +len(str) for every string literal
-        let mut toks_final: Vec<Token> = Vec::new();
-        loop {
-            let tok = self.cur.advance_token();
-            if let Some(tok_final) = match tok.kind {
-                LTokenKind::Eof => break,
-                // Add identifier to symbol table with the correct line number
-                LTokenKind::Ident => {
-                    // Process possibility of it being a trap
-                    if let Some(trap_kind) = StrParser::trap(self.get_next_chars(tok.len as usize))
-                    {
-                        self.line_num += 1;
-                        Some(Token {
-                            kind: TokenKind::Trap(trap_kind),
-                            span: Span::new(SrcOffset(self.pos), tok.len as usize),
-                        })
-                    } else {
-                        // Add to symbol table as identifier
-                        let idx = with_symbol_table(|sym| {
-                            let tok_text = self.get_next_chars(tok.len as usize);
-                            sym.get_index_of(tok_text).unwrap_or(
-                                sym.insert_full(String::from(tok_text), self.line_num as u16)
-                                    .0,
-                            )
-                        });
-                        Some(Token {
-                            kind: TokenKind::Label(Symbol::from(idx)),
-                            span: Span::new(SrcOffset(self.pos), tok.len as usize),
-                        })
-                    }
-                }
-                // Create literal of correct value
-                LTokenKind::Lit(_) => todo!(),
-                // Match on directive, check next value for number of lines skipped
-                LTokenKind::Direc => {
-                    if let Some(dir_kind) = StrParser::direc(self.get_next_chars(tok.len as usize))
-                    {
-                        self.line_num += match dir_kind {
-                            // Blkw should increment line count by the following int literal
-                            // TODO: Check if not int literal
-                            DirKind::Blkw => self
-                                .get_next_chars(self.peek_next().len as usize)
-                                .parse::<usize>()
-                                .unwrap(),
-                            // Stringz should increment line count by the number of characters
-                            // in the string literal + null byte
-                            DirKind::Stringz => {
-                                // TODO: Check if not str literal
-                                (self.peek_next().len - 2) as usize
-                            }
-                            _ => 1,
-                        };
-                        Some(Token {
-                            kind: TokenKind::Dir(dir_kind),
-                            span: Span::new(SrcOffset(self.pos), tok.len as usize),
-                        })
-                    } else {
-                        // TODO: Error handling in a list
-                        todo!()
-                    }
-                }
-                // TODO: Add registers to lexer
-                LTokenKind::Reg => todo!(),
-                LTokenKind::Whitespace | LTokenKind::Comment => None,
-                // TODO: Should return list of errors eventually
-                LTokenKind::Unknown => todo!(),
-            } {
-                toks_final.push(tok_final);
-                self.pos += tok.len as usize;
-            }
-        }
-        toks_final
-    }
+    // fn peek_next(&self) -> Token {
+    //     let mut cur = self.cur.clone();
+    //     let mut tok = cur.advance_token();
+    //     if tok.kind != TokenKind::Whitespace {
+    //         return tok;
+    //     }
+    //     cur.advance_token()
+    // }
+
+    // pub fn proc_tokens(&mut self) -> Vec<Token> {
+    //     // Iterate through, +1 to symbol count per inst
+    //     // +len(str) for every string literal
+    //     let mut toks_final: Vec<Token> = Vec::new();
+    //     loop {
+    //         let tok = self.cur.advance_token();
+    //         if let Some(tok_final) = match tok.kind {
+    //             TokenKind::Eof => break,
+    //             // Add identifier to symbol table with the correct line number
+    //             TokenKind::Ident => {
+    //                 // Process possibility of it being a trap
+    //                 if let Some(trap_kind) = StrParser::trap(self.get_next_chars(tok.len as usize))
+    //                 {
+    //                     self.line_num += 1;
+    //                     Some(Token {
+    //                         kind: TokenKind::Trap(trap_kind),
+    //                         span: Span::new(SrcOffset(self.pos), tok.len as usize),
+    //                     })
+    //                 } else {
+    //                     // Add to symbol table as identifier
+    //                     let idx = with_symbol_table(|sym| {
+    //                         let tok_text = self.get_next_chars(tok.len as usize);
+    //                         sym.get_index_of(tok_text).unwrap_or(
+    //                             sym.insert_full(String::from(tok_text), self.line_num as u16)
+    //                                 .0,
+    //                         )
+    //                     });
+    //                     Some(Token {
+    //                         kind: TokenKind::Label(Symbol::from(idx)),
+    //                         span: Span::new(SrcOffset(self.pos), tok.len as usize),
+    //                     })
+    //                 }
+    //             }
+    //             // Create literal of correct value
+    //             TokenKind::Lit(_) => todo!(),
+    //             // Match on directive, check next value for number of lines skipped
+    //             TokenKind::Direc => {
+    //                 if let Some(dir_kind) = StrParser::direc(self.get_next_chars(tok.len as usize))
+    //                 {
+    //                     self.line_num += match dir_kind {
+    //                         // Blkw should increment line count by the following int literal
+    //                         // TODO: Check if not int literal
+    //                         DirKind::Blkw => self
+    //                             .get_next_chars(self.peek_next().len as usize)
+    //                             .parse::<usize>()
+    //                             .unwrap(),
+    //                         // Stringz should increment line count by the number of characters
+    //                         // in the string literal + null byte
+    //                         DirKind::Stringz => {
+    //                             // TODO: Check if not str literal
+    //                             (self.peek_next().len - 2) as usize
+    //                         }
+    //                         _ => 1,
+    //                     };
+    //                     Some(Token {
+    //                         kind: TokenKind::Dir(dir_kind),
+    //                         span: Span::new(SrcOffset(self.pos), tok.len as usize),
+    //                     })
+    //                 } else {
+    //                     // TODO: Error handling in a list
+    //                     todo!()
+    //                 }
+    //             }
+    //             // TODO: Add registers to lexer
+    //             TokenKind::Reg => todo!(),
+    //             TokenKind::Whitespace | TokenKind::Comment => None,
+    //             // TODO: Should return list of errors eventually
+    //             TokenKind::Unknown => todo!(),
+    //         } {
+    //             toks_final.push(tok_final);
+    //             self.pos += tok.len as usize;
+    //         }
+    //     }
+    //     toks_final
+    // }
 
     fn trap(s: &str) -> Option<TrapKind> {
         match s.to_ascii_lowercase().as_str() {
@@ -156,93 +133,89 @@ impl<'a> StrParser<'a> {
 
     pub fn direc(s: &str) -> Option<DirKind> {
         match s.to_ascii_lowercase().as_str() {
-            ".alias" => Some(DirKind::Alias),
-            ".macro" => Some(DirKind::Macro),
             ".orig" => Some(DirKind::Orig),
             ".end" => Some(DirKind::End),
             ".stringz" => Some(DirKind::Stringz),
             ".blkw" => Some(DirKind::Blkw),
             ".fill" => Some(DirKind::Fill),
-            ".export" => Some(DirKind::Export),
-            ".import" => Some(DirKind::Import),
             _ => None,
         }
     }
 }
 
-/// Transforms token stream into 'AST'
-pub struct AsmParser<'a> {
-    /// Reference to the source file
-    src: &'a str,
-    /// List of processed tokens
-    tok: Vec<Token>,
-    /// Used to parse tokens
-    cur: Cursor<'a>,
-}
-
-impl<'a> From<&'a str> for AsmParser<'a> {
-    fn from(src: &'a str) -> Self {
-        let tok: Vec<Token> = StrParser::new(src).proc_tokens();
-        AsmParser {
-            src,
-            tok,
-            cur: Cursor::new(src),
-        }
-    }
-}
-
-impl<'a> AsmParser<'a> {
-    pub fn parse(&mut self) -> Result<()> {
-        // First, check that there is an .orig directive with an appropriate value.
-        // Should emit error with a label to the first line stating "Expected memory init"
-        // Should be in a function that is also used to init the memory - the question is
-        // whether it should remain as a full directive or as a value that gets emitted afterwards.
-        let orig = self.expect(LTokenKind::Direc)?;
-        // Need ability to expect an enum without specifying a subcase (maybe ()?)
-        let addr = self.expect(LTokenKind::Lit(crate::lexer::LiteralKind::Hex));
-
-        // Following this, the structure is always:
-        // [label]
-        // ->   <inst> [args]
-        // OR
-        // <label>
-        // ->   <direc> [args]
-        // OR
-        // [label]
-        // ->*   <direc> <args>
-        // OR
-        // <trap> [arg]
-        // or: (sometimes opt label) num directives (opt argument)
-        // so should generally build to this structure. This means, however, that the complexity
-        // is not suuper high as there are really only two medium complexity subcases to parse.
-        //
-        // TODO: Split into LexToken and Token, to simplify the lexer and have a postprocessing
-        // step that can then put it into a Token format that is then easily transformed into
-        // the 'AST'.
-        //
-        // In order to do this, there needs to be peeking functionality on the token stream so
-        // that it can e.g. see if there is a label present at the start of a line.
-
-        Ok(())
-    }
-
-    pub fn expect(&mut self, kind: LTokenKind) -> Result<LToken> {
-        let tok = self.cur.advance_token();
-        if tok.kind == kind {
-            return Ok(tok);
-        }
-        Err(miette!(
-            "ParseError: expected token of type {:?}, found {:?}",
-            kind,
-            tok
-        ))
-    }
-
-    pub fn parse_direc(&self) {
-        todo!()
-    }
-
-    pub fn parse_op(&self) {
-        todo!()
-    }
-}
+// /// Transforms token stream into 'AST'
+// pub struct AsmParser<'a> {
+//     /// Reference to the source file
+//     src: &'a str,
+//     /// List of processed tokens
+//     tok: Vec<Token>,
+//     /// Used to parse tokens
+//     cur: Cursor<'a>,
+// }
+
+// impl<'a> From<&'a str> for AsmParser<'a> {
+//     fn from(src: &'a str) -> Self {
+//         let tok: Vec<Token> = StrParser::new(src).proc_tokens();
+//         AsmParser {
+//             src,
+//             tok,
+//             cur: Cursor::new(src),
+//         }
+//     }
+// }
+
+// impl<'a> AsmParser<'a> {
+//     pub fn parse(&mut self) -> Result<()> {
+//         // First, check that there is an .orig directive with an appropriate value.
+//         // Should emit error with a label to the first line stating "Expected memory init"
+//         // Should be in a function that is also used to init the memory - the question is
+//         // whether it should remain as a full directive or as a value that gets emitted afterwards.
+//         let orig = self.expect(LTokenKind::Direc)?;
+//         // Need ability to expect an enum without specifying a subcase (maybe ()?)
+//         let addr = self.expect(LTokenKind::Lit(crate::lexer::LiteralKind::Hex));
+
+//         // Following this, the structure is always:
+//         // [label]
+//         // ->   <inst> [args]
+//         // OR
+//         // <label>
+//         // ->   <direc> [args]
+//         // OR
+//         // [label]
+//         // ->*   <direc> <args>
+//         // OR
+//         // <trap> [arg]
+//         // or: (sometimes opt label) num directives (opt argument)
+//         // so should generally build to this structure. This means, however, that the complexity
+//         // is not suuper high as there are really only two medium complexity subcases to parse.
+//         //
+//         // TODO: Split into LexToken and Token, to simplify the lexer and have a postprocessing
+//         // step that can then put it into a Token format that is then easily transformed into
+//         // the 'AST'.
+//         //
+//         // In order to do this, there needs to be peeking functionality on the token stream so
+//         // that it can e.g. see if there is a label present at the start of a line.
+
+//         Ok(())
+//     }
+
+//     pub fn expect(&mut self, kind: LTokenKind) -> Result<LToken> {
+//         let tok = self.cur.advance_token();
+//         if tok.kind == kind {
+//             return Ok(tok);
+//         }
+//         Err(miette!(
+//             "ParseError: expected token of type {:?}, found {:?}",
+//             kind,
+//             tok
+//         ))
+//     }
+
+//     pub fn parse_direc(&self) {
+//         todo!()
+//     }
+
+//     pub fn parse_op(&self) {
+//         todo!()
+//     }
+// }
diff --git a/src/symbol.rs b/src/symbol.rs
index 29247dd..c9c97e3 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -1,4 +1,4 @@
-use std::cell::RefCell;
+use std::{cell::RefCell, ops::{Bound, Range, RangeBounds}, slice::SliceIndex, str::FromStr};
 
 use fxhash::FxBuildHasher;
 use indexmap::IndexMap;
@@ -31,19 +31,39 @@ impl From<usize> for Symbol {
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub struct Span {
     offs: SrcOffset,
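+    /// Exclusive end offset, cached as `offs + len`.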
+    end: SrcOffset,
     len: usize,
 }
 
 impl Span {
     pub fn new(offs: SrcOffset, len: usize) -> Self {
-        Span { offs, len }
+        Span { offs, len, end: SrcOffset(offs.0 + len) }
+    }
+
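+    /// Zero-width placeholder for tokens with no real location (e.g. EOF).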
+    pub fn dummy() -> Self {
+        Span { offs: SrcOffset(0), len: 0, end: SrcOffset(0) }
+    }
+
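+    /// Half-open byte range (`offs..end`) for slicing into the source.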
+    pub fn range(&self) -> Range<usize> {
+        self.offs.0..self.end.0
     }
 }
 
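+// Lets a `Span` be passed to APIs that accept `impl RangeBounds<usize>`.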
+impl RangeBounds<usize> for Span {
+    fn start_bound(&self) -> Bound<&usize> {
+        Bound::Included(&self.offs.0)
+    }
+
+    fn end_bound(&self) -> Bound<&usize> {
+        Bound::Excluded(&self.end.0)
+    }
+}
+
 /// Represents the CPU registers.
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
 pub enum Register {
-    R0,
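+    // Explicit discriminants match the LC3 3-bit register encoding.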
+    R0 = 0,
     R1,
     R2,
     R3,
@@ -54,6 +74,25 @@ pub enum Register {
     R7,
 }
 
+impl FromStr for Register {
+    type Err = ();
+
+    // Does not fail in this codebase.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "0" => Ok(Register::R0),
+            "1" => Ok(Register::R1),
+            "2" => Ok(Register::R2),
+            "3" => Ok(Register::R3),
+            "4" => Ok(Register::R4),
+            "5" => Ok(Register::R5),
+            "6" => Ok(Register::R6),
+            "7" => Ok(Register::R7),
+            _ => Err(()),
+        }
+    }
+}
+
 /// Set by a subset of instructions, representing whether the result was negative, zero, or positive.
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum Flag {
@@ -91,15 +130,11 @@ pub enum TrapKind {
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum DirKind {
-    Alias,
-    Macro,
     Orig,
     End,
     Stringz,
     Blkw,
     Fill,
-    Export,
-    Import,
 }
 
 /// Newtype representing an address inside the LC3 memory.

From f07f7338a4aeabfe5afa93119502c02ca909b427 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 20:33:47 +1000
Subject: [PATCH 15/17] Lexer instructions

---
 src/lexer/mod.rs | 102 +++++++++++++++++++++++++++++++++++++++++++----
 src/symbol.rs    |  14 +++++++
 2 files changed, 108 insertions(+), 8 deletions(-)

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 5f8ac77..b064803 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -3,7 +3,7 @@ use std::str::FromStr;
 use miette::{Result, bail, miette, LabeledSpan, Severity};
 
 use crate::lexer::cursor::Cursor;
-use crate::symbol::{DirKind, InstrKind, Register, Span, SrcOffset, TrapKind};
+use crate::symbol::{DirKind, Flag, InstrKind, Register, Span, SrcOffset, TrapKind};
 
 pub mod cursor;
 
@@ -143,11 +143,6 @@ impl Cursor<'_> {
         Ok(res)
     }
 
-    fn ident(&mut self) -> TokenKind {
-        self.take_while(is_id);
-        TokenKind::Label
-    }
-
     fn hex(&mut self) -> Result<TokenKind> {
         let start = self.abs_pos();
         let prefix = self.pos_in_token();
@@ -221,9 +216,100 @@ impl Cursor<'_> {
                 help = "hint: make sure to close string literals with a \" character.",
                 labels = vec![LabeledSpan::at(start..self.abs_pos(), "incorrect literal")],
                 "Encountered an unterminated string literal.",
-
-        )
+            )
         }
         Ok(TokenKind::Lit(LiteralKind::Str))
     }
+
+    fn directive(&mut self) -> Result<TokenKind> {
+        // Account for starting .
+        let start = self.abs_pos() - 1;
+        self.take_while(is_id);
+        let dir = self.get_range(start..self.abs_pos()).to_ascii_lowercase();
+    }
+
+    fn ident(&mut self) -> Result<TokenKind> {
+        let mut token_kind = TokenKind::Label;
+        let ident_start = self.abs_pos();
+        self.take_while(is_id);
+        let ident = self.get_range(ident_start..self.abs_pos()).to_ascii_lowercase();
+
+        // This actually needs to be in its own function :/
+        if ident.starts_with('.') {
+            token_kind = self.check_directive(&ident[1..]);
+            if token_kind == TokenKind::Unknown {
+                bail!(
+                    severity = Severity::Error,
+                    code = "parse::dir",
+                    help = "hint: check the list of available directives in the documentation.",
+                    labels = vec![LabeledSpan::at(ident_start..self.abs_pos(), "incorrect literal")],
+                    "Encountered an invalid directive.",
+                )
+            }
+        } else {
+            token_kind = self.check_instruction(&ident); 
+
+            // If not an instruction, check if it's a trap
+            if token_kind == TokenKind::Label { 
+                token_kind = self.check_trap(&ident);
+            }
+        }
+
+        Ok(token_kind)
+    }
+
+    fn check_directive(&self, dir_str: &str) -> TokenKind {
+        match dir_str {
+            "orig" => TokenKind::Dir(DirKind::Orig),
+            "end" => TokenKind::Dir(DirKind::End),
+            "stringz" => TokenKind::Dir(DirKind::Stringz),
+            "blkw" => TokenKind::Dir(DirKind::Blkw),
+            "fill" => TokenKind::Dir(DirKind::Fill),
+            // Not a directive
+            _ => TokenKind::Unknown,
+        }
+    }
+
+    // Should learn how to write macros tbh :)
+    fn check_instruction(&self, ident: &str) -> TokenKind {
+        match ident {
+            "add" => TokenKind::Instr(InstrKind::Add),
+            "and" => TokenKind::Instr(InstrKind::And),
+            "brnzp" => TokenKind::Instr(InstrKind::Br(Flag::Nzp)),
+            "brnz" => TokenKind::Instr(InstrKind::Br(Flag::Nz)),
+            "brzp" => TokenKind::Instr(InstrKind::Br(Flag::Zp)),
+            "brnp" => TokenKind::Instr(InstrKind::Br(Flag::Np)),
+            "brn" => TokenKind::Instr(InstrKind::Br(Flag::N)),
+            "brz" => TokenKind::Instr(InstrKind::Br(Flag::Z)),
+            "brp" => TokenKind::Instr(InstrKind::Br(Flag::P)),
+            "jmp" => TokenKind::Instr(InstrKind::Jmp),
+            "jsr" => TokenKind::Instr(InstrKind::Jsr),
+            "jsrr" => TokenKind::Instr(InstrKind::Jsrr),
+            "ld" => TokenKind::Instr(InstrKind::Ld),
+            "ldi" => TokenKind::Instr(InstrKind::Ldi),
+            "ldr" => TokenKind::Instr(InstrKind::Ldr),
+            "lea" => TokenKind::Instr(InstrKind::Lea),
+            "not" => TokenKind::Instr(InstrKind::Not),
+            "ret" => TokenKind::Instr(InstrKind::Ret),
+            "rti" => TokenKind::Instr(InstrKind::Rti),
+            "st" => TokenKind::Instr(InstrKind::St),
+            "sti" => TokenKind::Instr(InstrKind::Sti),
+            // Not an instruction
+            _ => TokenKind::Label,
+        }
+    }
+
+    fn check_trap(&self, ident: &str) -> TokenKind {
+        match ident {
+            "getc" => TokenKind::Trap(TrapKind::Getc),
+            "out" => TokenKind::Trap(TrapKind::Out),
+            "puts" => TokenKind::Trap(TrapKind::Puts),
+            "in" => TokenKind::Trap(TrapKind::In),
+            "putsp" => TokenKind::Trap(TrapKind::Putsp),
+            "halt" => TokenKind::Trap(TrapKind::Halt),
+            "trap" => TokenKind::Trap(TrapKind::Generic),
+            // Not a trap
+            _ => TokenKind::Label,
+        }
+    }
 }
diff --git a/src/symbol.rs b/src/symbol.rs
index c9c97e3..14959af 100644
--- a/src/symbol.rs
+++ b/src/symbol.rs
@@ -115,6 +115,20 @@ pub enum Flag {
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum InstrKind {
     Add,
+    And,
+    Br(Flag),
+    Jmp,
+    Jsr,
+    Jsrr,
+    Ld,
+    Ldi,
+    Ldr,
+    Lea,
+    Not,
+    Ret,
+    Rti,
+    St,
+    Sti,
+    Str,
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]

From 8295432206a110179a702f3c0da6fb3ab70625f1 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Thu, 29 Aug 2024 11:23:22 +1000
Subject: [PATCH 16/17] Finish lexer

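A usage sketch of the finished lexer (mirrors the loop in src/main.rs;
`tokenize` yields `Result<Token>`, so errors can be reported with the
source attached):

    let src = std::fs::read_to_string("scratch/test.asm").unwrap();
    for tok in tokenize(&src) {
        match tok {
            Ok(tok) => println!("{:?} -> {:?}", tok.kind, &src[tok.span.range()]),
            Err(err) => eprintln!("{:?}", err.with_source_code(src.clone())),
        }
    }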
---
 src/lexer/mod.rs | 167 +++++++++++++++++++++++++++++++++++------------
 src/main.rs      |   4 +-
 2 files changed, 127 insertions(+), 44 deletions(-)

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index b064803..7f1d4f4 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -7,10 +7,12 @@ use crate::symbol::{DirKind, Flag, InstrKind, Register, Span, SrcOffset, TrapKin
 
 pub mod cursor;
 
-/// A 'light' token that carries basic info and span
+/// Carries all literal info alongside span location inside source code.
 #[derive(Debug)]
 pub struct Token {
+    /// Lexed token kind, with literal values contained as part of the enum.
     pub kind: TokenKind,
+    /// Span pointing at the location of the token in the source.
     pub span: Span,
 }
 
@@ -22,8 +24,11 @@ impl Token {
 
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum LiteralKind {
+    /// 0x3000, xFFFF, x123
     Hex(u16),
+    /// #-1, #32456
     Dec(i16),
+    /// "str with \" escaped chars"
     Str,
 }
 
@@ -87,10 +92,12 @@ impl Cursor<'_> {
             None => return Ok(Token::new(TokenKind::Eof, Span::dummy())),
         };
         let token_kind = match first_char {
+            // Comment
             ';' => {
                 self.take_while(|c| c != '\n');
                 TokenKind::Comment
             }
+            // Whitespace
             c if is_whitespace(c) => {
                 self.take_while(is_whitespace);
                 TokenKind::Whitespace
@@ -104,6 +111,7 @@ impl Cursor<'_> {
                 },
                 _ => self.ident(),
             },
+            // Register literal
             'r' | 'R' => match self.first() {
                 c if is_reg_num(c) => {
                     self.take_while(is_reg_num);
@@ -122,17 +130,10 @@ impl Cursor<'_> {
             // Decimal literal
             '#' => self.dec()?,
             // Directive
-            // '.' => {
-            //     let check = self.take_n(3).to_ascii_lowercase();
-            //     self.take_while(is_id);
-            //     // Need to check for .end directive to avoid unnecessary parsing and errors
-            //     match (self.pos_in_token(), check.as_str()) {
-            //         (3, "end") => TokenKind::Eof,
-            //         _ => TokenKind::Dir,
-            //     }
-            // }
+            '.' => self.dir()?,
             // String literal
-            '"' => self.string_literal()?,
+            '"' => self.str()?,
+            // Unknown starting characters
             _ => {
                 self.take_while(|c| !is_whitespace(c));
                 TokenKind::Unknown
@@ -195,7 +196,7 @@ impl Cursor<'_> {
         Ok(TokenKind::Lit(LiteralKind::Dec(value)))
     }
 
-    fn string_literal(&mut self) -> Result<TokenKind> {
+    fn str(&mut self) -> Result<TokenKind> {
         let start = self.abs_pos() - 1;
         let mut terminated = false;
         while let Some(c) = self.bump() {
@@ -221,56 +222,55 @@ impl Cursor<'_> {
         Ok(TokenKind::Lit(LiteralKind::Str))
     }
 
-    fn directive(&mut self) -> Result<TokenKind> {
+    fn dir(&mut self) -> Result<TokenKind> {
         // Account for starting .
         let start = self.abs_pos() - 1;
         self.take_while(is_id);
         let dir = self.get_range(start..self.abs_pos()).to_ascii_lowercase();
+
+        if let Some(token_kind) = self.check_directive(&dir) {
+            Ok(token_kind)
+        } else {
+            bail!(
+                severity = Severity::Error,
+                code = "parse::dir",
+                help = "hint: check the list of available directives in the documentation.",
+                labels = vec![LabeledSpan::at(start..self.abs_pos(), "incorrect literal")],
+                "Encountered an invalid directive.",
+            )
+        }
     }
 
-    fn ident(&mut self) -> Result<TokenKind> {
+    fn ident(&mut self) -> TokenKind {
         let mut token_kind = TokenKind::Label;
-        let ident_start = self.abs_pos();
+        let ident_start = self.abs_pos() - 1;
         self.take_while(is_id);
         let ident = self.get_range(ident_start..self.abs_pos()).to_ascii_lowercase();
 
-        // This actually needs to be in its own function :/
-        if ident.starts_with('.') {
-            token_kind = self.check_directive(&ident[1..]);
-            if token_kind == TokenKind::Unknown {
-                bail!(
-                    severity = Severity::Error,
-                    code = "parse::dir",
-                    help = "hint: check the list of available directives in the documentation.",
-                    labels = vec![LabeledSpan::at(ident_start..self.abs_pos(), "incorrect literal")],
-                    "Encountered an invalid directive.",
-                )
-            }
-        } else {
-            token_kind = self.check_instruction(&ident); 
-
-            // If not an instruction, check if it's a trap
-            if token_kind == TokenKind::Label { 
-                token_kind = self.check_trap(&ident);
-            }
+        token_kind = self.check_instruction(&ident);
+        // If not an instruction, check if it's a trap
+        if token_kind == TokenKind::Label {
+            token_kind = self.check_trap(&ident);
         }
 
-        Ok(token_kind)
+        token_kind
     }
 
-    fn check_directive(&self, dir_str: &str) -> TokenKind {
+    /// Expects lowercase
+    fn check_directive(&self, dir_str: &str) -> Option<TokenKind> {
         match dir_str {
-            "orig" => TokenKind::Dir(DirKind::Orig),
-            "end" => TokenKind::Dir(DirKind::End),
-            "stringz" => TokenKind::Dir(DirKind::Stringz),
-            "blkw" => TokenKind::Dir(DirKind::Blkw),
-            "fill" => TokenKind::Dir(DirKind::Fill),
+            ".orig" => Some(TokenKind::Dir(DirKind::Orig)),
+            ".end" => Some(TokenKind::Dir(DirKind::End)),
+            ".stringz" => Some(TokenKind::Dir(DirKind::Stringz)),
+            ".blkw" => Some(TokenKind::Dir(DirKind::Blkw)),
+            ".fill" => Some(TokenKind::Dir(DirKind::Fill)),
             // Not a directive
-            _ => TokenKind::Unknown,
+            _ => None,
         }
     }
 
     // Should learn how to write macros tbh :)
+    /// Expects lowercase
     fn check_instruction(&self, ident: &str) -> TokenKind {
         match ident {
             "add" => TokenKind::Instr(InstrKind::Add),
@@ -299,6 +299,7 @@ impl Cursor<'_> {
         }
     }
 
+    /// Expects lowercase
     fn check_trap(&self, ident: &str) -> TokenKind {
         match ident {
             "getc" => TokenKind::Trap(TrapKind::Getc),
@@ -313,3 +314,86 @@ impl Cursor<'_> {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::lexer::{LiteralKind, TokenKind};
+
+    use super::cursor::Cursor;
+
+    // HEX LIT TESTS
+
+    #[test]
+    fn hex_correct_value() {
+        let mut lex = Cursor::new("0x1234");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Hex(0x1234)))
+    }
+
+    #[test]
+    fn hex_too_large() {
+        let mut lex = Cursor::new("xFFFF x10000");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Hex(0xFFFF)));
+        // Skip the whitespace token between the two literals
+        let _ = lex.advance_token().unwrap();
+        assert!(lex.advance_token().is_err());
+    }
+
+    #[test]
+    fn hex_leading_0() {
+        let mut lex = Cursor::new("0x3000");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Hex(0x3000)))
+    }
+
+    // DEC LIT TESTS
+
+    #[test]
+    fn dec_correct_value() {
+        let mut lex = Cursor::new("#32412");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Dec(32412)))
+    }
+
+    #[test]
+    fn dec_negative_value() {
+        let mut lex = Cursor::new("#-300");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Dec(-300)))
+    }
+
+    #[test]
+    fn dec_too_small() {
+        let mut lex = Cursor::new("#-32768 #-32769");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Dec(-32768)));
+        // Skip the whitespace token between the two literals
+        let _ = lex.advance_token().unwrap();
+        assert!(lex.advance_token().is_err());
+    }
+
+    #[test]
+    fn dec_too_large() {
+        let mut lex = Cursor::new("#32767 #32768");
+        let res = lex.advance_token().unwrap();
+        assert!(res.kind == TokenKind::Lit(LiteralKind::Dec(32767)));
+        // Skip the whitespace token between the two literals
+        let _ = lex.advance_token().unwrap();
+        assert!(lex.advance_token().is_err());
+    }
+
+    // STR LIT TESTS
+
+    #[test]
+    fn str_unterminated() {
+        let mut lex = Cursor::new(r#""unterminated"#);
+        assert!(lex.advance_token().is_err())
+    }
+
+    #[test]
+    fn str_escaped() {
+        let mut lex = Cursor::new(r#"there is an escaped \" in this str\n"#);
+        assert!(lex.advance_token().is_ok())
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 0f3c1cb..3c4dfd0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -77,8 +77,8 @@ fn main() -> miette::Result<()> {
                             return Err(err.with_source_code(file.clone()));
                         }
                     };
-                    println!("{:?}", ok);
-                    println!("{:?}", &file[ok.span.range()]);
+                    print!("{:?} ", ok.kind);
+                    println!("{}", &file[ok.span.range()]);
                 }
                 Ok(())
             }

From f92b81fefb0e07b84bf7fb21009965327ea13927 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Thu, 29 Aug 2024 11:26:18 +1000
Subject: [PATCH 17/17] Remove accidental file

---
 scratch/test.asm | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 scratch/test.asm

diff --git a/scratch/test.asm b/scratch/test.asm
deleted file mode 100644
index 2352f14..0000000
--- a/scratch/test.asm
+++ /dev/null
@@ -1,7 +0,0 @@
-ahhh .orig x3000
-add R0, R0, #2; holllly shittt no wayyy
-add R0, R1, #-32568; waow
-add r1, r3, r4 r5 0x40
-ret
-labelthing .stringz "woaw omg \"epic\""
-           .stringz "okayyy"