/* Mycelium Scheme
 * Copyright (C) 2025 Ava Affine
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

use core::fmt::Display;

use crate::lexer::{
    LexError, LexToken, LexTokenType, Lexer,
    E_CHAR_TOO_LONG, E_END_OF_DOCUMENT
};
use crate::number::{Number, Numeric};
use crate::sexpr::{Datum, Ast};

use alloc::vec::Vec;
use alloc::vec;
use alloc::rc::Rc;
use alloc::string::String;

pub const E_LEX_ERROR: &str = "error in lexing document";
pub const E_EXTRA_CLOSE: &str = "closing parenthesis closes nothing";
pub const E_TERRIBLE: &str = "something has gone terribly wrong....";
pub const E_VECTOR_DOT: &str = "dotted notation not valid in vectors";
pub const E_DOT_NO_LIST: &str = "dotted notation used outside of list";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_CHAR_HEX_PARSE: &str = "hexadecimal character literal failed to parse";
pub const E_COLLECTION_TRUNC: &str = "collection is truncated";
pub const E_BV_BADBYTE: &str = "number provided is not a real byte";
pub const E_BV_NONBYTE: &str = "bytevector elements must all be bytes";
pub const E_TOO_MANY_DOT: &str = "valid dot notation only includes one dot";
pub const E_DOT_IDX: &str = "dot should preceed only last element in list";
pub const E_DOT_EMPTY: &str = "cannot apply dotted notation to otherwise empty list";
pub const E_UNQUOTE_NONQQ: &str = "unquote must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_NONQQ: &str =
    "unquote-splicing must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_COLL: &str =
    "expected list or vector after unquote-splicing";

/* ParseError
 * 0: error string
 * 1: either problematic lexing token, or a lexing error
 */
#[derive(Clone)]
pub struct ParseError(pub &'static str, pub Option<Result<LexToken, LexError>>);

impl Display for ParseError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        /* backtrack from the token start until we hit either
         * - beginning of line
         * - 25 characters ago
         * - the doc start */
        let err_snippet_start = |t: &LexToken| -> usize {
            if t.source_doc.len() < 25 {
                return 0;
            }
            let mut idx = t.start_idx;
            // `idx > 0` guards against underflow when the token sits
            // within the first 25 characters of the document.
            while idx > 0 && t.start_idx - idx < 25 {
                idx -= 1;
                // '\n' is a single byte, so a byte compare is a safe way
                // to spot a line break at this position.
                if t.source_doc.as_bytes()[idx] == b'\n' {
                    idx += 1; // snippet begins just after the newline
                    break;
                }
            }
            idx
        };
        /* read through the document until we hit either
         * - end of line
         * - 25 characters forward
         * - the doc end */
        let err_snippet_end = |t: &LexToken| -> usize {
            if t.source_doc.len() - t.end_idx < 25 {
                return t.source_doc.len();
            }
            let mut idx = t.end_idx;
            while idx - t.end_idx < 25 {
                idx += 1;
                // `get` keeps the probe in bounds at the document's end.
                if t.source_doc.as_bytes().get(idx) == Some(&b'\n') {
                    break;
                }
            }
            idx
        };
        if let Some(frag) = &self.1 {
            match frag {
                Ok(token) => {
                    write!(f, "Error parsing syntax: {}\n", self.0)?;
                    write!(f, " problematic token: {}\n",
                        &token.source_doc[token.start_idx..token.end_idx])?;
                    // Return here so the header line is not printed twice.
                    return write!(f, " {}\n",
                        &token.source_doc[
                            err_snippet_start(token)..err_snippet_end(token)]);
                },
                Err(e) => {
                    return e.fmt(f);
                }
            }
        }
        write!(f, "Error parsing syntax: {}\n", self.0)
    }
}

/* The parser wraps the lexer, holding the first error it runs
 * into, any data parsed ahead of demand, and whether the
 * current position sits inside a quasiquoted form. */
pub struct Parser {
    lexer: Lexer,
    pub has_error_state: Option<ParseError>,
    delayed: Vec<Rc<Datum>>,
    quasiquoted: bool,
}

/* The From and Iterator traits serve as the primary
 * interface to work with the parser. It is expected to
 * make a Lexer first, and then use casting or type conv
 * to make it into a parser and then a final AST, which
 * we can then convert into a VM image once the compile
 * step is finished.
*/ impl From for Parser { fn from(l: Lexer) -> Parser { Parser { lexer: l, has_error_state: None, delayed: vec![], quasiquoted: false } } } impl Iterator for Parser { type Item = Rc; fn next(&mut self) -> Option { if self.has_error_state.is_some() { return None; } if self.delayed.len() > 0 { return self.delayed.pop() } let res = self.get_next_datum(); if let Err(ref e) = res { self.has_error_state = Some(e.clone()); } return res.ok() } } fn read_number(token: LexToken) -> Result { return match (&token.source_doc[token.start_idx..token.end_idx]).parse::() { Ok(num) => Ok(num), Err(e) => Err(ParseError(e, Some(Ok(token)))), } } fn read_char(token: LexToken) -> Result { if token.end_idx - token.start_idx < 2 { return Err(ParseError(E_CHAR_TRUNCATED, Some(Ok(token)))) } match &token.source_doc[token.start_idx + 2..token.end_idx] { "alarm" => Ok(7), "backspace" => Ok(8), "delete" => Ok(127), "escape" => Ok(33), "newline" => Ok('\n' as u8), "null" => Ok(0), "return" => Ok(13), "space" => Ok(32), "tab" => Ok(11), _ if token.source_doc[token.start_idx + 2..].starts_with('x') && token.end_idx - token.start_idx > 2 => { if token.end_idx - token.start_idx > 5 { return Err(ParseError(E_CHAR_TOO_LONG, Some(Ok(token)))) } match u8::from_str_radix( &token.source_doc[token.start_idx + 3..token.end_idx], 16) { Ok(u) => Ok(u), Err(_) => Err(ParseError(E_CHAR_HEX_PARSE, Some(Ok(token)))) } }, _ => Ok(token.source_doc.as_bytes()[token.start_idx + 2]) } } fn read_bool(token: LexToken) -> bool { match &token.source_doc[token.start_idx..token.end_idx] { "#t" => true, "#f" => false, _ => panic!("impossible boolean") } } fn read_string(token: LexToken) -> Vec { if token.end_idx - token.start_idx < 3 { // empty string other than delimiters Vec::default() } else { token.source_doc[token.start_idx + 1..token.end_idx - 1] .as_bytes() .to_vec() } } impl Parser { /* Rules we must mind: * 0. at this stage, drop and ignore comments, directives * 1. 
quote, quasiquote, unquote, and unquote splicing * all require another input after them (excluding * collection end) * 2. unquote-splicing explicitly requires a form I think? * (verify) * 3. vectors, lists, may have nested collections in them * so track collection state in the parser's stack. * 4. list dotted notation needs next datum put in cdr. * 5. bytevectors can only have numbers from 0-255 in them. */ fn complete_quote(&mut self) -> Result, ParseError> { let next = self.get_next_datum()?; Ok(Rc::from(Datum::List(Rc::from(Ast( Rc::from(Datum::Symbol(String::from("quote"))), Rc::from(Datum::List(Rc::from(Ast( next, Rc::from(Datum::None) )))) ))))) } fn complete_unquote_splicing(&mut self, tok: LexToken) -> Result, ParseError> { let next = self.get_next_datum()?; match *next { Datum::List(_) | Datum::Vector(_) | Datum::Symbol(_) => (), _ => return Err(ParseError(E_UNQUOTE_SPL_COLL, Some(Ok(tok)))) } Ok(Rc::from(Datum::List(Rc::from(Ast( Rc::from(Datum::Symbol(String::from("unquote-splicing"))), Rc::from(Datum::List(Rc::from(Ast( next, Rc::from(Datum::None) )))) ))))) } fn complete_unquote(&mut self) -> Result, ParseError> { let next = self.get_next_datum()?; Ok(Rc::from(Datum::List(Rc::from(Ast( Rc::from(Datum::Symbol(String::from("unquote"))), Rc::from(Datum::List(Rc::from(Ast( next, Rc::from(Datum::None) )))) ))))) } fn complete_quasiquote(&mut self) -> Result, ParseError> { let prev = self.quasiquoted; // handle nesting appropriately self.quasiquoted = true; let next = self.get_next_datum()?; self.quasiquoted = prev; Ok(Rc::from(Datum::List(Rc::from(Ast( Rc::from(Datum::Symbol(String::from("quasiquote"))), Rc::from(Datum::List(Rc::from(Ast( next, Rc::from(Datum::None) )))) ))))) } fn complete_collection(&mut self, token: LexToken) -> Result, ParseError> { let is_bv = match token.token_type { LexTokenType::ByteVectorStart => true, _ => false, }; let mut lex_stack = vec![]; let mut bv_stack = vec![]; /* counting indexes helps greatly with calculating position 
dependent
         * syntax rules like dot notation in lists */
        let mut iter_count = 0;
        // (dot position, dot token, captured cdr datum)
        let mut dot_idx = (None, None, None);
        loop {
            let next_tok = self.lexer.next();
            if let None = next_tok {
                return Err(ParseError(E_COLLECTION_TRUNC, None))
            }
            let tok = next_tok.unwrap();
            match tok.token_type {
                // Universal cases
                LexTokenType::Comment | LexTokenType::Directive => continue,
                LexTokenType::NumTypes =>
                    return Err(ParseError(E_TERRIBLE, Some(Ok(tok)))),
                // note: unquote gets the unquote error and splice gets the
                // splice error, matching get_next_datum below
                LexTokenType::Unquote if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(tok)))),
                LexTokenType::UnquoteSplice if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(tok)))),
                // CollectionEnd must take precedence over the dot notation case
                LexTokenType::CollectionEnd => break,
                // The cdr sits at idx + 1; any non-dot token after that is
                // misplaced. A second dot is excluded here so it still
                // reaches the E_TOO_MANY_DOT arm below.
                _ if tok.token_type != LexTokenType::Dot
                    && let Some(idx) = dot_idx.0
                    && iter_count - idx > 1 =>
                    return Err(ParseError(E_DOT_IDX, Some(Ok(dot_idx.1.unwrap())))),
                LexTokenType::Dot if token.token_type != LexTokenType::ListStart =>
                    return Err(ParseError(E_VECTOR_DOT, Some(Ok(tok)))),
                // List, Vector cases
                LexTokenType::ListStart | LexTokenType::VectorStart
                | LexTokenType::ByteVectorStart if !is_bv =>
                    lex_stack.push(self.complete_collection(tok)?),
                LexTokenType::String if !is_bv =>
                    lex_stack.push(Rc::from(Datum::String(read_string(tok)))),
                LexTokenType::Number if !is_bv =>
                    lex_stack.push(Rc::from(Datum::Number(read_number(tok)?))),
                LexTokenType::Char if !is_bv =>
                    lex_stack.push(Rc::from(Datum::Char(read_char(tok)?))),
                LexTokenType::Boolean if !is_bv =>
                    lex_stack.push(Rc::from(Datum::Bool(read_bool(tok)))),
                LexTokenType::Symbol if !is_bv =>
                    lex_stack.push(Rc::from(Datum::Symbol(
                        String::from(&tok.source_doc[tok.start_idx..tok.end_idx])))),
                LexTokenType::Quote if !is_bv =>
                    lex_stack.push(self.complete_quote()?),
                LexTokenType::QuasiQuote if !is_bv =>
                    lex_stack.push(self.complete_quasiquote()?),
                LexTokenType::Unquote if !is_bv && self.quasiquoted =>
                    lex_stack.push(self.complete_unquote()?),
                LexTokenType::UnquoteSplice if !is_bv && self.quasiquoted =>
lex_stack.push(self.complete_unquote_splicing(tok)?), // List only cases LexTokenType::Dot => if let Some(_) = dot_idx.0 { return Err(ParseError(E_TOO_MANY_DOT, Some(Ok(tok)))) } else { dot_idx = (Some(iter_count), Some(tok), None) }, // ByteVector cases LexTokenType::Number if is_bv => { let n = read_number(tok.clone())? .make_inexact(); if n.0 < 0.0 || n.0 > 255.0 || n.0.fract() != 0.0 { return Err(ParseError(E_BV_BADBYTE, Some(Ok(tok)))) } bv_stack.push(n.0 as u8); }, _ if is_bv => return Err(ParseError(E_BV_NONBYTE, Some(Ok(tok)))), // This should never get touched _ => todo!("theoretically impossible case in parser::complete_collection"), } if let Some(idx) = dot_idx.0 && iter_count == idx + 1 { dot_idx.2 = Some(lex_stack.pop()); } iter_count += 1; } if is_bv { return Ok(Rc::from(Datum::ByteVector(bv_stack))) } if token.token_type == LexTokenType::VectorStart { return Ok(Rc::from(Datum::Vector(lex_stack))) } // handle an empty list if lex_stack.len() < 1 { // dont try to do something like "( . 
'thing)" if let (_, Some(node), _) = dot_idx { return Err(ParseError(E_DOT_EMPTY, Some(Ok(node)))) } return Ok(Rc::from(Datum::List(Rc::from(Ast(Rc::from(Datum::None), Rc::from(Datum::None)))))) } let mut from_rear: Rc; if let (_, _, Some(node)) = dot_idx { from_rear = Rc::from(Ast(lex_stack.pop().unwrap(), node.unwrap())); } else { from_rear = Rc::from(Ast(lex_stack.pop().unwrap(), Rc::from(Datum::None))); } lex_stack.iter() .rev() .for_each(|x| { from_rear = Rc::from(Ast(x.clone(), Rc::from(Datum::List(from_rear.clone())))); }); Ok(Rc::from(Datum::List(from_rear))) } fn get_next_datum(&mut self) -> Result, ParseError> { if let Some(token) = self.lexer.next() { match token.token_type { // normal paths: LexTokenType::String => Ok(Rc::from(Datum::String(read_string(token)))), LexTokenType::Number => Ok(Rc::from(Datum::Number(read_number(token)?))), LexTokenType::Char => Ok(Rc::from(Datum::Char(read_char(token)?))), LexTokenType::Symbol => Ok(Rc::from(Datum::Symbol(String::from( &token.source_doc[token.start_idx..token.end_idx])))), LexTokenType::Boolean => Ok(Rc::from(Datum::Bool(read_bool(token)))), LexTokenType::VectorStart | LexTokenType::ListStart | LexTokenType::ByteVectorStart => self.complete_collection(token), LexTokenType::Quote => self.complete_quote(), LexTokenType::QuasiQuote => self.complete_quasiquote(), LexTokenType::Unquote if self.quasiquoted => self.complete_unquote(), LexTokenType::UnquoteSplice if self.quasiquoted => self.complete_unquote_splicing(token), // immediate errors: LexTokenType::CollectionEnd => Err(ParseError(E_EXTRA_CLOSE, Some(Ok(token)))), LexTokenType::NumTypes => Err(ParseError(E_TERRIBLE, Some(Ok(token)))), LexTokenType::Dot => Err(ParseError(E_DOT_NO_LIST, Some(Ok(token)))), LexTokenType::Unquote if !self.quasiquoted => Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(token)))), LexTokenType::UnquoteSplice if !self.quasiquoted => Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(token)))), // ignore comment, directive: _ => 
self.get_next_datum(), } // Lexer error } else if self.lexer.has_error_state.is_some() { Err(ParseError(E_LEX_ERROR, Some(Err(self.lexer.has_error_state.clone().unwrap())))) // End of document } else { Err(ParseError(E_END_OF_DOCUMENT, None)) } } } #[cfg(test)] mod tests { use super::*; #[test] fn test_parse_cases() { let happy_cases = vec![ // case, result ("\"test\"", "\"test\""), ("test", "test"), ("(1 2 3)", "(1 2 3)"), ("'test", "(quote test)"), ("`test", "(quasiquote test)"), ("`(,one)", "(quasiquote ((unquote one)))"), ("`(test ,@(two))", "(quasiquote (test (unquote-splicing (two))))"), ("#u8(0 14 249)", "#u8(0 14 249)"), ("(nested lists (are pretty cool))", "(nested lists (are pretty cool))"), ("((nested) lists (are (pretty) cool))", "((nested) lists (are (pretty) cool))"), ("(dotted . notation)", "(dotted . notation)"), ("(longer dotted . notation)", "(longer dotted . notation)"), ("(hello \"world\")", "(hello \"world\")"), ("; big doc string\n(one two)", "(one two)"), ("(list #(vect 2 3 #u8(0 0)))", "(list #(vect 2 3 #u8(0 0)))"), ("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)") ]; let sad_cases = vec![ "(", "( one two ", "( one two three ( four )", ")", "#(st", "#u8(0 ", "#u8(256)", "#u8(two)", "(one two ,three)", "(one two ,@three)", "`(one two ,@4.0)", "(. two)", "(one . two . three)", ]; println!("+ Testing Happy Cases..."); happy_cases.iter() .for_each(|(case, result)| { println!(" - case: {}", *case); let mut p = Parser::from(Lexer::from(Rc::from(*case))); let res = p.next(); if let None = res { println!("{}", p.has_error_state.unwrap()); } assert_eq!( format!("{}", res.unwrap()), format!("{}", result) ); }); println!("+ Testing Sad Cases..."); sad_cases.iter() .for_each(|case| { println!(" - case: {}", *case); let mut p = Parser::from(Lexer::from(Rc::from(*case))); assert!(p.next().is_none() && p.has_error_state.is_some()) }); } }