Mycelium/mycelium/src/parser.rs
/* Mycelium Scheme
* Copyright (C) 2025 Ava Affine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use core::fmt::Display;
use core::cell::RefCell;
use crate::lexer::{
LexError,
LexToken,
LexTokenType,
Lexer,
E_CHAR_TOO_LONG,
E_END_OF_DOCUMENT
};
use organelle::{Number, Numeric};
use crate::sexpr::{Datum, Ast};
use alloc::vec::Vec;
use alloc::vec;
use alloc::rc::Rc;
use alloc::string::String;
pub const E_LEX_ERROR: &str = "error in lexing document";
pub const E_EXTRA_CLOSE: &str = "closing parenthesis closes nothing";
pub const E_TERRIBLE: &str = "something has gone terribly wrong....";
pub const E_VECTOR_DOT: &str = "dotted notation not valid in vectors";
pub const E_DOT_NO_LIST: &str = "dotted notation used outside of list";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_CHAR_HEX_PARSE: &str = "hexadecimal character literal failed to parse";
pub const E_COLLECTION_TRUNC: &str = "collection is truncated";
pub const E_BV_BADBYTE: &str = "number provided is not a real byte";
pub const E_BV_NONBYTE: &str = "bytevector elements must all be bytes";
pub const E_TOO_MANY_DOT: &str = "valid dot notation only includes one dot";
pub const E_DOT_IDX: &str = "dot should precede only the last element in a list";
pub const E_DOT_EMPTY: &str = "cannot apply dotted notation to otherwise empty list";
pub const E_UNQUOTE_NONQQ: &str = "unquote must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_NONQQ: &str = "unquote-splicing must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_COLL: &str = "expected list or vector after unquote-splicing";
/* ParseError
* 0: error string
* 1: either problematic lexing token, or a lexing error
*/
#[derive(Clone)]
pub struct ParseError(pub &'static str, pub Option<Result<LexToken, LexError>>);
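/* For example (illustrative, drawn from constructions used later in
 * this file): a truncated collection is reported as
 *   ParseError(E_COLLECTION_TRUNC, Some(Ok(offending_token)))
 * while a failure inside the lexer is wrapped as
 *   ParseError(E_LEX_ERROR, Some(Err(lex_error)))
 */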
impl Display for ParseError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let err_snippet_start = |t: &LexToken| -> usize {
            /* backtrack from the token's start until we hit one of:
             * - the beginning of the line
             * - 25 characters back
             * - the start of the document
             */
            if t.start_idx < 25 {
                0
            } else {
                let mut idx = t.start_idx;
                while t.start_idx - idx < 25 {
                    idx -= 1;
                    // stop just past a newline: char_indices on the
                    // subslice yields the char at `idx` first
                    if t.source_doc[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        idx += 1;
                        break;
                    }
                }
                idx
            }
        };
        let err_snippet_end = |t: &LexToken| -> usize {
            /* read forward through the document until we hit one of:
             * - the end of the line
             * - 25 characters forward
             * - the end of the document
             */
            if t.source_doc.len() - t.end_idx < 25 {
                t.source_doc.len()
            } else {
                let mut idx = t.end_idx;
                while idx - t.end_idx < 25 {
                    idx += 1;
                    if t.source_doc[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        break;
                    }
                }
                idx
            }
        };
        if let Some(frag) = &self.1 {
            match frag {
                Ok(token) => {
                    write!(f, "Error parsing syntax: {}\n", self.0)?;
                    write!(f, "  problematic token: {}\n",
                        &token.source_doc[token.start_idx..token.end_idx])?;
                    write!(f, "  {}\n",
                        &token.source_doc[err_snippet_start(token)..err_snippet_end(token)])?;
                    // the report is complete; don't fall through and
                    // print the header a second time
                    return Ok(());
                },
                Err(e) => {
                    return e.fmt(f);
                }
            }
        }
write!(f, "Error parsing syntax: {}\n", self.0)
}
}
pub struct Parser {
lexer: Lexer,
pub has_error_state: Option<ParseError>,
delayed: Vec<Rc<Datum>>,
quasiquoted: bool,
}
/* The From and Iterator traits serve as the primary
 * interface to the parser. The expected flow is to make
 * a Lexer first, convert it into a Parser with From/Into,
 * and then iterate to produce the final AST, which we can
 * convert into a VM image once the compile step is
 * finished.
 */
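/* A minimal usage sketch (mirroring the test module below, where a
 * Lexer is built from an Rc<str>):
 *
 *     let mut parser = Parser::from(Lexer::from(Rc::from("(+ 1 2)")));
 *     while let Some(datum) = parser.next() {
 *         // each `datum` is an Rc<Datum> holding one top-level form
 *     }
 *     if let Some(e) = parser.has_error_state {
 *         // iteration stopped early on a lex or parse error
 *     }
 */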
impl From<Lexer> for Parser {
fn from(l: Lexer) -> Parser {
Parser {
lexer: l,
has_error_state: None,
delayed: vec![],
quasiquoted: false
}
}
}
impl Iterator for Parser {
type Item = Rc<Datum>;
fn next(&mut self) -> Option<Self::Item> {
if self.has_error_state.is_some() {
return None;
}
        if !self.delayed.is_empty() {
return self.delayed.pop()
}
let res = self.get_next_datum();
if let Err(ref e) = res {
self.has_error_state = Some(e.clone());
}
return res.ok()
}
}
fn read_number(token: LexToken) -> Result<Number, ParseError> {
    match token.source_doc[token.start_idx..token.end_idx].parse::<Number>() {
Ok(num) => Ok(num),
Err(e) => Err(ParseError(e, Some(Ok(token)))),
}
}
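/* read_char parses the text of a character token: a single character
 * ("#\a"), a named literal ("#\space" -> 32), or a hex escape
 * ("#\x41" -> 0x41). The examples are illustrative; the named
 * literals are exactly those matched in the table below.
 */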
fn read_char(token: LexToken) -> Result<u8, ParseError> {
if token.end_idx - token.start_idx < 3 {
return Err(ParseError(E_CHAR_TRUNCATED, Some(Ok(token))))
}
match &token.source_doc[token.start_idx + 2..token.end_idx] {
"alarm" => Ok(7),
"backspace" => Ok(8),
"delete" => Ok(127),
"escape" => Ok(33),
"newline" => Ok('\n' as u8),
"null" => Ok(0),
"return" => Ok(13),
"space" => Ok(32),
"tab" => Ok(11),
_ if token.source_doc[token.start_idx + 2..].starts_with('x') &&
token.end_idx - token.start_idx > 3 => {
if token.end_idx - token.start_idx > 5 {
return Err(ParseError(E_CHAR_TOO_LONG, Some(Ok(token))))
}
match u8::from_str_radix(
&token.source_doc[token.start_idx + 3..token.end_idx],
16) {
Ok(u) => Ok(u),
Err(_) => Err(ParseError(E_CHAR_HEX_PARSE, Some(Ok(token))))
}
},
_ => Ok(token.source_doc.as_bytes()[token.start_idx + 2])
}
}
fn read_bool(token: LexToken) -> bool {
match &token.source_doc[token.start_idx..token.end_idx] {
"#t" => true,
"#f" => false,
_ => panic!("impossible boolean")
}
}
fn read_string(token: LexToken) -> Vec<u8> {
if token.end_idx - token.start_idx < 3 {
// empty string other than delimiters
Vec::default()
} else {
token.source_doc[token.start_idx + 1..token.end_idx - 1]
.as_bytes()
.to_vec()
}
}
impl Parser {
/* Rules we must mind:
* 0. at this stage, drop and ignore comments, directives
* 1. quote, quasiquote, unquote, and unquote splicing
* all require another input after them (excluding
* collection end)
     * 2. unquote-splicing requires a spliceable form: a list,
     *    vector, or symbol (enforced in complete_unquote_splicing
     *    below).
* 3. vectors, lists, may have nested collections in them
* so track collection state in the parser's stack.
* 4. list dotted notation needs next datum put in cdr.
* 5. bytevectors can only have numbers from 0-255 in them.
*/
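    /* A sketch of the internal shape rule 4 produces (illustrative,
     * not verbatim Display output):
     *   (a b . c)  =>  List(Ast(a, List(Ast(b, c))))
     *   (a b)      =>  List(Ast(a, List(Ast(b, None))))
     */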
fn complete_quote(&mut self) -> Result<Rc<Datum>, ParseError> {
let next = self.get_next_datum()?;
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("quote"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_unquote_splicing(&mut self, tok: LexToken) -> Result<Rc<Datum>, ParseError> {
let next = self.get_next_datum()?;
match *next {
Datum::List(_) | Datum::Vector(_) | Datum::Symbol(_) => (),
_ => return Err(ParseError(E_UNQUOTE_SPL_COLL, Some(Ok(tok))))
}
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("unquote-splicing"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_unquote(&mut self) -> Result<Rc<Datum>, ParseError> {
let next = self.get_next_datum()?;
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("unquote"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_quasiquote(&mut self) -> Result<Rc<Datum>, ParseError> {
let prev = self.quasiquoted; // handle nesting appropriately
self.quasiquoted = true;
let next = self.get_next_datum()?;
self.quasiquoted = prev;
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("quasiquote"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_collection(&mut self, token: LexToken) -> Result<Rc<Datum>, ParseError> {
        let is_bv = matches!(token.token_type, LexTokenType::ByteVectorStart);
let mut lex_stack = vec![];
let mut bv_stack = vec![];
/* counting indexes helps greatly with calculating position dependent
* syntax rules like dot notation in lists
*/
let mut iter_count = 0;
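        /* dot_idx tracks dotted notation:
         *   .0 = iteration index at which the dot appeared
         *   .1 = the dot's token, kept for error reporting
         *   .2 = the datum captured immediately after the dot (the cdr)
         */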
let mut dot_idx = (None, None, None);
loop {
            let Some(tok) = self.lexer.next() else {
                if let Some(e) = &self.lexer.has_error_state {
                    return Err(ParseError(E_LEX_ERROR, Some(Err(e.clone()))));
                }
                return Err(ParseError(E_COLLECTION_TRUNC, Some(Ok(token))));
            };
match tok.token_type {
// Universal cases
LexTokenType::Comment | LexTokenType::Directive => continue,
LexTokenType::NumTypes =>
return Err(ParseError(E_TERRIBLE, Some(Ok(tok)))),
                LexTokenType::Unquote if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(tok)))),
                LexTokenType::UnquoteSplice if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(tok)))),
// CollectionEnd must take precedence over the dot notation case
LexTokenType::CollectionEnd => break,
                // only one datum may follow the dot before the collection ends
                _ if let Some(idx) = dot_idx.0 && iter_count - idx > 1 =>
                    return Err(ParseError(E_DOT_IDX, Some(Ok(dot_idx.1.unwrap())))),
LexTokenType::Dot if token.token_type != LexTokenType::ListStart =>
return Err(ParseError(E_VECTOR_DOT, Some(Ok(tok)))),
// List, Vector cases
LexTokenType::ListStart | LexTokenType::VectorStart |
LexTokenType::ByteVectorStart if !is_bv =>
lex_stack.push(self.complete_collection(tok)?),
LexTokenType::String if !is_bv =>
lex_stack.push(Rc::from(Datum::String(read_string(tok)))),
LexTokenType::Number if !is_bv =>
lex_stack.push(Rc::from(Datum::Number(read_number(tok)?))),
LexTokenType::Char if !is_bv =>
lex_stack.push(Rc::from(Datum::Char(read_char(tok)?))),
LexTokenType::Boolean if !is_bv =>
lex_stack.push(Rc::from(Datum::Bool(read_bool(tok)))),
LexTokenType::Symbol if !is_bv =>
lex_stack.push(Rc::from(Datum::Symbol(
String::from(&tok.source_doc[tok.start_idx..tok.end_idx])))),
LexTokenType::Quote if !is_bv =>
lex_stack.push(self.complete_quote()?),
LexTokenType::QuasiQuote if !is_bv =>
lex_stack.push(self.complete_quasiquote()?),
LexTokenType::Unquote if !is_bv && self.quasiquoted =>
lex_stack.push(self.complete_unquote()?),
LexTokenType::UnquoteSplice if !is_bv && self.quasiquoted =>
lex_stack.push(self.complete_unquote_splicing(tok)?),
// List only cases
                LexTokenType::Dot => if dot_idx.0.is_some() {
                    return Err(ParseError(E_TOO_MANY_DOT, Some(Ok(tok))))
                } else {
                    dot_idx = (Some(iter_count), Some(tok), None)
                },
// ByteVector cases
LexTokenType::Number if is_bv => {
let n = read_number(tok.clone())?
.make_inexact();
if n.0 < 0.0 || n.0 > 255.0 || n.0.fract() != 0.0 {
return Err(ParseError(E_BV_BADBYTE, Some(Ok(tok))))
}
bv_stack.push(n.0 as u8);
},
_ if is_bv => return Err(ParseError(E_BV_NONBYTE, Some(Ok(tok)))),
                // every token type is handled above; this arm is unreachable
                _ => unreachable!("impossible case in parser::complete_collection"),
}
if let Some(idx) = dot_idx.0 && iter_count == idx + 1 {
dot_idx.2 = Some(lex_stack.pop());
}
iter_count += 1;
}
if is_bv {
return Ok(Rc::from(Datum::ByteVector(RefCell::from(bv_stack))))
}
if token.token_type == LexTokenType::VectorStart {
return Ok(Rc::from(Datum::Vector(RefCell::from(lex_stack))))
}
// handle an empty list
        if lex_stack.is_empty() {
// dont try to do something like "( . 'thing)"
if let (_, Some(node), _) = dot_idx {
return Err(ParseError(E_DOT_EMPTY, Some(Ok(node))))
}
return Ok(Rc::from(Datum::List(Rc::from(Ast(Rc::from(Datum::None),
Rc::from(Datum::None))))))
}
        let mut from_rear = if let (_, _, Some(node)) = dot_idx {
            Rc::from(Ast(lex_stack.pop().unwrap(), node.unwrap()))
        } else {
            Rc::from(Ast(lex_stack.pop().unwrap(), Rc::from(Datum::None)))
        };
lex_stack.iter()
.rev()
.for_each(|x| {
from_rear = Rc::from(Ast(x.clone(), Rc::from(Datum::List(from_rear.clone()))));
});
Ok(Rc::from(Datum::List(from_rear)))
}
fn get_next_datum(&mut self) -> Result<Rc<Datum>, ParseError> {
if let Some(token) = self.lexer.next() {
match token.token_type {
// normal paths:
LexTokenType::String => Ok(Rc::from(Datum::String(read_string(token)))),
LexTokenType::Number => Ok(Rc::from(Datum::Number(read_number(token)?))),
LexTokenType::Char => Ok(Rc::from(Datum::Char(read_char(token)?))),
LexTokenType::Symbol => Ok(Rc::from(Datum::Symbol(String::from(
&token.source_doc[token.start_idx..token.end_idx])))),
LexTokenType::Boolean => Ok(Rc::from(Datum::Bool(read_bool(token)))),
LexTokenType::VectorStart | LexTokenType::ListStart |
LexTokenType::ByteVectorStart => self.complete_collection(token),
LexTokenType::Quote => self.complete_quote(),
LexTokenType::QuasiQuote => self.complete_quasiquote(),
LexTokenType::Unquote if self.quasiquoted => self.complete_unquote(),
LexTokenType::UnquoteSplice if self.quasiquoted =>
self.complete_unquote_splicing(token),
// immediate errors:
LexTokenType::CollectionEnd => Err(ParseError(E_EXTRA_CLOSE, Some(Ok(token)))),
LexTokenType::NumTypes => Err(ParseError(E_TERRIBLE, Some(Ok(token)))),
LexTokenType::Dot => Err(ParseError(E_DOT_NO_LIST, Some(Ok(token)))),
LexTokenType::Unquote if !self.quasiquoted =>
Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(token)))),
LexTokenType::UnquoteSplice if !self.quasiquoted =>
Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(token)))),
// ignore comment, directive:
_ => self.get_next_datum(),
}
// Lexer error
        } else if let Some(e) = &self.lexer.has_error_state && e.0 != E_END_OF_DOCUMENT {
            Err(ParseError(E_LEX_ERROR, Some(Err(e.clone()))))
// End of document
} else {
Err(ParseError(E_END_OF_DOCUMENT, None))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_cases() {
let happy_cases = vec![
// case, result
("\"test\"", "\"test\""),
("test", "test"),
("(1 2 3)", "(1 2 3)"),
("'test", "(quote test)"),
("`test", "(quasiquote test)"),
("`(,one)", "(quasiquote ((unquote one)))"),
("`(test ,@(two))", "(quasiquote (test (unquote-splicing (two))))"),
("#u8(0 14 249)", "#u8(0 14 249)"),
("(nested lists (are pretty cool))", "(nested lists (are pretty cool))"),
("((nested) lists (are (pretty) cool))", "((nested) lists (are (pretty) cool))"),
("(dotted . notation)", "(dotted . notation)"),
("(longer dotted . notation)", "(longer dotted . notation)"),
("(hello \"world\")", "(hello \"world\")"),
("; big doc string\n(one two)", "(one two)"),
("(list #(vect 2 3 #u8(0 0)))", "(list #(vect 2 3 #u8(0 0)))"),
("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)"),
("(- q 1)", "(- q 1)"),
("(+ q 1)", "(+ q 1)"),
("(#\\x)", "(#\\x)"),
];
let sad_cases = vec![
"(",
"( one two ",
"( one two three ( four )",
")",
"#(st",
"#u8(0 ",
"#u8(256)",
"#u8(two)",
"(one two ,three)",
"(one two ,@three)",
"`(one two ,@4.0)",
"(. two)",
"(one . two . three)",
];
println!("+ Testing Happy Cases...");
happy_cases.iter()
.for_each(|(case, result)| {
println!(" - case: {}", *case);
let mut p = Parser::from(Lexer::from(Rc::from(*case)));
let res = p.next();
                if res.is_none() {
println!("{}", p.has_error_state.unwrap());
}
assert_eq!(
format!("{}", res.unwrap()),
format!("{}", result)
);
});
println!("+ Testing Sad Cases...");
sad_cases.iter()
.for_each(|case| {
println!(" - case: {}", *case);
let mut p = Parser::from(Lexer::from(Rc::from(*case)));
assert!(p.next().is_none() && p.has_error_state.is_some())
});
}
}