// Mycelium/mycelium/src/parser.rs

/* Mycelium Scheme
* Copyright (C) 2025 Ava Affine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use core::fmt::Display;
use crate::lexer::{
LexError,
LexToken,
LexTokenType,
Lexer,
E_CHAR_TOO_LONG,
E_END_OF_DOCUMENT
};
use crate::number::{Number, Numeric};
use crate::sexpr::{Datum, Ast};
use alloc::vec::Vec;
use alloc::vec;
use alloc::rc::Rc;
use alloc::string::String;
pub const E_LEX_ERROR: &str = "error in lexing document";
pub const E_EXTRA_CLOSE: &str = "closing parenthesis closes nothing";
pub const E_TERRIBLE: &str = "something has gone terribly wrong....";
pub const E_VECTOR_DOT: &str = "dotted notation not valid in vectors";
pub const E_DOT_NO_LIST: &str = "dotted notation used outside of list";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_CHAR_HEX_PARSE: &str = "hexadecimal character literal failed to parse";
pub const E_COLLECTION_TRUNC: &str = "collection is truncated";
pub const E_BV_BADBYTE: &str = "number provided is not a real byte";
pub const E_BV_NONBYTE: &str = "bytevector elements must all be bytes";
pub const E_TOO_MANY_DOT: &str = "valid dot notation only includes one dot";
pub const E_DOT_IDX: &str = "dot should precede only the last element in a list";
pub const E_DOT_EMPTY: &str = "cannot apply dotted notation to otherwise empty list";
pub const E_UNQUOTE_NONQQ: &str = "unquote must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_NONQQ: &str = "unquote-splicing must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_COLL: &str = "expected list or vector after unquote-splicing";
/* ParseError
* 0: error string
* 1: either problematic lexing token, or a lexing error
*/
#[derive(Clone)]
pub struct ParseError(pub &'static str, pub Option<Result<LexToken, LexError>>);
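/* For example (a hypothetical sketch; the field layout follows the
 * comment above, and `offending_token`/`lex_error` are placeholder names):
 *
 *   ParseError(E_EXTRA_CLOSE, Some(Ok(offending_token)))  // bad token
 *   ParseError(E_LEX_ERROR, Some(Err(lex_error)))         // lexer failure
 *   ParseError(E_END_OF_DOCUMENT, None)                   // no token at all
 */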
impl Display for ParseError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let err_snippet_start = |t: &LexToken| -> usize {
            /* backtrack from the token start until we hit
             * - the beginning of the line
             * - 25 characters back
             * - the doc start
             */
            let mut idx = t.start_idx;
            while idx > 0 && t.start_idx - idx < 25 {
                if t.source_doc.as_bytes()[idx - 1] == b'\n' {
                    // stop just past the newline so the snippet
                    // begins at the start of the token's line
                    break;
                }
                idx -= 1;
            }
            idx
        };
        let err_snippet_end = |t: &LexToken| -> usize {
            /* read through the document until we hit
             * - the end of the line
             * - 25 characters forward
             * - the doc end
             */
            let mut idx = t.end_idx;
            while idx < t.source_doc.len() && idx - t.end_idx < 25 {
                if t.source_doc.as_bytes()[idx] == b'\n' {
                    break;
                }
                idx += 1;
            }
            idx
        };
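        /* e.g. for a token in the middle of a long line, the snippet is
         * the text from up to 25 characters before the token's start to
         * up to 25 characters past its end, clipped to the enclosing
         * line and the document bounds
         */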
        if let Some(frag) = &self.1 {
            match frag {
                Ok(token) => {
                    writeln!(f, "Error parsing syntax: {}", self.0)?;
                    writeln!(f, " problematic token: {}",
                        &token.source_doc[token.start_idx..token.end_idx])?;
                    // return here so the generic header below is not printed twice
                    return writeln!(f, " {}",
                        &token.source_doc[err_snippet_start(token)..err_snippet_end(token)]);
                },
                Err(e) => return e.fmt(f),
            }
        }
        writeln!(f, "Error parsing syntax: {}", self.0)
}
}
pub struct Parser {
lexer: Lexer,
pub has_error_state: Option<ParseError>,
delayed: Vec<Rc<Datum>>,
quasiquoted: bool,
}
/* The From and Iterator traits serve as the primary
* interface to work with the parser. It is expected to
* make a Lexer first, and then use casting or type conv
* to make it into a parser and then a final AST, which
* we can then convert into a VM image once the compile
* step is finished.
*/
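/* A minimal usage sketch (hypothetical driver code; building the Lexer
 * from an Rc<str> mirrors the tests at the bottom of this file):
 *
 *   let mut parser = Parser::from(Lexer::from(Rc::from("(+ 1 2)")));
 *   let data: Vec<Rc<Datum>> = parser.by_ref().collect();
 *   if let Some(err) = parser.has_error_state {
 *       // iteration stopped early; report err instead of compiling
 *   }
 */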
impl From<Lexer> for Parser {
fn from(l: Lexer) -> Parser {
Parser {
lexer: l,
has_error_state: None,
delayed: vec![],
quasiquoted: false
}
}
}
impl Iterator for Parser {
type Item = Rc<Datum>;
fn next(&mut self) -> Option<Self::Item> {
if self.has_error_state.is_some() {
return None;
}
        if !self.delayed.is_empty() {
            return self.delayed.pop()
        }
let res = self.get_next_datum();
if let Err(ref e) = res {
self.has_error_state = Some(e.clone());
}
        res.ok()
}
}
fn read_number(token: LexToken) -> Result<Number, ParseError> {
return match (&token.source_doc[token.start_idx..token.end_idx]).parse::<Number>() {
Ok(num) => Ok(num),
Err(e) => Err(ParseError(e, Some(Ok(token)))),
}
}
fn read_char(token: LexToken) -> Result<u8, ParseError> {
    if token.end_idx - token.start_idx < 3 {
        // need at least "#\" plus one more character
        return Err(ParseError(E_CHAR_TRUNCATED, Some(Ok(token))))
    }
match &token.source_doc[token.start_idx + 2..token.end_idx] {
"alarm" => Ok(7),
"backspace" => Ok(8),
"delete" => Ok(127),
"escape" => Ok(33),
"newline" => Ok('\n' as u8),
"null" => Ok(0),
"return" => Ok(13),
"space" => Ok(32),
"tab" => Ok(11),
        _ if token.source_doc[token.start_idx + 2..token.end_idx].starts_with('x') &&
            token.end_idx - token.start_idx > 3 => {
if token.end_idx - token.start_idx > 5 {
return Err(ParseError(E_CHAR_TOO_LONG, Some(Ok(token))))
}
match u8::from_str_radix(
&token.source_doc[token.start_idx + 3..token.end_idx],
16) {
Ok(u) => Ok(u),
Err(_) => Err(ParseError(E_CHAR_HEX_PARSE, Some(Ok(token))))
}
},
_ => Ok(token.source_doc.as_bytes()[token.start_idx + 2])
}
}
fn read_bool(token: LexToken) -> bool {
match &token.source_doc[token.start_idx..token.end_idx] {
"#t" => true,
"#f" => false,
_ => panic!("impossible boolean")
}
}
fn read_string(token: LexToken) -> Vec<u8> {
if token.end_idx - token.start_idx < 3 {
// empty string other than delimiters
Vec::default()
} else {
token.source_doc[token.start_idx + 1..token.end_idx - 1]
.as_bytes()
.to_vec()
}
}
impl Parser {
/* Rules we must mind:
* 0. at this stage, drop and ignore comments, directives
* 1. quote, quasiquote, unquote, and unquote splicing
* all require another input after them (excluding
* collection end)
     * 2. unquote-splicing requires a list, vector, or symbol
     *    after it (enforced in complete_unquote_splicing below).
* 3. vectors, lists, may have nested collections in them
* so track collection state in the parser's stack.
* 4. list dotted notation needs next datum put in cdr.
* 5. bytevectors can only have numbers from 0-255 in them.
*/
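    /* As a sketch of rule 4, in terms of the Datum/Ast constructors
     * from sexpr (a proper list terminates in Datum::None, while a
     * dotted pair puts the final datum straight into the cdr):
     *
     *   (a . b)   =>  List(Ast(a, b))
     *   (a b)     =>  List(Ast(a, List(Ast(b, None))))
     *   (a b . c) =>  List(Ast(a, List(Ast(b, c))))
     */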
fn complete_quote(&mut self) -> Result<Rc<Datum>, ParseError> {
let next = self.get_next_datum()?;
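        /* e.g. 'foo yields the same structure as (quote foo):
         * List(Ast(Symbol("quote"), List(Ast(Symbol("foo"), None))))
         */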
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("quote"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_unquote_splicing(&mut self, tok: LexToken) -> Result<Rc<Datum>, ParseError> {
let next = self.get_next_datum()?;
match *next {
Datum::List(_) | Datum::Vector(_) | Datum::Symbol(_) => (),
_ => return Err(ParseError(E_UNQUOTE_SPL_COLL, Some(Ok(tok))))
}
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("unquote-splicing"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_unquote(&mut self) -> Result<Rc<Datum>, ParseError> {
let next = self.get_next_datum()?;
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("unquote"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_quasiquote(&mut self) -> Result<Rc<Datum>, ParseError> {
let prev = self.quasiquoted; // handle nesting appropriately
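        /* e.g. in `(a `(b ,c)) the inner quasiquote re-enters the
         * quasiquoted state, and restoring `prev` afterwards keeps the
         * outer context intact for the elements that follow
         */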
self.quasiquoted = true;
let next = self.get_next_datum()?;
self.quasiquoted = prev;
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("quasiquote"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_collection(&mut self, token: LexToken) -> Result<Rc<Datum>, ParseError> {
        let is_bv = matches!(token.token_type, LexTokenType::ByteVectorStart);
let mut lex_stack = vec![];
let mut bv_stack = vec![];
/* counting indexes helps greatly with calculating position dependent
* syntax rules like dot notation in lists
*/
let mut iter_count = 0;
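        /* dot_idx fields: (iteration index of the dot itself,
         * the dot's lexing token kept for error reporting,
         * the datum captured right after the dot, destined for the cdr)
         */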
let mut dot_idx = (None, None, None);
loop {
            let Some(tok) = self.lexer.next() else {
                return Err(ParseError(E_COLLECTION_TRUNC, None))
            };
match tok.token_type {
// Universal cases
LexTokenType::Comment | LexTokenType::Directive => continue,
LexTokenType::NumTypes =>
return Err(ParseError(E_TERRIBLE, Some(Ok(tok)))),
                LexTokenType::Unquote if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(tok)))),
                LexTokenType::UnquoteSplice if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(tok)))),
// CollectionEnd must take precedence over the dot notation case
LexTokenType::CollectionEnd => break,
                /* any token after the single post-dot datum (other than
                 * the CollectionEnd handled above) is a syntax error
                 */
                _ if let Some(idx) = dot_idx.0 && iter_count - idx > 1 =>
                    return Err(ParseError(E_DOT_IDX, Some(Ok(dot_idx.1.unwrap())))),
LexTokenType::Dot if token.token_type != LexTokenType::ListStart =>
return Err(ParseError(E_VECTOR_DOT, Some(Ok(tok)))),
// List, Vector cases
LexTokenType::ListStart | LexTokenType::VectorStart |
LexTokenType::ByteVectorStart if !is_bv =>
lex_stack.push(self.complete_collection(tok)?),
LexTokenType::String if !is_bv =>
lex_stack.push(Rc::from(Datum::String(read_string(tok)))),
LexTokenType::Number if !is_bv =>
lex_stack.push(Rc::from(Datum::Number(read_number(tok)?))),
LexTokenType::Char if !is_bv =>
lex_stack.push(Rc::from(Datum::Char(read_char(tok)?))),
LexTokenType::Boolean if !is_bv =>
lex_stack.push(Rc::from(Datum::Bool(read_bool(tok)))),
LexTokenType::Symbol if !is_bv =>
lex_stack.push(Rc::from(Datum::Symbol(
String::from(&tok.source_doc[tok.start_idx..tok.end_idx])))),
LexTokenType::Quote if !is_bv =>
lex_stack.push(self.complete_quote()?),
LexTokenType::QuasiQuote if !is_bv =>
lex_stack.push(self.complete_quasiquote()?),
LexTokenType::Unquote if !is_bv && self.quasiquoted =>
lex_stack.push(self.complete_unquote()?),
LexTokenType::UnquoteSplice if !is_bv && self.quasiquoted =>
lex_stack.push(self.complete_unquote_splicing(tok)?),
// List only cases
                LexTokenType::Dot => if dot_idx.0.is_some() {
return Err(ParseError(E_TOO_MANY_DOT, Some(Ok(tok))))
} else {
dot_idx = (Some(iter_count), Some(tok), None)
},
// ByteVector cases
LexTokenType::Number if is_bv => {
let n = read_number(tok.clone())?
.make_inexact();
if n.0 < 0.0 || n.0 > 255.0 || n.0.fract() != 0.0 {
return Err(ParseError(E_BV_BADBYTE, Some(Ok(tok))))
}
bv_stack.push(n.0 as u8);
},
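                // e.g. #u8(0 14 249) passes, while #u8(256) and a
                // fractional value like 2.5 fail the check above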
_ if is_bv => return Err(ParseError(E_BV_NONBYTE, Some(Ok(tok)))),
// This should never get touched
                _ => unreachable!("theoretically impossible case in parser::complete_collection"),
}
            if let Some(idx) = dot_idx.0 && iter_count == idx + 1 {
                // capture the datum following the dot; it becomes the cdr
                dot_idx.2 = lex_stack.pop();
            }
iter_count += 1;
}
if is_bv {
return Ok(Rc::from(Datum::ByteVector(bv_stack)))
}
if token.token_type == LexTokenType::VectorStart {
return Ok(Rc::from(Datum::Vector(lex_stack)))
}
// handle an empty list
        if lex_stack.is_empty() {
// dont try to do something like "( . 'thing)"
if let (_, Some(node), _) = dot_idx {
return Err(ParseError(E_DOT_EMPTY, Some(Ok(node))))
}
return Ok(Rc::from(Datum::List(Rc::from(Ast(Rc::from(Datum::None),
Rc::from(Datum::None))))))
}
let mut from_rear: Rc<Ast>;
        if let (_, _, Some(node)) = dot_idx {
            from_rear = Rc::from(Ast(lex_stack.pop().unwrap(), node));
} else {
from_rear = Rc::from(Ast(lex_stack.pop().unwrap(), Rc::from(Datum::None)));
}
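        /* cons the remaining elements onto the tail from right to left,
         * e.g. stack [a b] with tail (c . d) folds up to (a b c . d)
         */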
lex_stack.iter()
.rev()
.for_each(|x| {
from_rear = Rc::from(Ast(x.clone(), Rc::from(Datum::List(from_rear.clone()))));
});
Ok(Rc::from(Datum::List(from_rear)))
}
fn get_next_datum(&mut self) -> Result<Rc<Datum>, ParseError> {
if let Some(token) = self.lexer.next() {
match token.token_type {
// normal paths:
LexTokenType::String => Ok(Rc::from(Datum::String(read_string(token)))),
LexTokenType::Number => Ok(Rc::from(Datum::Number(read_number(token)?))),
LexTokenType::Char => Ok(Rc::from(Datum::Char(read_char(token)?))),
LexTokenType::Symbol => Ok(Rc::from(Datum::Symbol(String::from(
&token.source_doc[token.start_idx..token.end_idx])))),
LexTokenType::Boolean => Ok(Rc::from(Datum::Bool(read_bool(token)))),
LexTokenType::VectorStart | LexTokenType::ListStart |
LexTokenType::ByteVectorStart => self.complete_collection(token),
LexTokenType::Quote => self.complete_quote(),
LexTokenType::QuasiQuote => self.complete_quasiquote(),
LexTokenType::Unquote if self.quasiquoted => self.complete_unquote(),
LexTokenType::UnquoteSplice if self.quasiquoted =>
self.complete_unquote_splicing(token),
// immediate errors:
LexTokenType::CollectionEnd => Err(ParseError(E_EXTRA_CLOSE, Some(Ok(token)))),
LexTokenType::NumTypes => Err(ParseError(E_TERRIBLE, Some(Ok(token)))),
LexTokenType::Dot => Err(ParseError(E_DOT_NO_LIST, Some(Ok(token)))),
LexTokenType::Unquote if !self.quasiquoted =>
Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(token)))),
LexTokenType::UnquoteSplice if !self.quasiquoted =>
Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(token)))),
// ignore comment, directive:
_ => self.get_next_datum(),
}
// Lexer error
} else if self.lexer.has_error_state.is_some() {
Err(ParseError(E_LEX_ERROR,
Some(Err(self.lexer.has_error_state.clone().unwrap()))))
// End of document
} else {
Err(ParseError(E_END_OF_DOCUMENT, None))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_cases() {
let happy_cases = vec![
// case, result
("\"test\"", "\"test\""),
("test", "test"),
("(1 2 3)", "(1 2 3)"),
("'test", "(quote test)"),
("`test", "(quasiquote test)"),
("`(,one)", "(quasiquote ((unquote one)))"),
("`(test ,@(two))", "(quasiquote (test (unquote-splicing (two))))"),
("#u8(0 14 249)", "#u8(0 14 249)"),
("(nested lists (are pretty cool))", "(nested lists (are pretty cool))"),
("((nested) lists (are (pretty) cool))", "((nested) lists (are (pretty) cool))"),
("(dotted . notation)", "(dotted . notation)"),
("(longer dotted . notation)", "(longer dotted . notation)"),
("(hello \"world\")", "(hello \"world\")"),
("; big doc string\n(one two)", "(one two)"),
("(list #(vect 2 3 #u8(0 0)))", "(list #(vect 2 3 #u8(0 0)))"),
("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)")
];
let sad_cases = vec![
"(",
"( one two ",
"( one two three ( four )",
")",
"#(st",
"#u8(0 ",
"#u8(256)",
"#u8(two)",
"(one two ,three)",
"(one two ,@three)",
"`(one two ,@4.0)",
"(. two)",
"(one . two . three)",
];
println!("+ Testing Happy Cases...");
happy_cases.iter()
.for_each(|(case, result)| {
println!(" - case: {}", *case);
let mut p = Parser::from(Lexer::from(Rc::from(*case)));
let res = p.next();
                if res.is_none() {
                    println!("{}", p.has_error_state.unwrap());
                }
assert_eq!(
format!("{}", res.unwrap()),
format!("{}", result)
);
});
println!("+ Testing Sad Cases...");
sad_cases.iter()
.for_each(|case| {
println!(" - case: {}", *case);
let mut p = Parser::from(Lexer::from(Rc::from(*case)));
assert!(p.next().is_none() && p.has_error_state.is_some())
});
}
}