/* Mycelium Scheme
* Copyright (C) 2025 Ava Affine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use core::fmt;
use alloc::rc::Rc;
pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/',
':', '<', '=', '>', '?', '@', '^', '_', '~', '.'];
pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r'];
pub const NUMERICAL_EXTRA: [char; 4] = ['.', 'i', 'e', '/'];
pub const NUMERICAL_BASE: [char; 4] = ['d', 'o', 'b', 'x'];
pub const TOK_DELIMITERS: [char; 5] = [')', ' ', '\t', '\n', '\r'];
pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote";
pub const E_TOO_MANY_DECIMALS: &str = "number can only have one dot";
pub const E_TOO_MANY_SLASH: &str = "number can only have one slash";
pub const E_TOO_MANY_E: &str = "number can only have one e";
pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren";
pub const E_UNCLOSED_COMMENT: &str = "block comment has no end";
pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe";
pub const E_NO_END_TO_HASH: &str = "expected more input after hash";
pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_STRING_TRUNCATED: &str = "string literal is truncated";
pub const E_UNDELIMITED_ESC: &str = "char escape is not delimited";
pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis";
pub const E_CHAR_TOO_LONG: &str = "character literal is too long";
pub const E_NUMBER_BASE_ERR: &str = "digit in number exceeds specified base";
pub const E_UNSUPPORTED_ESC: &str = "unsupported escape";
pub const E_BAD_DOT: &str = "expected space after dot in dotted notation";
pub const E_BAD_HEX: &str = "character is not valid hexadecimal notation";
pub const E_INCOMPREHENSIBLE: &str = "token does not lex";
pub const E_END_OF_DOCUMENT: &str = "no additional input left in document";
/* LexError
* 0: error string
* 1: index into document
* 2: document in question
*/
#[derive(Clone)]
pub struct LexError(pub &'static str, pub usize, pub Rc<str>);
impl fmt::Display for LexError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let err_snippet_start = || -> usize {
/* backtrack from the current index until we hit
 * - the beginning of the line
 * - 25 bytes back
 * - the doc start
 */
if self.2.len() < 25 {
0
} else {
let mut idx = self.1;
while self.1 - idx < 25 {
if idx == 0 {
break;
}
idx -= 1;
// '\n' is a single byte, so a byte scan is safe here
if self.2.as_bytes()[idx] == b'\n' {
idx += 1;
break;
}
}
idx
}
};
let err_snippet_end = || -> usize {
/* read through the document until we hit
 * - the end of the line
 * - 25 bytes forward
 * - the doc end
 */
if self.2.len() - self.1 < 25 {
self.2.len()
} else {
let mut idx = self.1;
while idx - self.1 < 25 {
idx += 1;
if idx >= self.2.len() || self.2.as_bytes()[idx] == b'\n' {
break;
}
}
idx
}
};
write!(f, "Error when lexing document here: (idx: {})\n", self.1)?;
let s = err_snippet_start();
let st = self.1 - err_snippet_start();
write!(f, " {}\n", &self.2[s..err_snippet_end()])?;
write!(f, " {}^\n", " ".repeat(st))?;
write!(f, "Error: {}\n", self.0)
}
}
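/* For example, lexing "1.1.1" fails at the second dot and the
 * rendered error looks roughly like:
 *
 *   Error when lexing document here: (idx: 0)
 *       1.1.1
 *       ^
 *   Error: number can only have one dot
 *
 * (the reported index is the start of the offending token)
 */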
#[repr(u8)]
#[derive(Debug, PartialEq, Clone)]
pub enum LexTokenType {
String = 0,
Number,
Char,
Symbol,
VectorStart,
ByteVectorStart,
ListStart,
CollectionEnd,
Boolean,
Dot,
Comment,
Directive,
Quote,
QuasiQuote,
Unquote,
UnquoteSplice,
NumTypes,
}
impl TryFrom<u8> for LexTokenType {
type Error = &'static str;
fn try_from(u: u8) -> Result<Self, Self::Error> {
if u >= LexTokenType::NumTypes as u8 {
Err("out of token type range")
} else {
unsafe { Ok(core::mem::transmute(u)) }
}
}
}
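/* Sketch of the intended round-trip:
 *
 *   assert_eq!(LexTokenType::try_from(3u8).ok(), Some(LexTokenType::Symbol));
 *   assert!(LexTokenType::try_from(LexTokenType::NumTypes as u8).is_err());
 */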
#[derive(Clone)]
pub struct LexToken {
pub token_type: LexTokenType,
pub start_idx: usize,
pub end_idx: usize,
pub source_doc: Rc<str>,
}
pub struct Lexer {
document: Rc<str>,
current_index: usize,
current_token_start: usize,
pub has_error_state: Option<LexError>,
}
impl From<Rc<str>> for Lexer {
fn from(s: Rc<str>) -> Lexer {
Lexer {
document: s,
current_index: 0,
current_token_start: 0,
has_error_state: None,
}
}
}
impl Iterator for Lexer {
type Item = LexToken;
fn next(&mut self) -> Option<Self::Item> {
if self.has_error_state.is_some() {
return None;
}
let res = self.seek_next_token();
if let Err(ref e) = res {
self.has_error_state = Some(e.clone());
}
res.ok()
}
}
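/* Usage sketch: the iterator yields tokens until the document is
 * exhausted or a lex error is recorded:
 *
 *   let mut lx = Lexer::from(Rc::from("(car lst)"));
 *   for tok in &mut lx {
 *       let text = &tok.source_doc[tok.start_idx..tok.end_idx];
 *       // hand `text` to the parser
 *   }
 *   if let Some(e) = &lx.has_error_state {
 *       // E_END_OF_DOCUMENT here just means normal exhaustion
 *   }
 */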
impl Lexer {
// I just didn't want to write and rewrite this...
#[inline(always)]
fn current_char(&mut self) -> char {
// decode the full character; a bare byte-to-char cast would
// mangle anything outside ASCII
self.document[self.current_index..]
.chars()
.next()
.unwrap()
}
#[inline(always)]
fn peek_next_char(&mut self) -> Option<char> {
// skip the current character, whatever its UTF-8 width
let mut it = self.document[self.current_index..].chars();
it.next();
it.next()
}
#[inline(always)]
fn advance_char(&mut self) -> Option<()> {
// step over the full UTF-8 width of the current character;
// a flat += 1 could strand the index inside a code point
let step = self.document[self.current_index..]
.chars()
.next()
.map_or(1, |c| c.len_utf8());
self.current_index += step;
if self.current_index >= self.document.len() {
self.current_index = self.document.len();
return None
}
Some(())
}
#[inline(always)]
fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
let saved = self.current_index;
for expected in chunk.chars() {
if self.advance_char().is_none() {
self.current_index = saved;
return None
}
if expected != self.current_char() {
self.current_index = saved;
return Some(false)
}
}
Some(true)
}
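/* e.g. with the cursor on the 'i' of "inf.0", match_chunk_next("nf.0")
 * returns Some(true) and leaves the cursor on the final '0'; on a
 * mismatch the cursor is restored and Some(false) comes back, and
 * None means the document ended mid-chunk (cursor restored too).
 */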
#[inline(always)]
fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> {
let next_idx = self.advance_char()
.map(|_| self.current_index)
.unwrap_or(self.document.len());
let l = LexToken{
token_type: t,
start_idx: self.current_token_start,
end_idx: next_idx,
source_doc: self.document.clone(),
};
self.current_token_start = 0;
Ok(l)
}
#[inline(always)]
fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> {
loop {
if self.advance_char().is_none() {
return Err(LexError(E_NO_MATCHING_QUOTE,
self.current_token_start, self.document.clone()))
} else if self.current_char() == '\\' {
self.seek_end_of_escape(true)?;
} else if self.current_char() == '"' {
return self.cut_new_token(LexTokenType::String)
}
}
}
#[inline(always)]
fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> {
let mut base = 10;
let a = self.current_char();
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
// a base or exactness prefix must be followed by more of the
// number, not a delimiter or the end of the document
if self.advance_char().is_none()
|| TOK_DELIMITERS.contains(&self.current_char()) {
return Err(LexError(E_NUMBER_TRUNCATED,
self.current_token_start, self.document.clone()))
}
match a {
'x' => base = 16,
'd' => base = 10,
'o' => base = 8,
'b' => base = 2,
// ignore i or e, number parsers will handle that
_ => (),
}
}
if let Some(true) = self.match_chunk_next("inf.0") {
return self.cut_new_token(LexTokenType::Number)
}
if let Some(true) = self.match_chunk_next("nan.0") {
return self.cut_new_token(LexTokenType::Number)
}
let mut hasdot = false;
let mut hasslash = false;
let mut hase = false;
loop {
let a = self.current_char();
if a == '.' {
if hasdot || base < 10 {
return Err(LexError(E_TOO_MANY_DECIMALS,
self.current_token_start, self.document.clone()))
}
hasdot = true;
} else if a == '/' {
if hasslash || base < 10 {
return Err(LexError(E_TOO_MANY_SLASH,
self.current_token_start, self.document.clone()))
}
hasslash = true;
} else if a == 'e' && base == 10 {
// 'e' marks an exponent only in decimal; in hex it is a
// digit and falls through to the base check below
if hase {
return Err(LexError(E_TOO_MANY_E,
self.current_token_start, self.document.clone()))
}
hase = true
} else if TOK_DELIMITERS.contains(&a) {
// back up one
self.current_index -= 1;
return self.cut_new_token(LexTokenType::Number)
} else if a.to_digit(base).is_none() {
return Err(LexError(E_NUMBER_BASE_ERR,
self.current_token_start, self.document.clone()))
}
if self.advance_char().is_none() {
self.current_index = self.document.len() - 1;
return self.cut_new_token(LexTokenType::Number)
}
}
}
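/* Shapes accepted by the scan above (a sketch, not full R7RS
 * numeric syntax): "1", "1.5", "1e10", "1/4", "#b101", "#o17",
 * "#xDF", "#e1e1", "+inf.0", "-nan.0". Only digit/base fit is
 * checked here; turning the text into an actual value is the
 * number parser's job.
 */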
#[inline(always)]
fn seek_end_of_block_comment(&mut self) -> Result<LexToken, LexError> {
loop {
if self.advance_char().is_none() {
return Err(LexError(E_UNCLOSED_COMMENT,
self.current_token_start, self.document.clone()))
}
/* peek instead of consuming the char after '|', so a run
 * like "||#" still finds its "|#" terminator
 */
if self.current_char() == '|' && self.peek_next_char() == Some('#') {
self.advance_char();
return self.cut_new_token(LexTokenType::Comment)
}
}
}
#[inline(always)]
fn seek_end_of_line_comment(&mut self, directive: bool) -> Result<LexToken, LexError> {
loop {
if self.advance_char().is_none() {
return Err(LexError(E_UNCLOSED_COMMENT,
self.current_token_start, self.document.clone()))
}
match self.current_char() {
'\n' if !directive => return self.cut_new_token(LexTokenType::Comment),
'\n' if directive => return self.cut_new_token(LexTokenType::Directive),
_ => continue,
};
}
}
#[inline(always)]
fn seek_closing_pipe(&mut self) -> Result<LexToken, LexError> {
loop {
if self.advance_char().is_none() {
return Err(LexError(E_NO_CLOSING_PIPE,
self.current_token_start, self.document.clone()));
}
let c = self.current_char();
match c {
'\\' => self.seek_end_of_escape(false)?,
'|' => return self.cut_new_token(LexTokenType::Symbol),
_ if c.is_alphanumeric() => continue,
_ if LEX_SPECIAL.contains(&c) => continue,
_ if c == ' ' || c == '\n' => continue,
// quote case caught here
_ => return Err(LexError(E_INCOMPREHENSIBLE,
self.current_token_start, self.document.clone())),
};
}
}
#[inline(always)]
fn seek_end_from_hash(&mut self) -> Result<LexToken, LexError> {
let c = self.advance_char().map(|_| self.current_char());
if let Some(ch) = c {
match ch {
't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
'|' => return self.seek_end_of_block_comment(),
'!' => return self.seek_end_of_line_comment(true),
'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
return self.cut_new_token(LexTokenType::ByteVectorStart),
'(' => return self.cut_new_token(LexTokenType::VectorStart),
'\\' => return self.seek_end_of_escape(false)
.and_then(|_| self.cut_new_token(LexTokenType::Char)),
_ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(),
'i' | 'e' => return self.seek_end_of_number(),
_ => return Err(LexError(E_INCOMPREHENSIBLE,
self.current_token_start, self.document.clone())),
}
} else {
Err(LexError(E_NO_END_TO_HASH, self.current_token_start, self.document.clone()))
}
}
// Does not return a token; only the caller knows what actually
// needs to be returned.
#[inline(always)]
fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> {
// little helper to deduplicate logic for advancing characters
macro_rules! adv {
() => {
if self.advance_char().is_none() {
let mut error_msg = E_CHAR_TRUNCATED;
if in_string { error_msg = E_STRING_TRUNCATED; }
Err(LexError(error_msg, self.current_token_start,
self.document.clone()))
} else { Ok(()) }
};
}
let delim = |x| -> bool {
in_string || TOK_DELIMITERS.contains(&x)
};
// advance char once
adv!()?;
/* if match_chunk_next fails then the index is unmoved,
 * allowing us to treat this like a single char escape; the
 * match value below is intentionally discarded, since only the
 * side effect of advancing past a matched chunk matters
 */
match self.current_char() {
// char escapes
'a' if !in_string => self.match_chunk_next("larm"),
'b' if !in_string => self.match_chunk_next("ackspace"),
'd' if !in_string => self.match_chunk_next("elete"),
'e' if !in_string => self.match_chunk_next("scape"),
// `.or` is eager: both probes run, but a failed probe restores
// the index, so trying "ull" alongside "ewline" is harmless
'n' if !in_string => self.match_chunk_next("ewline").or(
self.match_chunk_next("ull")
),
'r' if !in_string => self.match_chunk_next("eturn"),
's' if !in_string => self.match_chunk_next("pace"),
't' if !in_string => self.match_chunk_next("ab"),
// string escapes
'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
// both
'x' => {
// we look for TWO hex digits
adv!()?;
self.current_char().to_digit(16)
.ok_or(LexError(E_BAD_HEX, self.current_index,
self.document.clone()))?;
adv!()?;
self.current_char().to_digit(16)
.ok_or(LexError(E_BAD_HEX, self.current_index,
self.document.clone()))?;
None
},
// catchalls
_ if !in_string => None,
_ => return Err(LexError(E_UNSUPPORTED_ESC, self.current_index,
self.document.clone())),
};
let saved_idx = self.current_index;
if saved_idx == self.document.len() - 1 {
return Ok(())
}
// make sure next character is a proper delimiter
adv!().and_then(|_| if !delim(self.current_char()) {
return Err(LexError(E_UNDELIMITED_ESC, self.current_index,
self.document.clone()))
} else { self.current_index = saved_idx; Ok(()) })
}
/* Called to output a token by the iterator implementation
 * I don't think this has to be inlined. The other ones are inlined to
* prevent the process of parsing a token from being slowed down by
* so many stack frames. This one is called once per token.
*/
fn seek_next_token(&mut self) -> Result<LexToken, LexError> {
let mut output: Option<Result<LexToken, LexError>> = None;
if self.current_index >= self.document.len() {
return Err(LexError(E_END_OF_DOCUMENT,
self.document.len(), self.document.clone()));
}
while LEX_WHITESPACE.contains(&self.current_char()) {
if self.advance_char().is_none() {
return Err(LexError(E_END_OF_DOCUMENT,
self.document.len(), self.document.clone()));
}
}
self.current_token_start = self.current_index;
// handle syntactic sugar cases
match self.current_char() {
';' => output = Some(self.seek_end_of_line_comment(false)),
'\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
'`' => output = Some(self.cut_new_token(LexTokenType::QuasiQuote)),
'(' => output = Some(self.cut_new_token(LexTokenType::ListStart)),
')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
'#' => output = Some(self.seek_end_from_hash()),
'"' => output = Some(self.seek_end_of_string()),
/* This code is commented out. I don't think you can open a char without '#'
* '\\' => output = Some(self.seek_end_of_escape(false)
.and_then(|_|
self.cut_new_token(LexTokenType::Char))),*/
'|' => output = Some(self.seek_closing_pipe()),
'+' | '-' => output = Some(self.seek_end_of_number()),
_ if self.current_char().is_numeric() => output =
Some(self.seek_end_of_number()),
_ => (),
}
if output.is_none() {
if self.current_char() == '.' {
if let Some(x) = self.peek_next_char() && x == ' ' {
output = Some(self.cut_new_token(LexTokenType::Dot));
} /* else {
output = Some(Err(LexError(E_BAD_DOT, self.current_index)));
} SYKE! It could be a symbol... */
}
if self.current_char() == ',' {
if let Some(x) = self.peek_next_char() && x == '@' {
self.advance_char();
output = Some(self.cut_new_token(LexTokenType::UnquoteSplice));
} else {
output = Some(self.cut_new_token(LexTokenType::Unquote));
}
}
}
/* Broken out into a separate case to maintain precedence of the
* unquote syntax and dotted notation.
*/
if output.is_none() {
loop {
let c = self.current_char();
if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
output = Some(Err(LexError(E_INCOMPREHENSIBLE,
self.current_index, self.document.clone())));
break;
}
if let Some(c) = self.peek_next_char() {
if c == ' ' || c == ')' {
output = Some(self.cut_new_token(LexTokenType::Symbol));
break;
}
self.advance_char().unwrap();
} else {
output = Some(self.cut_new_token(LexTokenType::Symbol));
break;
}
}
}
if let Some(ref res) = output {
if let Err(ref e) = res {
self.has_error_state = Some(e.clone());
}
}
output.unwrap()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_token_evaluations() {
// indexed by LexTokenType
let cases: [(Vec<&str>, Vec<&str>); LexTokenType::NumTypes as usize] = [
/* String Cases */ (
// HAPPY CASES
vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"",
"\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"",
"\"\"", "\"\\\" \\\"\""],
// SAD CASES
vec!["\"sdf"]
),
/* Number Cases */ (
// HAPPY CASES
vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
"#e1e1", "#i1/4", "+inf.0", "1e1"],
// SAD CASES
vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
),
/* Char Cases */ (
// HAPPY CASES
vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
"#\\alarm", "#\\s", "#\\x20"],
// SAD CASES
vec!["\\c", "\\x20"]
),
/* Identifier Cases */ (
// HAPPY CASES
vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
"list->vector", "|two words|", "|two\nwords|",
"the-word-recursion-has-many-meanings"],
// SAD CASES
vec!["|\"\"|", "|(|", "|valid"]
),
/* Vector Start Cases */ (
// HAPPY CASES
vec!["#("],
// SAD CASES
vec![]
),
/* Byte Vector Cases */ (
// HAPPY CASES
vec!["#u8("],
// SAD CASES
vec!["#u8", "#u9", "#u("]
),
/* List Start Cases */ (
// HAPPY CASES
vec!["("],
// SAD CASES
vec![]
),
/* Collection End Cases */ (
// HAPPY CASES
vec![")"],
// SAD CASES
vec![]
),
/* Boolean Cases */ (
// HAPPY CASES
vec!["#t", "#f"],
// SAD CASES
vec![]
),
/* Dot Cases */ (
// HAPPY CASES
vec![" . "],
// SAD CASES
vec![]
),
/* Comment cases */ (
// HAPPY CASES
vec!["; (\n", "; #\n", ";\"\n", "#| ; ( \" |#"],
// SAD CASES
vec!["#|", "; "]
),
/* Directive cases */ (
// HAPPY CASES
vec!["#!test-directive\n"],
// SAD CASES
vec!["#!test-directive"]
),
/* Quote cases */ (
// HAPPY CASES
vec!["'"],
// SAD CASES
vec![]
),
/* QuasiQuote cases */ (
// HAPPY CASES
vec!["`"],
// SAD CASES
vec![]
),
/* Unquote cases */ (
// HAPPY CASES
vec![",x", ","],
// SAD CASES
vec![]
),
/* UnquoteSplice cases */ (
// HAPPY CASES
vec![",@x", ",@(", ",@", ",@(two)"],
// SAD CASES
vec![]
),
];
let no_subtoken_check_cases = [
LexTokenType::Dot as u8,
LexTokenType::Unquote as u8,
LexTokenType::UnquoteSplice as u8
];
cases.iter().enumerate().for_each(|(idx, case)| {
println!("+ Testing {:#?} Cases...", LexTokenType::try_from(idx as u8).unwrap());
case.0.iter()
.for_each(|subcase| {
println!(" - happy case: {}", subcase);
let token = Lexer::from(Rc::from(*subcase))
.next()
.unwrap();
assert_eq!(token.token_type,
LexTokenType::try_from(idx as u8)
.unwrap());
if no_subtoken_check_cases.contains(&(idx as u8)) {
/* DO NOTHING, ignore the dot case since its subcase is
* a superset of the actual token substring
*/
} else {
assert_eq!(&token.source_doc[token.start_idx..token.end_idx],
*subcase)
}
});
case.1.iter()
.for_each(|subcase| {
println!(" - sad case: {}", subcase);
assert!(Lexer::from(Rc::from(*subcase)).next().is_none())
});
});
}
#[test]
fn test_multi_token_iter() {
let mut res = vec![];
Lexer::from(Rc::from("( one two three )"))
.into_iter()
.collect_into(&mut res);
assert_eq!(res.len(), 5);
assert_eq!(res[0].token_type, LexTokenType::ListStart);
assert_eq!(res[0].start_idx, 0);
assert_eq!(res[0].end_idx, 1);
assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");
assert_eq!(res[1].token_type, LexTokenType::Symbol);
assert_eq!(res[1].start_idx, 2);
assert_eq!(res[1].end_idx, 5);
assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "one");
assert_eq!(res[2].token_type, LexTokenType::Symbol);
assert_eq!(res[2].start_idx, 6);
assert_eq!(res[2].end_idx, 9);
assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], "two");
assert_eq!(res[3].token_type, LexTokenType::Symbol);
assert_eq!(res[3].start_idx, 10);
assert_eq!(res[3].end_idx, 15);
assert_eq!(&res[3].source_doc[res[3].start_idx..res[3].end_idx], "three");
assert_eq!(res[4].token_type, LexTokenType::CollectionEnd);
assert_eq!(res[4].start_idx, 16);
assert_eq!(res[4].end_idx, 17);
assert_eq!(&res[4].source_doc[res[4].start_idx..res[4].end_idx], ")");
}
#[test]
fn test_error_state_blocking() {
let mut l = Lexer::from(Rc::from("( 1 2.2.2 valid_token"))
.into_iter();
assert!(l.next().is_some());
assert!(l.next().is_some());
assert!(l.next().is_none());
assert!(l.has_error_state.is_some());
assert!(l.next().is_none());
assert!(l.has_error_state.is_some());
}
#[test]
fn char_lex_with_close() {
let mut res = vec![];
Lexer::from(Rc::from("(#\\a)"))
.into_iter()
.collect_into(&mut res);
assert_eq!(res.len(), 3);
assert_eq!(res[0].token_type, LexTokenType::ListStart);
assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");
assert_eq!(res[1].token_type, LexTokenType::Char);
assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "#\\a");
assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
}
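// Extra sanity checks (minimal sketches, not exhaustive)
#[test]
fn test_tryfrom_discriminant_round_trip() {
for u in 0..LexTokenType::NumTypes as u8 {
assert!(LexTokenType::try_from(u).is_ok());
}
assert!(LexTokenType::try_from(LexTokenType::NumTypes as u8).is_err());
}
#[test]
fn test_skips_leading_whitespace() {
let mut l = Lexer::from(Rc::from("   foo"));
let t = l.next().unwrap();
assert_eq!(t.token_type, LexTokenType::Symbol);
assert_eq!(&t.source_doc[t.start_idx..t.end_idx], "foo");
}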
}