/* Mycelium Scheme * Copyright (C) 2025 Ava Affine * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ use core::fmt; use alloc::rc::Rc; pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/', ':', '<', '=', '>', '?', '@', '^', '_', '~', '.']; pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r']; pub const NUMERICAL_EXTRA: [char; 3] = ['.', 'i', 'e']; pub const NUMERICAL_BASE: [char; 3] = ['d', 'o', 'b']; pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote"; pub const E_TOO_MANY_DECIMALS: &str = "number can only have one of {i e .}"; pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren"; pub const E_UNCLOSED_COMMENT: &str = "block comment has no end"; pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe"; pub const E_NO_END_TO_HASH: &str = "expected more input after hash"; pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated"; pub const E_CHAR_TRUNCATED: &str = "character literal is truncated"; pub const E_STRING_TRUNCATED: &str = "string literal is truncated"; pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis"; pub const E_UNIMPLEMENTED_HEX: &str = "hexadecimal literals not supported"; pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base"; pub const E_UNSUPPORTED_ESC: &str = "unsupported escape"; pub const E_BAD_DOT: &str = "expected space after dot in dotted notation"; pub const E_INCOMPREHENSIBLE: &str = "token does not lex"; pub const E_END_OF_DOCUMENT: &str = "no additional input left in document"; /* LexError * 0: error string * 1: index into document * 2: document in question */ #[derive(Clone)] pub struct LexError(pub &'static str, pub usize, pub Rc); impl fmt::Display for LexError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let err_snippet_start = || -> usize { /* backtrack from current index until we either hit * - beginning of line * - 25 characters ago * - the doc Start */ if self.2.len() < 25 { 0 } else { let mut idx = self.1; while self.1 - idx > 25 { idx -= 1; if self.2[idx..] .char_indices() .next() .is_some_and(|(i, x)| x == '\n' && i == idx) { idx += 1; break; } } idx } }; let err_snippet_end = || -> usize { /* read through document until we either hit * - end of line * - 25 characters forward * - the doc end */ if self.2.len() - self.1 < 25 { self.2.len() } else { let mut idx = self.1; while idx - self.1 < 25 { idx += 1; if self.2[idx..] .char_indices() .next() .is_some_and(|(i, x)| x == '\n' && i == idx) { break; } } idx } }; write!(f, "Error when lexing document here:\n\n")?; write!(f, " {}\n", &self.2[err_snippet_start()..err_snippet_end()])?; write!(f, "Error: {}\n", self.0) } } #[repr(u8)] #[derive(Debug, PartialEq, Clone)] pub enum LexTokenType { String = 0, Number, Char, Symbol, VectorStart, ByteVectorStart, ListStart, CollectionEnd, Boolean, Dot, Comment, Directive, Quote, QuasiQuote, Unquote, UnquoteSpliceTemplate, NumTypes, } impl TryFrom for LexTokenType { type Error = &'static str; fn try_from(u: u8) -> Result { if u >= LexTokenType::NumTypes as u8 { Err("out of token type range") } else { unsafe { Ok(core::mem::transmute(u)) } } } } #[derive(Clone)] pub struct LexToken { pub token_type: LexTokenType, pub start_idx: usize, pub end_idx: usize, pub source_doc: Rc, } pub struct Lexer { document: Rc, current_index: usize, current_token_start: usize, pub has_error_state: Option, } impl From> for Lexer { fn from(s: Rc) -> Lexer { Lexer { document: Rc::from(s), current_index: 0, current_token_start: 0, has_error_state: None, } } } impl Iterator for Lexer { type Item = LexToken; fn next(&mut self) -> Option { if self.has_error_state.is_some() { return None; } let res = self.seek_next_token(); if let Err(ref e) = res { self.has_error_state = Some(e.clone()); } return res.ok() } } impl Lexer { // I just didnt want to write and rewrite this... #[inline(always)] fn current_char(&mut self) -> char { self.document.as_bytes()[self.current_index] as char } #[inline(always)] fn peek_next_char(&mut self) -> Option { if let Some((_, ch)) = self.document[self.current_index+1..] .char_indices() .next() { Some(ch) } else { None } } #[inline(always)] fn advance_char(&mut self) -> Option<()> { self.current_index += 1; if let Some((idx, _)) = self.document[self.current_index..] .char_indices() .next() { self.current_index = idx + self.current_index; Some(()) } else { self.current_index = self.document.len(); None } } #[inline(always)] fn match_chunk_next(&mut self, chunk: &str) -> Option { for i in chunk.chars() { self.advance_char()?; if i != self.current_char() { return Some(false) } } Some(true) } /* TODO * I figured this function would be useful for supporting hexadec encoding * later down the line. We can use this instead of the base check in the * number function. #[inline(always)] fn next_chars_allowed(&mut self, len: usize, allowed: &str) -> Option { let mut i = len; while i < 0 { if !allowed.contains(self.current_char()) { return Some(false) } i -= 1; self.advance_char()?; } Some(true) } */ #[inline(always)] fn cut_new_token(&mut self, t: LexTokenType) -> Result { let next_idx = self.advance_char() .and_then(|_| Some(self.current_index)) .or(Some(self.document.len())) .unwrap(); let l = LexToken{ token_type: t, start_idx: self.current_token_start, end_idx: next_idx, source_doc: self.document.clone(), }; self.current_token_start = 0; return Ok(l); } #[inline(always)] fn seek_end_of_string(&mut self) -> Result { // TODO: support escaped quotes loop { if let None = self.advance_char() { return Err(LexError(E_NO_MATCHING_QUOTE, self.current_token_start, self.document.clone())) } else if self.current_char() == '"' { return self.cut_new_token(LexTokenType::String) } } } #[inline(always)] fn seek_end_of_number(&mut self) -> Result { let mut base = 10; let a = self.current_char(); if NUMERICAL_BASE.contains(&a) { if let None = self.advance_char() { return Err(LexError(E_NUMBER_TRUNCATED, self.current_token_start, self.document.clone())) } match a { 'd' => base = 10, 'o' => base = 8, 'b' => base = 2, _ => (), } } let mut hasdot = false; loop { let a = self.current_char(); if NUMERICAL_EXTRA.contains(&a) { if hasdot || base < 10 { return Err(LexError(E_TOO_MANY_DECIMALS, self.current_token_start, self.document.clone())) } hasdot = true; } else if a == ' ' || a == ')' { // back up one self.current_index -= 1; return self.cut_new_token(LexTokenType::Number) } else if !a.is_numeric() { return Err(LexError(E_INCOMPREHENSIBLE, self.current_token_start, self.document.clone())) } else if a.to_digit(10).unwrap() >= base { return Err(LexError(E_NUMER_BASE_ERR, self.current_token_start, self.document.clone())) } if let None = self.advance_char() { self.current_index = self.document.len() - 1; return self.cut_new_token(LexTokenType::Number) } } } #[inline(always)] fn seek_end_of_block_comment(&mut self) -> Result { loop { if let None = self.advance_char() { return Err(LexError(E_UNCLOSED_COMMENT, self.current_token_start, self.document.clone())) } match self.current_char() { '|' if self.advance_char().and_then(|_| if self.current_char() == '#' { return Some(()) } else { return None }).is_some() => return self.cut_new_token(LexTokenType::Comment), _ => continue, }; } } #[inline(always)] fn seek_end_of_line_comment(&mut self, directive: bool) -> Result { loop { if let None = self.advance_char() { return Err(LexError(E_UNCLOSED_COMMENT, self.current_token_start, self.document.clone())) } match self.current_char() { '\n' if !directive => return self.cut_new_token(LexTokenType::Comment), '\n' if directive => return self.cut_new_token(LexTokenType::Directive), _ => continue, }; } } #[inline(always)] fn seek_closing_pipe(&mut self) -> Result { loop { if let None = self.advance_char() { return Err(LexError(E_NO_CLOSING_PIPE, self.current_token_start, self.document.clone())); } let c = self.current_char(); match self.current_char() { '\\' => self.seek_end_of_escape(false)?, '|' => return self.cut_new_token(LexTokenType::Symbol), _ if c.is_alphanumeric() => continue, _ if LEX_SPECIAL.contains(&c) => continue, _ if c == ' ' || c == '\n' => continue, // quote case caught here _ => return Err(LexError(E_INCOMPREHENSIBLE, self.current_token_start, self.document.clone())), }; } } #[inline(always)] fn seek_end_from_hash(&mut self) -> Result { let c = self.advance_char().and_then(|_| Some(self.current_char())); if let Some(ch) = c { match ch { 't' | 'f' => return self.cut_new_token(LexTokenType::Boolean), '|' => return self.seek_end_of_block_comment(), '!' => return self.seek_end_of_line_comment(true), 'u' if self.match_chunk_next("8(").is_some_and(|x| x) => return self.cut_new_token(LexTokenType::ByteVectorStart), '(' => return self.cut_new_token(LexTokenType::VectorStart), '\\' => self.seek_end_of_escape(false) .and_then(|_| self.cut_new_token(LexTokenType::Char)), 'x' => return Err(LexError(E_UNIMPLEMENTED_HEX, self.current_index, self.document.clone())), _ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(), _ => return Err(LexError(E_INCOMPREHENSIBLE, self.current_token_start, self.document.clone())), } } else { Err(LexError(E_NO_END_TO_HASH, self.current_token_start, self.document.clone())) } } // DOES NOT RETURN A TOKEN....... // only the caller knows what actually needs to be returned #[inline(always)] fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> { //let delim = if in_string { ';' } else { ' ' }; // Delim and the arg to this function will be useful once we support hexadecimal encoding if let None = self.advance_char() { let mut error_msg = E_CHAR_TRUNCATED; if in_string { error_msg = E_STRING_TRUNCATED; } return Err(LexError(error_msg, self.current_token_start, self.document.clone())) } match self.current_char() { // eat an escaped whitespace or delim ' ' | 'n' | 'r' | 't' | '|' | '\\' | '"' => { () }, 'x' => return Err(LexError(E_UNIMPLEMENTED_HEX, self.current_token_start, self.document.clone())), _ if self.current_char().is_alphabetic() => { () }, _ => return Err(LexError(E_UNSUPPORTED_ESC, self.current_index, self.document.clone())), } return Ok(()) } /* Called to output a token by the iterator implementation * I dont think this has to be inlined. The other ones are inlined to * prevent the process of parsing a token from being slowed down by * so many stack frames. This one is called once per token. */ fn seek_next_token(&mut self) -> Result { let mut output: Option> = None; if self.current_index >= self.document.len() { return Err(LexError(E_END_OF_DOCUMENT, self.document.len(), self.document.clone())); } while LEX_WHITESPACE.contains(&self.current_char()) { if let None = self.advance_char() { return Err(LexError(E_END_OF_DOCUMENT, self.document.len(), self.document.clone())); } } self.current_token_start = self.current_index; // handle syntactic sugar cases match self.current_char() { ';' => output = Some(self.seek_end_of_line_comment(false)), '\'' => output = Some(self.cut_new_token(LexTokenType::Quote)), '`' => output = Some(self.cut_new_token(LexTokenType::QuasiQuote)), '(' => output = Some(self.cut_new_token(LexTokenType::ListStart)), ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)), '#' => output = Some(self.seek_end_from_hash()), '"' => output = Some(self.seek_end_of_string()), '\\' => output = Some(self.seek_end_of_escape(false) .and_then(|_| self.cut_new_token(LexTokenType::Char))), '|' => output = Some(self.seek_closing_pipe()), _ if self.current_char().is_numeric() => output = Some(self.seek_end_of_number()), _ => (), } if output.is_none() { if self.current_char() == '.' { if let Some(x) = self.peek_next_char() && x == ' ' { output = Some(self.cut_new_token(LexTokenType::Dot)); } /* else { output = Some(Err(LexError(E_BAD_DOT, self.current_index))); } SYKE! It could be a symbol... */ } if self.current_char() == ',' { if let Some(x) = self.peek_next_char() && x == '@'{ output = Some(self.cut_new_token(LexTokenType::UnquoteSpliceTemplate)); } else { output = Some(self.cut_new_token(LexTokenType::Unquote)); } } } /* Broken out into a separate case to maintain precedence of the * unquote syntax and dotted notation. */ if output.is_none() { loop { let c = self.current_char(); if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' { output = Some(Err(LexError(E_INCOMPREHENSIBLE, self.current_index, self.document.clone()))); break; } if let Some(c) = self.peek_next_char() { if c == ' ' || c == ')' { output = Some(self.cut_new_token(LexTokenType::Symbol)); break; } self.advance_char().unwrap(); } else { output = Some(self.cut_new_token(LexTokenType::Symbol)); break; } } } if let Some(ref res) = output { if let Err(ref e) = res { self.has_error_state = Some(e.clone()); } } return output.unwrap() } } #[cfg(test)] mod tests { use super::*; #[test] fn test_token_evaluations() { // indexed by LexTokenType let cases: [(Vec<&str>, Vec<&str>); LexTokenType::NumTypes as usize] = [ /* String Cases */ ( // HAPPY CASES vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"", "\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"", "\"\""], // SAD CASES vec!["\"sdf"] ), /* Number Cases */ ( // HAPPY CASES vec!["1", "1.0", "#d1.1", "#o1423", "#b11"], // SAD CASES vec!["1.1.1", "#o9", "#b1.01", "#xADADAD"] ), /* Char Cases */ ( // HAPPY CASES vec!["\\a", "\\t", "\\\"", "#\\t"], // SAD CASES vec!["\\x20"] ), /* Identifier Cases */ ( // HAPPY CASES vec!["...", "+", "+soup+", "<=?", "V17a", "->string", "a34kTMNs", "lambda", "q", "list->vector", "|two words|", "|two\nwords|", "the-word-recursion-has-many-meanings"], // SAD CASES vec!["|\"\"|", "|(|", "|valid"] ), /* Vector Start Cases */ ( // HAPPY CASES vec!["#("], // SAD CASES vec![] ), /* Byte Vector Cases */ ( // HAPPY CASES vec!["#u8("], // SAD CASES vec!["#u8", "#u9", "#u("] ), /* List Start Cases */ ( // HAPPY CASES vec!["("], // SAD CASES vec![] ), /* Collection End Cases */ ( // HAPPY CASES vec![")"], // SAD CASES vec![] ), /* Boolean Cases */ ( // HAPPY CASES vec!["#t", "#f"], // SAD CASES vec![] ), /* Dot Cases */ ( // HAPPY CASES vec![" . "], // SAD CASES vec![] ), /* Comment cases */ ( // HAPPY CASES vec!["; (\n", "; #\n", ";\"\n", "#| ; ( \" |#"], // SAD CASES vec!["#|", "; "] ), /* Directive cases */ ( // HAPPY CASES vec!["#!test-directive\n"], // SAD CASES vec!["#!test-directive"] ), /* Quote cases */ ( // HAPPY CASES vec!["'"], // SAD CASES vec![] ), /* QuasiQuote cases */ ( // HAPPY CASES vec!["`"], // SAD CASES vec![] ), /* Unquote cases */ ( // HAPPY CASES vec![",x", ","], // SAD CASES vec![] ), /* UnquoteSpliceTemplate cases */ ( // HAPPY CASES vec![",@x", ",@(", ",@"], // SAD CASES vec![] ), ]; let no_subtoken_check_cases = [ LexTokenType::Dot as u8, LexTokenType::Unquote as u8, LexTokenType::UnquoteSpliceTemplate as u8 ]; cases.iter().enumerate().for_each(|(idx, case)| { println!("+ Testing {:#?} Cases...", LexTokenType::try_from(idx as u8).unwrap()); case.0.iter() .for_each(|subcase| { println!(" - happy case: {}", subcase); let token = Lexer::from(Rc::from(*subcase)) .next() .unwrap(); assert_eq!(token.token_type, LexTokenType::try_from(idx as u8) .unwrap()); if no_subtoken_check_cases.contains(&(idx as u8)) { /* DO NOTHING, ignore the dot case since its subcase is * a superset of the actual token substring */ } else { assert_eq!(&token.source_doc[token.start_idx..token.end_idx], *subcase) } }); case.1.iter() .for_each(|subcase| { println!(" - sad case: {}", subcase); assert!(Lexer::from(Rc::from(*subcase)).next().is_none()) }); }); } #[test] fn test_multi_token_iter() { let mut res = vec![]; Lexer::from(Rc::from("( one two three )")) .into_iter() .collect_into(&mut res); assert_eq!(res.len(), 5); assert_eq!(res[0].token_type, LexTokenType::ListStart); assert_eq!(res[0].start_idx, 0); assert_eq!(res[0].end_idx, 1); assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "("); assert_eq!(res[1].token_type, LexTokenType::Symbol); assert_eq!(res[1].start_idx, 2); assert_eq!(res[1].end_idx, 5); assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "one"); assert_eq!(res[2].token_type, LexTokenType::Symbol); assert_eq!(res[2].start_idx, 6); assert_eq!(res[2].end_idx, 9); assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], "two"); assert_eq!(res[3].token_type, LexTokenType::Symbol); assert_eq!(res[3].start_idx, 10); assert_eq!(res[3].end_idx, 15); assert_eq!(&res[3].source_doc[res[3].start_idx..res[3].end_idx], "three"); assert_eq!(res[4].token_type, LexTokenType::CollectionEnd); assert_eq!(res[4].start_idx, 16); assert_eq!(res[4].end_idx, 17); assert_eq!(&res[4].source_doc[res[4].start_idx..res[4].end_idx], ")"); } #[test] fn test_error_state_blocking() { let mut l = Lexer::from(Rc::from("( 1 2.2.2 valid_token")) .into_iter(); assert!(l.next().is_some()); assert!(l.next().is_some()); assert!(l.next().is_none()); assert!(l.has_error_state.is_some()); assert!(l.next().is_none()); assert!(l.has_error_state.is_some()); } }