All checks were successful
per-push tests / build (push) Successful in 38s
per-push tests / test-frontend (push) Successful in 31s
per-push tests / test-utility (push) Successful in 34s
per-push tests / timed-decomposer-parse (push) Successful in 26s
per-push tests / test-backend (push) Successful in 30s
Signed-off-by: Ava Affine <ava@sunnypup.io>
900 lines
30 KiB
Rust
/* Mycelium Scheme
 * Copyright (C) 2025 Ava Affine
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

use core::fmt;
use alloc::rc::Rc;

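/* Character classes used by the lexer:
 * - LEX_SPECIAL: non-alphanumeric characters permitted in symbols
 * - LEX_WHITESPACE: skipped between tokens
 * - NUMERICAL_EXTRA: non-digit characters that may appear inside a number
 *   literal (unreferenced in this file; presumably kept public for the parser)
 * - NUMERICAL_BASE: radix prefix characters that can follow '#'
 * - TOK_DELIMITERS: characters that end the current token
 */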
pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/',
                                     ':', '<', '=', '>', '?', '@', '^', '_', '~', '.'];
pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r'];
pub const NUMERICAL_EXTRA: [char; 4] = ['.', 'i', 'e', '/'];
pub const NUMERICAL_BASE: [char; 4] = ['d', 'o', 'b', 'x'];
pub const TOK_DELIMITERS: [char; 5] = [')', ' ', '\t', '\n', '\r'];

pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote";
pub const E_TOO_MANY_DECIMALS: &str = "number can only have one dot";
pub const E_TOO_MANY_SLASH: &str = "number can only have one slash";
pub const E_TOO_MANY_E: &str = "number can only have one e";
pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren";
pub const E_UNCLOSED_COMMENT: &str = "block comment has no end";
pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe";
pub const E_NO_END_TO_HASH: &str = "expected more input after hash";
pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_STRING_TRUNCATED: &str = "string literal is truncated";
pub const E_UNDELIMITED_ESC: &str = "char escape is not delimited";
pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis";
pub const E_CHAR_TOO_LONG: &str = "character literal is too long";
pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base";
pub const E_UNSUPPORTED_ESC: &str = "unsupported escape";
pub const E_BAD_DOT: &str = "expected space after dot in dotted notation";
pub const E_BAD_HEX: &str = "character is not valid hexadecimal notation";
pub const E_INCOMPREHENSIBLE: &str = "token does not lex";
pub const E_END_OF_DOCUMENT: &str = "no additional input left in document";

/* LexError
 * 0: error string
 * 1: index into document
 * 2: document in question
 */
#[derive(Clone)]
pub struct LexError(pub &'static str, pub usize, pub Rc<str>);

impl fmt::Display for LexError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let err_snippet_start = || -> usize {
            /* backtrack from the current index until we hit either
             * - the beginning of the line
             * - 25 characters back
             * - the start of the document
             */
            if self.1 < 25 {
                0
            } else {
                let mut idx = self.1;
                while self.1 - idx < 25 {
                    idx -= 1;
                    // the first char of the slice is the char at idx
                    if self.2[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        idx += 1;
                        break;
                    }
                }

                idx
            }
        };

        let err_snippet_end = || -> usize {
            /* read through the document until we hit either
             * - the end of the line
             * - 25 characters forward
             * - the end of the document
             */
            if self.2.len() - self.1 < 25 {
                self.2.len()
            } else {
                let mut idx = self.1;
                while idx - self.1 < 25 {
                    idx += 1;
                    if self.2[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        break;
                    }
                }

                idx
            }
        };

        write!(f, "Error when lexing document here: (idx: {})\n", self.1)?;
        let s = err_snippet_start();
        let st = self.1 - s;
        write!(f, " {}\n", &self.2[s..err_snippet_end()])?;
        write!(f, " {}^\n", " ".repeat(st))?;
        write!(f, "Error: {}\n", self.0)
    }
}

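/* Illustrative rendering via Display (assuming the document `(foo "bar`
 * failed with E_NO_MATCHING_QUOTE at index 5):
 *
 *     Error when lexing document here: (idx: 5)
 *      (foo "bar
 *           ^
 *     Error: couldn't find matching quote
 */
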
#[repr(u8)]
#[derive(Debug, PartialEq, Clone)]
pub enum LexTokenType {
    String = 0,
    Number,
    Char,
    Symbol,
    VectorStart,
    ByteVectorStart,
    ListStart,
    CollectionEnd,
    Boolean,
    Dot,
    Comment,
    Directive,
    Quote,
    QuasiQuote,
    Unquote,
    UnquoteSplice,
    NumTypes,
}

impl TryFrom<u8> for LexTokenType {
    type Error = &'static str;
    fn try_from(u: u8) -> Result<LexTokenType, Self::Error> {
        if u >= LexTokenType::NumTypes as u8 {
            Err("out of token type range")
        } else {
            // SAFETY: LexTokenType is repr(u8) with contiguous discriminants
            // starting at 0, and u was checked against NumTypes above
            unsafe { Ok(core::mem::transmute(u)) }
        }
    }
}

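/* Illustrative round-trip through the discriminant (Symbol = 3):
 *
 *     assert_eq!(LexTokenType::try_from(3u8), Ok(LexTokenType::Symbol));
 *     assert!(LexTokenType::try_from(LexTokenType::NumTypes as u8).is_err());
 */
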
/* A token is a typed span: source_doc[start_idx..end_idx] is the lexeme. */
#[derive(Clone)]
pub struct LexToken {
    pub token_type: LexTokenType,
    pub start_idx: usize,
    pub end_idx: usize,
    pub source_doc: Rc<str>,
}

pub struct Lexer {
    document: Rc<str>,
    current_index: usize,
    current_token_start: usize,
    pub has_error_state: Option<LexError>,
}

impl From<Rc<str>> for Lexer {
    fn from(s: Rc<str>) -> Lexer {
        Lexer {
            document: s,
            current_index: 0,
            current_token_start: 0,
            has_error_state: None,
        }
    }
}

impl Iterator for Lexer {
    type Item = LexToken;

    fn next(&mut self) -> Option<Self::Item> {
        // once an error (including plain end-of-document) has been
        // recorded, the iterator is fused: it yields None forever after
        if self.has_error_state.is_some() {
            return None;
        }

        let res = self.seek_next_token();
        if let Err(e) = &res {
            self.has_error_state = Some(e.clone());
        }

        res.ok()
    }
}

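/* Illustrative usage (mirrors the tests at the bottom of this file):
 *
 *     let mut lexer = Lexer::from(Rc::from("( one two )"));
 *     for tok in &mut lexer {
 *         let lexeme = &tok.source_doc[tok.start_idx..tok.end_idx];
 *         // dispatch on tok.token_type / lexeme here
 *     }
 *     // after iteration stops, has_error_state says why:
 *     // E_END_OF_DOCUMENT for clean exhaustion, a real lex error otherwise
 */
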
impl Lexer {
    // I just didn't want to write and rewrite this...
    // NOTE: reads a single byte, so it is only correct for ASCII documents
    #[inline(always)]
    fn current_char(&mut self) -> char {
        self.document.as_bytes()[self.current_index] as char
    }

    #[inline(always)]
    fn peek_next_char(&mut self) -> Option<char> {
        self.document[self.current_index + 1..]
            .chars()
            .next()
    }

    #[inline(always)]
    fn advance_char(&mut self) -> Option<()> {
        self.current_index += 1;
        if self.current_index >= self.document.len() {
            return None
        }

        if let Some((idx, _)) = self.document[self.current_index..]
            .char_indices()
            .next() {

            self.current_index = idx + self.current_index;
            Some(())

        } else {
            self.current_index = self.document.len();
            None
        }
    }

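    /* Attempt to match every character of `chunk` immediately after the
     * current character. The contract, as used throughout this file:
     *   - Some(true): matched; the index rests on the last matched char
     *     (restored to the start when `peek` is set)
     *   - Some(false): mismatch; the index is restored
     *   - None: the document ran out; the index is restored
     */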
    #[inline(always)]
    fn match_chunk_next(&mut self, chunk: &str, peek: bool) -> Option<bool> {
        let saved = self.current_index;
        for i in chunk.chars() {
            if let None = self.advance_char() {
                self.current_index = saved;
                return None
            }

            if i != self.current_char() {
                self.current_index = saved;
                return Some(false)
            }
        }

        if peek { self.current_index = saved; }
        Some(true)
    }

    #[inline(always)]
    fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> {
        // consume the final character of the token; end_idx is exclusive,
        // so a failed advance means the token runs to the end of the doc
        let next_idx = self.advance_char()
            .map(|_| self.current_index)
            .unwrap_or(self.document.len());

        let l = LexToken {
            token_type: t,
            start_idx: self.current_token_start,
            end_idx: next_idx,
            source_doc: self.document.clone(),
        };

        self.current_token_start = 0;
        Ok(l)
    }

    #[inline(always)]
    fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_NO_MATCHING_QUOTE,
                        self.current_token_start, self.document.clone()))

            } else if self.current_char() == '\\' {
                self.seek_end_of_escape(true)?;

            } else if self.current_char() == '"' {
                return self.cut_new_token(LexTokenType::String)
            }
        }
    }

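    /* Number shapes accepted here (see the test cases at the bottom):
     * "1", "1.0", "1e1", "-1", "#d1.1", "#o1423", "#b11", "#xDF",
     * "#e1e1", "#i1/4", "+inf.0". A dot, slash, or exponent in a base
     * below 10 is an error, as is more than one of any of them.
     */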
    #[inline(always)]
    fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> {
        let mut base = 10;
        let a = self.current_char();

        if let Some(true) = self.match_chunk_next("inf.0", false) {
            return self.cut_new_token(LexTokenType::Number)
        }

        if let Some(true) = self.match_chunk_next("nan.0", false) {
            return self.cut_new_token(LexTokenType::Number)
        }

        if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' || a == '+' || a == '-' {
            if let None = self.advance_char() {
                return Err(LexError(E_NUMBER_TRUNCATED,
                        self.current_token_start, self.document.clone()))

            // a prefix or sign followed immediately by a delimiter is a
            // truncated number ("#x" or "+" with nothing after it)
            } else if TOK_DELIMITERS.contains(&self.current_char()) {
                return Err(LexError(E_NUMBER_TRUNCATED,
                        self.current_token_start, self.document.clone()))
            }

            match a {
                'x' => base = 16,
                'd' => base = 10,
                'o' => base = 8,
                'b' => base = 2,
                // ignore i or e, number parsers will handle that
                _ => (),
            }
        }

        let mut hasdot = false;
        let mut hasslash = false;
        let mut hase = false;
        loop {
            let a = self.current_char();
            if a == '.' {
                if hasdot || base < 10 {
                    return Err(LexError(E_TOO_MANY_DECIMALS,
                            self.current_token_start, self.document.clone()))
                }
                hasdot = true;

            } else if a == '/' {
                if hasslash || base < 10 {
                    return Err(LexError(E_TOO_MANY_SLASH,
                            self.current_token_start, self.document.clone()))
                }
                hasslash = true;

            // in base 16, 'e' is an ordinary digit, not an exponent marker
            } else if a == 'e' && base != 16 {
                if hase || base < 10 {
                    return Err(LexError(E_TOO_MANY_E,
                            self.current_token_start, self.document.clone()))
                }
                hase = true

            } else if TOK_DELIMITERS.contains(&a) {
                // back up one
                self.current_index -= 1;
                return self.cut_new_token(LexTokenType::Number)

            } else if let None = a.to_digit(base) {
                return Err(LexError(E_NUMER_BASE_ERR,
                        self.current_token_start, self.document.clone()))
            }

            if let None = self.advance_char() {
                self.current_index = self.document.len() - 1;
                return self.cut_new_token(LexTokenType::Number)
            }
        }
    }

    #[inline(always)]
    fn seek_end_of_block_comment(&mut self) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_UNCLOSED_COMMENT,
                        self.current_token_start, self.document.clone()))
            }

            match self.current_char() {
                // peek rather than advance so that a '|' not followed by
                // '#' can still start the next candidate terminator
                '|' if self.peek_next_char() == Some('#') => {
                    self.advance_char();
                    return self.cut_new_token(LexTokenType::Comment)
                },
                _ => continue,
            };
        }
    }

    #[inline(always)]
    fn seek_end_of_line_comment(&mut self, directive: bool) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_UNCLOSED_COMMENT,
                        self.current_token_start, self.document.clone()))
            }

            match self.current_char() {
                '\n' if !directive => return self.cut_new_token(LexTokenType::Comment),
                '\n' if directive => return self.cut_new_token(LexTokenType::Directive),
                _ => continue,
            };
        }
    }

    #[inline(always)]
    fn seek_closing_pipe(&mut self) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_NO_CLOSING_PIPE,
                        self.current_token_start, self.document.clone()));
            }

            let c = self.current_char();
            match c {
                '\\' => self.seek_end_of_escape(false)?,
                '|' => return self.cut_new_token(LexTokenType::Symbol),
                _ if c.is_alphanumeric() => continue,
                _ if LEX_SPECIAL.contains(&c) => continue,
                _ if c == ' ' || c == '\n' => continue,
                // quote case caught here
                _ => return Err(LexError(E_INCOMPREHENSIBLE,
                        self.current_token_start, self.document.clone())),
            };
        }
    }

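    /* '#' begins many forms; dispatch is on the character following it:
     *   #t / #f  -> boolean          #|   -> block comment
     *   #!       -> directive        #u8( -> bytevector start
     *   #(       -> vector start     #\   -> character
     *   #b #o #d #x #i #e -> number prefixes
     */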
    #[inline(always)]
    fn seek_end_from_hash(&mut self) -> Result<LexToken, LexError> {
        let c = self.advance_char().map(|_| self.current_char());
        if let Some(ch) = c {
            match ch {
                't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
                '|' => return self.seek_end_of_block_comment(),
                '!' => return self.seek_end_of_line_comment(true),
                'u' if self.match_chunk_next("8(", false).is_some_and(|x| x) =>
                    return self.cut_new_token(LexTokenType::ByteVectorStart),
                '(' => return self.cut_new_token(LexTokenType::VectorStart),
                '\\' => self.seek_end_of_escape(false)
                    .and_then(|_| self.cut_new_token(LexTokenType::Char)),
                _ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(),
                'i' | 'e' => return self.seek_end_of_number(),
                _ => return Err(LexError(E_INCOMPREHENSIBLE,
                        self.current_token_start, self.document.clone())),
            }
        } else {
            Err(LexError(E_NO_END_TO_HASH, self.current_token_start, self.document.clone()))
        }
    }

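    /* Escape forms recognized below:
     *   in character literals: #\alarm #\backspace #\delete #\escape
     *     #\newline #\null #\return #\space #\tab, plus single characters
     *   in strings: \a \b \t \n \r \" \\
     *   in both: \xHH with exactly two hex digits
     */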
    // DOES NOT RETURN A TOKEN.......
    // only the caller knows what actually needs to be returned
    #[inline(always)]
    fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> {
        // little helper to deduplicate logic for advancing characters
        macro_rules! adv {
            () => {
                if let None = self.advance_char() {
                    let mut error_msg = E_CHAR_TRUNCATED;
                    if in_string { error_msg = E_STRING_TRUNCATED; }
                    Err(LexError(error_msg, self.current_token_start,
                            self.document.clone()))
                } else { Ok(()) }
            };
        }

        let delim = |x| -> bool {
            in_string || TOK_DELIMITERS.contains(&x)
        };

        // advance char once
        adv!()?;

        /* if match_chunk_next fails then the index is unmoved,
         * allowing us to treat this like a single char escape
         */
        match self.current_char() {
            // char escapes
            'a' if !in_string => self.match_chunk_next("larm", false),
            'b' if !in_string => self.match_chunk_next("ackspace", false),
            'd' if !in_string => self.match_chunk_next("elete", false),
            'e' if !in_string => self.match_chunk_next("scape", false),
            // try "newline" first and only fall back to "null" when it did
            // not match (a plain `.or` would also run eagerly and discard
            // the fallback on Some(false))
            'n' if !in_string => match self.match_chunk_next("ewline", false) {
                Some(true) => Some(true),
                _ => self.match_chunk_next("ull", false),
            },
            'r' if !in_string => self.match_chunk_next("eturn", false),
            's' if !in_string => self.match_chunk_next("pace", false),
            't' if !in_string => self.match_chunk_next("ab", false),
            // specifically catch a non hex 'x' character escape
            'x' if self.peek_next_char()
                .is_none_or(|c| TOK_DELIMITERS.contains(&c)) && !in_string
                => None,

            // string escapes
            'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,

            // both
            'x' => {
                // we look for TWO hex digits
                adv!()?;
                self.current_char().to_digit(16)
                    .ok_or(LexError(E_BAD_HEX, self.current_index,
                            self.document.clone()))?;
                adv!()?;
                self.current_char().to_digit(16)
                    .ok_or(LexError(E_BAD_HEX, self.current_index,
                            self.document.clone()))?;
                None
            },

            // catchalls
            _ if !in_string => None,
            _ => return Err(LexError(E_UNSUPPORTED_ESC, self.current_index,
                    self.document.clone())),
        };

        let saved_idx = self.current_index;
        if saved_idx == self.document.len() - 1 {
            return Ok(())
        }

        // make sure next character is a proper delimiter
        adv!().and_then(|_| if !delim(self.current_char()) {
            Err(LexError(E_UNDELIMITED_ESC, self.current_index,
                    self.document.clone()))
        } else { self.current_index = saved_idx; Ok(()) })
    }

    /* Called to output a token by the iterator implementation.
     * I don't think this has to be inlined. The other ones are inlined to
     * prevent the process of parsing a token from being slowed down by
     * so many stack frames. This one is called once per token.
     */
    fn seek_next_token(&mut self) -> Result<LexToken, LexError> {
        let mut output: Option<Result<LexToken, LexError>> = None;

        if self.current_index >= self.document.len() {
            return Err(LexError(E_END_OF_DOCUMENT,
                    self.document.len(), self.document.clone()));
        }

        while LEX_WHITESPACE.contains(&self.current_char()) {
            if let None = self.advance_char() {
                return Err(LexError(E_END_OF_DOCUMENT,
                        self.document.len(), self.document.clone()));
            }
        }

        self.current_token_start = self.current_index;

        // true when $x starts a number: a digit, or the body of an
        // "inf.0" / "nan.0" literal (checked without moving the index)
        macro_rules! numeric {
            ( $x:expr ) => {
                $x.is_numeric()
                    || self.match_chunk_next("inf.0", true) == Some(true)
                    || self.match_chunk_next("nan.0", true) == Some(true)
            };
        }

        match self.current_char() {
            ';' => output = Some(self.seek_end_of_line_comment(false)),
            '\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
            '`' => output = Some(self.cut_new_token(LexTokenType::QuasiQuote)),
            '(' => output = Some(self.cut_new_token(LexTokenType::ListStart)),
            ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
            '#' => output = Some(self.seek_end_from_hash()),
            '"' => output = Some(self.seek_end_of_string()),
            '|' => output = Some(self.seek_closing_pipe()),
            '+' | '-' if self.peek_next_char()
                .is_some_and(|x| numeric!(x)) => output = Some(self.seek_end_of_number()),
            _ if self.current_char().is_numeric() => output =
                Some(self.seek_end_of_number()),
            _ => (),
        }

        if output.is_none() {
            if self.current_char() == '.' {
                if let Some(x) = self.peek_next_char() && x == ' ' {
                    output = Some(self.cut_new_token(LexTokenType::Dot));
                } /* else {
                    output = Some(Err(LexError(E_BAD_DOT, self.current_index)));
                } SYKE! It could be a symbol... */
            }

            if self.current_char() == ',' {
                if let Some(x) = self.peek_next_char() && x == '@' {
                    self.advance_char();
                    output = Some(self.cut_new_token(LexTokenType::UnquoteSplice));
                } else {
                    output = Some(self.cut_new_token(LexTokenType::Unquote));
                }
            }
        }

        /* Broken out into a separate case to maintain precedence of the
         * unquote syntax and dotted notation.
         */
        if output.is_none() {
            loop {
                let c = self.current_char();
                if !c.is_alphanumeric() &&
                    !LEX_SPECIAL.contains(&c) &&
                    !TOK_DELIMITERS.contains(&c) {

                    output = Some(Err(LexError(E_INCOMPREHENSIBLE,
                            self.current_index, self.document.clone())));
                    break;
                }

                if let Some(c) = self.peek_next_char() {
                    if TOK_DELIMITERS.contains(&c) {
                        output = Some(self.cut_new_token(LexTokenType::Symbol));
                        break;
                    }

                    self.advance_char().unwrap();
                } else {
                    output = Some(self.cut_new_token(LexTokenType::Symbol));
                    break;
                }
            }
        }

        if let Some(ref res) = output {
            if let Err(e) = &res {
                self.has_error_state = Some(e.clone());
            }
        }

        output.unwrap()
    }
}

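/* The tests below drive the lexer over happy and sad inputs for every
 * token type. Note: they use `collect_into`, which is unstable at the
 * time of writing (`iter_collect_into`), so the suite assumes a nightly
 * toolchain.
 */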
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_evaluations() {
        // indexed by LexTokenType
        let cases: [(Vec<&str>, Vec<&str>); LexTokenType::NumTypes as usize] = [
            /* String Cases */ (
                // HAPPY CASES
                vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"",
                    "\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"",
                    "\"\"", "\"\\\" \\\"\""],

                // SAD CASES
                vec!["\"sdf"]
            ),

            /* Number Cases */ (
                // HAPPY CASES
                vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
                    "#e1e1", "#i1/4", "+inf.0", "1e1", "-1"],

                // SAD CASES
                vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
            ),

            /* Char Cases */ (
                // HAPPY CASES
                vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
                    "#\\alarm", "#\\s", "#\\x20", "#\\x", "#\\\\"],

                // SAD CASES
                vec!["\\c", "\\x20"]
            ),

            /* Identifier Cases */ (
                // HAPPY CASES
                vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
                    "list->vector", "|two words|", "|two\nwords|",
                    "the-word-recursion-has-many-meanings", "+", "-",
                    "slatex.*slatex*"],

                // SAD CASES
                vec!["|\"\"|", "|(|", "|valid"]
            ),

            /* Vector Start Cases */ (
                // HAPPY CASES
                vec!["#("],

                // SAD CASES
                vec![]
            ),

            /* Byte Vector Cases */ (
                // HAPPY CASES
                vec!["#u8("],

                // SAD CASES
                vec!["#u8", "#u9", "#u("]
            ),

            /* List Start Cases */ (
                // HAPPY CASES
                vec!["("],

                // SAD CASES
                vec![]
            ),

            /* Collection End Cases */ (
                // HAPPY CASES
                vec![")"],

                // SAD CASES
                vec![]
            ),

            /* Boolean Cases */ (
                // HAPPY CASES
                vec!["#t", "#f"],

                // SAD CASES
                vec![]
            ),

            /* Dot Cases */ (
                // HAPPY CASES
                vec![" . "],

                // SAD CASES
                vec![]
            ),

            /* Comment Cases */ (
                // HAPPY CASES
                vec!["; (\n", "; #\n", ";\"\n", "#| ; ( \" |#"],

                // SAD CASES
                vec!["#|", "; "]
            ),

            /* Directive Cases */ (
                // HAPPY CASES
                vec!["#!test-directive\n"],

                // SAD CASES
                vec!["#!test-directive"]
            ),

            /* Quote Cases */ (
                // HAPPY CASES
                vec!["'"],

                // SAD CASES
                vec![]
            ),

            /* QuasiQuote Cases */ (
                // HAPPY CASES
                vec!["`"],

                // SAD CASES
                vec![]
            ),

            /* Unquote Cases */ (
                // HAPPY CASES
                vec![",x", ","],

                // SAD CASES
                vec![]
            ),

            /* UnquoteSplice Cases */ (
                // HAPPY CASES
                vec![",@x", ",@(", ",@", ",@(two)"],

                // SAD CASES
                vec![]
            ),
        ];

        let no_subtoken_check_cases = [
            LexTokenType::Dot as u8,
            LexTokenType::Unquote as u8,
            LexTokenType::UnquoteSplice as u8
        ];

        cases.iter().enumerate().for_each(|(idx, case)| {
            println!("+ Testing {:#?} Cases...", LexTokenType::try_from(idx as u8).unwrap());

            case.0.iter()
                .for_each(|subcase| {
                    println!("  - happy case: {}", subcase);
                    let token = Lexer::from(Rc::from(*subcase))
                        .next()
                        .unwrap();
                    assert_eq!(token.token_type,
                            LexTokenType::try_from(idx as u8)
                                .unwrap());
                    if no_subtoken_check_cases.contains(&(idx as u8)) {
                        /* DO NOTHING: for these token types the subcase
                         * string is a superset of the actual token
                         * substring, so there is nothing to compare
                         */
                    } else {
                        assert_eq!(&token.source_doc[token.start_idx..token.end_idx],
                                *subcase)
                    }
                });

            case.1.iter()
                .for_each(|subcase| {
                    println!("  - sad case: {}", subcase);
                    assert!(Lexer::from(Rc::from(*subcase)).next().is_none())
                });
        });
    }

    #[test]
    fn test_multi_token_iter() {
        let mut res = vec![];
        Lexer::from(Rc::from("( one two three )"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 5);

        assert_eq!(res[0].token_type, LexTokenType::ListStart);
        assert_eq!(res[0].start_idx, 0);
        assert_eq!(res[0].end_idx, 1);
        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");

        assert_eq!(res[1].token_type, LexTokenType::Symbol);
        assert_eq!(res[1].start_idx, 2);
        assert_eq!(res[1].end_idx, 5);
        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "one");

        assert_eq!(res[2].token_type, LexTokenType::Symbol);
        assert_eq!(res[2].start_idx, 6);
        assert_eq!(res[2].end_idx, 9);
        assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], "two");

        assert_eq!(res[3].token_type, LexTokenType::Symbol);
        assert_eq!(res[3].start_idx, 10);
        assert_eq!(res[3].end_idx, 15);
        assert_eq!(&res[3].source_doc[res[3].start_idx..res[3].end_idx], "three");

        assert_eq!(res[4].token_type, LexTokenType::CollectionEnd);
        assert_eq!(res[4].start_idx, 16);
        assert_eq!(res[4].end_idx, 17);
        assert_eq!(&res[4].source_doc[res[4].start_idx..res[4].end_idx], ")");
    }

    #[test]
    fn test_error_state_blocking() {
        let mut l = Lexer::from(Rc::from("( 1 2.2.2 valid_token"))
            .into_iter();

        assert!(l.next().is_some());
        assert!(l.next().is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
    }

    #[test]
    fn char_lex_with_close() {
        let mut res = vec![];
        Lexer::from(Rc::from("(#\\a)"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 3);

        assert_eq!(res[0].token_type, LexTokenType::ListStart);
        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");

        assert_eq!(res[1].token_type, LexTokenType::Char);
        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "#\\a");

        assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
        assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
    }

    #[test]
    fn num_lex_plusnum_case() {
        let mut res = vec![];
        Lexer::from(Rc::from("+1"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 1);
        assert_eq!(res[0].token_type, LexTokenType::Number);
    }

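    // An added sketch exercising the signed "nan.0"/"inf.0" guard in
    // seek_next_token (companion to num_lex_plusnum_case above)
    #[test]
    fn num_lex_signed_nan_case() {
        let mut res = vec![];
        Lexer::from(Rc::from("-nan.0"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 1);
        assert_eq!(res[0].token_type, LexTokenType::Number);
    }
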
    #[test]
    fn char_lex_xchar_case() {
        let mut res = vec![];
        Lexer::from(Rc::from("#\\x)"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 2);

        assert_eq!(res[0].token_type, LexTokenType::Char);
        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "#\\x");

        assert_eq!(res[1].token_type, LexTokenType::CollectionEnd);
        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], ")");
    }
}