/* Mycelium Scheme
 * Copyright (C) 2025 Ava Affine
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

use alloc::rc::Rc;
pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/',
                                     ':', '<', '=', '>', '?', '@', '^', '_', '~', '.'];
pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r'];
pub const NUMERICAL_EXTRA: [char; 3] = ['.', 'i', 'e'];
pub const NUMERICAL_BASE: [char; 3] = ['d', 'o', 'b'];

pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote";
pub const E_TOO_MANY_DECIMALS: &str = "number can only have one of {i e .}";
pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren";
pub const E_UNCLOSED_COMMENT: &str = "block comment has no end";
pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe";
pub const E_NO_END_TO_HASH: &str = "expected more input after hash";
pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_STRING_TRUNCATED: &str = "string literal is truncated";
pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis";
pub const E_UNIMPLEMENTED_HEX: &str = "hexadecimal literals not supported";
pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base";
pub const E_UNSUPPORTED_ESC: &str = "unsupported escape";
pub const E_BAD_DOT: &str = "expected space after dot in dotted notation";
pub const E_NO_SPLICE_TEMPL: &str = "expected more input after unquote splicing";
pub const E_INCOMPREHENSIBLE: &str = "token does not lex";
pub const E_END_OF_DOCUMENT: &str = "no additional input left in document";
/* LexError
 * 0: error string
 * 1: index into document
 */
#[derive(Clone)]
pub struct LexError(pub &'static str, pub usize);
#[repr(u8)]
#[derive(Debug, PartialEq)]
pub enum LexTokenType {
    String = 0,
    Number,
    Char,
    Symbol,
    VectorStart,
    ByteVectorStart,
    ListStart,
    CollectionEnd,
    Boolean,
    Dot,
    Comment,
    Directive,
    Quote,
    QuasiQuote,
    Unquote,
    UnquoteSpliceTemplate,
    NumTypes,
}
impl TryFrom<u8> for LexTokenType {
    type Error = &'static str;
    fn try_from(u: u8) -> Result<LexTokenType, Self::Error> {
        if u >= LexTokenType::NumTypes as u8 {
            Err("out of token type range")
        } else {
            // SAFETY: LexTokenType is repr(u8) and u is bounds-checked
            // against NumTypes above, so the transmute stays in range.
            unsafe { Ok(core::mem::transmute(u)) }
        }
    }
}
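
// For illustration, the mapping this yields (given the variant order above):
//     LexTokenType::try_from(0u8)  -> Ok(LexTokenType::String)
//     LexTokenType::try_from(16u8) -> Err("out of token type range")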
pub struct LexToken {
    token_type: LexTokenType,
    start_idx: usize,
    end_idx: usize,
    source_doc: Rc<str>,
}
pub struct Lexer {
    document: Rc<str>,
    current_index: usize,
    current_token_start: usize,
    has_error_state: Option<LexError>,
}
impl From<Rc<str>> for Lexer {
    fn from(s: Rc<str>) -> Lexer {
        Lexer {
            document: s,
            current_index: 0,
            current_token_start: 0,
            has_error_state: None,
        }
    }
}
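
// A minimal usage sketch (mirrors the tests at the bottom of this file):
// the lexer is an Iterator, so a parser can simply pull tokens from it.
//
//     let mut lx = Lexer::from(Rc::from("(+ 1 2)"));
//     while let Some(tok) = lx.next() {
//         /* hand tok to the parser */
//     }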
impl Iterator for Lexer {
    type Item = LexToken;

    fn next(&mut self) -> Option<Self::Item> {
        if self.has_error_state.is_some() {
            return None;
        }

        let res = self.seek_next_token();
        if let Err(ref e) = res {
            self.has_error_state = Some(e.clone());
        }

        res.ok()
    }
}
impl Lexer {
    // I just didn't want to write and rewrite this...
    // NOTE: byte-indexed, so this assumes the current character is ASCII.
    #[inline(always)]
    fn current_char(&self) -> char {
        self.document.as_bytes()[self.current_index] as char
    }

    #[inline(always)]
    fn peek_next_char(&self) -> Option<char> {
        self.document[self.current_index + 1..].chars().next()
    }
    #[inline(always)]
    fn advance_char(&mut self) -> Option<()> {
        self.current_index += 1;
        if let Some((idx, _)) = self.document[self.current_index..]
            .char_indices()
            .next() {
            self.current_index += idx;
            Some(())
        } else {
            self.current_index = self.document.len();
            None
        }
    }
    #[inline(always)]
    fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
        for expected in chunk.chars() {
            self.advance_char()?;
            if expected != self.current_char() {
                return Some(false)
            }
        }

        Some(true)
    }
    /* TODO
     * I figured this function would be useful for supporting hexadecimal
     * encoding later down the line. We can use this instead of the base
     * check in the number function.
    #[inline(always)]
    fn next_chars_allowed(&mut self, len: usize, allowed: &str) -> Option<bool> {
        let mut i = len;
        while i > 0 {
            if !allowed.contains(self.current_char()) {
                return Some(false)
            }

            i -= 1;
            self.advance_char()?;
        }

        Some(true)
    }
    */
    #[inline(always)]
    fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> {
        // the token's end index is the start of the next character, or the
        // end of the document when input is exhausted
        let next_idx = self.advance_char()
            .map(|_| self.current_index)
            .unwrap_or(self.document.len());

        let l = LexToken {
            token_type: t,
            start_idx: self.current_token_start,
            end_idx: next_idx,
            source_doc: self.document.clone(),
        };

        self.current_token_start = 0;
        Ok(l)
    }
    #[inline(always)]
    fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> {
        // TODO: support escaped quotes
        loop {
            if self.advance_char().is_none() {
                return Err(LexError(E_NO_MATCHING_QUOTE, self.current_token_start))
            } else if self.current_char() == '"' {
                return self.cut_new_token(LexTokenType::String)
            }
        }
    }
    #[inline(always)]
    fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> {
        let mut base = 10;
        let a = self.current_char();
        if NUMERICAL_BASE.contains(&a) {
            if self.advance_char().is_none() {
                return Err(LexError(E_NUMBER_TRUNCATED, self.current_token_start))
            }
            match a {
                'd' => base = 10,
                'o' => base = 8,
                'b' => base = 2,
                _ => (),
            }
        }

        let mut hasdot = false;
        loop {
            let a = self.current_char();
            if NUMERICAL_EXTRA.contains(&a) {
                if hasdot || base < 10 {
                    return Err(LexError(E_TOO_MANY_DECIMALS, self.current_token_start))
                }
                hasdot = true;

            } else if a == ' ' || a == ')' {
                // back up one so the delimiter is not re-consumed
                // TODO: '\n' and '\t' do not terminate a number yet
                self.current_index -= 1;
                return self.cut_new_token(LexTokenType::Number)

            } else if !a.is_numeric() {
                return Err(LexError(E_INCOMPREHENSIBLE, self.current_token_start))

            } else if a.to_digit(10).unwrap() >= base {
                return Err(LexError(E_NUMER_BASE_ERR, self.current_token_start))
            }

            if self.advance_char().is_none() {
                self.current_index = self.document.len() - 1;
                return self.cut_new_token(LexTokenType::Number)
            }
        }
    }
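
    // A quick map of the paths above (cf. the number cases in the tests):
    //     "1"     -> Number, base 10
    //     "#o17"  -> Number, base 8 (the 'o' was consumed before the digits)
    //     "#b2"   -> E_NUMER_BASE_ERR, since 2 is not a base-2 digit
    //     "#o1.1" -> E_TOO_MANY_DECIMALS, '.' is only legal in base 10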
    #[inline(always)]
    fn seek_end_of_block_comment(&mut self) -> Result<LexToken, LexError> {
        loop {
            if self.advance_char().is_none() {
                return Err(LexError(E_UNCLOSED_COMMENT, self.current_token_start))
            }

            match self.current_char() {
                // a '|' immediately followed by '#' closes the comment
                '|' if self.advance_char().is_some() && self.current_char() == '#' =>
                    return self.cut_new_token(LexTokenType::Comment),
                _ => continue,
            };
        }
    }
    #[inline(always)]
    fn seek_end_of_line_comment(&mut self, directive: bool) -> Result<LexToken, LexError> {
        loop {
            if self.advance_char().is_none() {
                return Err(LexError(E_UNCLOSED_COMMENT, self.current_token_start))
            }

            match self.current_char() {
                '\n' if !directive => return self.cut_new_token(LexTokenType::Comment),
                '\n' if directive => return self.cut_new_token(LexTokenType::Directive),
                _ => continue,
            };
        }
    }
    #[inline(always)]
    fn seek_closing_pipe(&mut self) -> Result<LexToken, LexError> {
        loop {
            if self.advance_char().is_none() {
                return Err(LexError(E_NO_CLOSING_PIPE, self.current_token_start));
            }

            let c = self.current_char();
            match c {
                '\\' => self.seek_end_of_escape(false)?,
                '|' => return self.cut_new_token(LexTokenType::Symbol),
                _ if c.is_alphanumeric() => continue,
                _ if LEX_SPECIAL.contains(&c) => continue,
                _ if c == ' ' || c == '\n' => continue,
                // quote case caught here
                _ => return Err(LexError(E_INCOMPREHENSIBLE, self.current_token_start)),
            };
        }
    }
    #[inline(always)]
    fn seek_end_from_hash(&mut self) -> Result<LexToken, LexError> {
        let c = self.advance_char().map(|_| self.current_char());
        if let Some(ch) = c {
            match ch {
                't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
                '|' => return self.seek_end_of_block_comment(),
                '!' => return self.seek_end_of_line_comment(true),
                'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
                    return self.cut_new_token(LexTokenType::ByteVectorStart),
                '(' => return self.cut_new_token(LexTokenType::VectorStart),
                '\\' => self.seek_end_of_escape(false)
                    .and_then(|_| self.cut_new_token(LexTokenType::Char)),
                'x' => return Err(LexError(E_UNIMPLEMENTED_HEX, self.current_index)),
                _ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(),
                _ => return Err(LexError(E_INCOMPREHENSIBLE, self.current_token_start)),
            }
        } else {
            Err(LexError(E_NO_END_TO_HASH, self.current_token_start))
        }
    }
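
    // Dispatch summary for the hash prefix (a reference sketch; cf. the
    // hash-flavored cases in the tests):
    //     "#t" / "#f" -> Boolean        "#u8("    -> ByteVectorStart
    //     "#("        -> VectorStart    "#|...|#" -> Comment
    //     "#d" / "#o" / "#b" -> Number with explicit base; "#x" errors for now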
    // DOES NOT RETURN A TOKEN:
    // only the caller knows what actually needs to be returned
    #[inline(always)]
    fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> {
        //let delim = if in_string { ';' } else { ' ' };
        // delim and the in_string argument will be useful once we support
        // hexadecimal encoding
        if self.advance_char().is_none() {
            let mut error_msg = E_CHAR_TRUNCATED;
            if in_string { error_msg = E_STRING_TRUNCATED; }
            return Err(LexError(error_msg, self.current_token_start))
        }

        match self.current_char() {
            // eat an escaped whitespace or delim
            ' ' | 'n' | 'r' | 't' | '|' | '\\' | '"' => (),
            'x' => return Err(LexError(E_UNIMPLEMENTED_HEX, self.current_token_start)),
            _ if self.current_char().is_alphabetic() => (),
            _ => return Err(LexError(E_UNSUPPORTED_ESC, self.current_index)),
        }

        Ok(())
    }
    /* Called to output a token by the iterator implementation.
     * I don't think this has to be inlined. The other ones are inlined to
     * prevent the process of parsing a token from being slowed down by
     * so many stack frames. This one is called once per token.
     */
    fn seek_next_token(&mut self) -> Result<LexToken, LexError> {
        let mut output: Option<Result<LexToken, LexError>> = None;

        if self.current_index >= self.document.len() {
            return Err(LexError(E_END_OF_DOCUMENT, self.document.len()));
        }

        while LEX_WHITESPACE.contains(&self.current_char()) {
            if self.advance_char().is_none() {
                return Err(LexError(E_END_OF_DOCUMENT, self.document.len()));
            }
        }

        self.current_token_start = self.current_index;
        // handle syntactic sugar cases
        match self.current_char() {
            ';' => output = Some(self.seek_end_of_line_comment(false)),
            '\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
            '`' => output = Some(self.cut_new_token(LexTokenType::QuasiQuote)),
            '(' => output = Some(self.cut_new_token(LexTokenType::ListStart)),
            ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
            '#' => output = Some(self.seek_end_from_hash()),
            '"' => output = Some(self.seek_end_of_string()),
            '\\' => output = Some(self.seek_end_of_escape(false)
                .and_then(|_| self.cut_new_token(LexTokenType::Char))),
            '|' => output = Some(self.seek_closing_pipe()),
            _ if self.current_char().is_numeric() =>
                output = Some(self.seek_end_of_number()),
            _ => (),
        }
        if output.is_none() {
            if self.current_char() == '.' {
                if let Some(x) = self.peek_next_char() && x == ' ' {
                    output = Some(self.cut_new_token(LexTokenType::Dot));
                } /* else {
                    output = Some(Err(LexError(E_BAD_DOT, self.current_index)));
                } SYKE! It could be a symbol... */
            }

            if self.current_char() == ',' {
                if let Some(x) = self.peek_next_char() && x == '@' {
                    // consume the '@' so the splice token covers ",@" and
                    // the '@' cannot leak into the next token
                    self.advance_char();
                    output = Some(self.cut_new_token(LexTokenType::UnquoteSpliceTemplate));
                } else {
                    output = Some(self.cut_new_token(LexTokenType::Unquote));
                }
            }
        }
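
        // e.g. ". " cuts a Dot above, while ".5" or ".foo" falls through to
        // the symbol loop below; ",@x" yields a ",@" splice token and "x"
        // then lexes separately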
        /* Broken out into a separate case to maintain precedence of the
         * unquote syntax and dotted notation.
         */
        if output.is_none() {
            loop {
                let c = self.current_char();
                if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
                    output = Some(Err(LexError(E_INCOMPREHENSIBLE, self.current_index)));
                    break;
                }

                if let Some(c) = self.peek_next_char() {
                    if c == ' ' || c == ')' {
                        output = Some(self.cut_new_token(LexTokenType::Symbol));
                        break;
                    }

                    self.advance_char().unwrap();
                } else {
                    output = Some(self.cut_new_token(LexTokenType::Symbol));
                    break;
                }
            }
        }
        if let Some(ref res) = output {
            if let Err(ref e) = res {
                self.has_error_state = Some(e.clone());
            }
        }

        output.unwrap()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_evaluations() {
        // indexed by LexTokenType
        let cases: [(Vec<&str>, Vec<&str>); LexTokenType::NumTypes as usize] = [
            /* String Cases */ (
                // HAPPY CASES
                vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"",
                     "\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\""],

                // SAD CASES
                vec!["\"sdf"]
            ),

            /* Number Cases */ (
                // HAPPY CASES
                vec!["1", "1.0", "#d1.1", "#o1423", "#b11"],

                // SAD CASES
                vec!["1.1.1", "#o9", "#b1.01", "#xADADAD"]
            ),

            /* Char Cases */ (
                // HAPPY CASES
                vec!["\\a", "\\t", "\\\"", "#\\t"],

                // SAD CASES
                vec!["\\x20"]
            ),

            /* Identifier Cases */ (
                // HAPPY CASES
                vec!["...", "+", "+soup+", "<=?", "V17a", "->string", "a34kTMNs",
                     "lambda", "q", "list->vector", "|two words|", "|two\nwords|",
                     "the-word-recursion-has-many-meanings"],

                // SAD CASES
                vec!["|\"\"|", "|(|", "|valid"]
            ),
            /* Vector Start Cases */ (
                // HAPPY CASES
                vec!["#("],

                // SAD CASES
                vec![]
            ),

            /* Byte Vector Cases */ (
                // HAPPY CASES
                vec!["#u8("],

                // SAD CASES
                vec!["#u8", "#u9", "#u("]
            ),

            /* List Start Cases */ (
                // HAPPY CASES
                vec!["("],

                // SAD CASES
                vec![]
            ),

            /* Collection End Cases */ (
                // HAPPY CASES
                vec![")"],

                // SAD CASES
                vec![]
            ),

            /* Boolean Cases */ (
                // HAPPY CASES
                vec!["#t", "#f"],

                // SAD CASES
                vec![]
            ),
            /* Dot Cases */ (
                // HAPPY CASES
                vec![" . "],

                // SAD CASES
                vec![]
            ),

            /* Comment cases */ (
                // HAPPY CASES
                vec!["; (\n", "; #\n", ";\"\n", "#| ; ( \" |#"],

                // SAD CASES
                vec!["#|", "; "]
            ),

            /* Directive cases */ (
                // HAPPY CASES
                vec!["#!test-directive\n"],

                // SAD CASES
                vec!["#!test-directive"]
            ),

            /* Quote cases */ (
                // HAPPY CASES
                vec!["'"],

                // SAD CASES
                vec![]
            ),

            /* QuasiQuote cases */ (
                // HAPPY CASES
                vec!["`"],

                // SAD CASES
                vec![]
            ),

            /* Unquote cases */ (
                // HAPPY CASES
                vec![",x", ","],

                // SAD CASES
                vec![]
            ),

            /* UnquoteSpliceTemplate cases */ (
                // HAPPY CASES
                vec![",@x", ",@(", ",@"],

                // SAD CASES
                // a bare "," lexes successfully as Unquote (see the happy
                // cases above), so it cannot serve as a sad case here
                vec![]
            ),
        ];
        let no_subtoken_check_cases = [
            LexTokenType::Dot as u8,
            LexTokenType::Unquote as u8,
            LexTokenType::UnquoteSpliceTemplate as u8
        ];
        cases.iter().enumerate().for_each(|(idx, case)| {
            println!("+ Testing {:#?} Cases...", LexTokenType::try_from(idx as u8).unwrap());

            case.0.iter()
                .for_each(|subcase| {
                    println!("  - happy case: {}", subcase);
                    let token = Lexer::from(Rc::from(*subcase))
                        .next()
                        .unwrap();
                    assert_eq!(token.token_type,
                               LexTokenType::try_from(idx as u8)
                                   .unwrap());
                    if no_subtoken_check_cases.contains(&(idx as u8)) {
                        /* DO NOTHING: these subcase strings are supersets of
                         * the actual token substring, so the slice check
                         * below would not apply
                         */
                    } else {
                        assert_eq!(&token.source_doc[token.start_idx..token.end_idx],
                                   *subcase)
                    }
                });

            case.1.iter()
                .for_each(|subcase| {
                    println!("  - sad case: {}", subcase);
                    assert!(Lexer::from(Rc::from(*subcase)).next().is_none())
                });
        });
    }
    #[test]
    fn test_multi_token_iter() {
        // plain collect() instead of the nightly-only collect_into()
        let res: Vec<LexToken> = Lexer::from(Rc::from("( one two three )"))
            .collect();
        assert_eq!(res.len(), 5);

        assert_eq!(res[0].token_type, LexTokenType::ListStart);
        assert_eq!(res[0].start_idx, 0);
        assert_eq!(res[0].end_idx, 1);
        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");

        assert_eq!(res[1].token_type, LexTokenType::Symbol);
        assert_eq!(res[1].start_idx, 2);
        assert_eq!(res[1].end_idx, 5);
        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "one");

        assert_eq!(res[2].token_type, LexTokenType::Symbol);
        assert_eq!(res[2].start_idx, 6);
        assert_eq!(res[2].end_idx, 9);
        assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], "two");

        assert_eq!(res[3].token_type, LexTokenType::Symbol);
        assert_eq!(res[3].start_idx, 10);
        assert_eq!(res[3].end_idx, 15);
        assert_eq!(&res[3].source_doc[res[3].start_idx..res[3].end_idx], "three");

        assert_eq!(res[4].token_type, LexTokenType::CollectionEnd);
        assert_eq!(res[4].start_idx, 16);
        assert_eq!(res[4].end_idx, 17);
        assert_eq!(&res[4].source_doc[res[4].start_idx..res[4].end_idx], ")");
    }
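
    // A small end-to-end sketch: the sugar tokens interleave with ordinary
    // tokens in a quasiquote template exactly like the single-token cases.
    #[test]
    fn test_quasiquote_sugar_sketch() {
        let toks: Vec<LexToken> = Lexer::from(Rc::from("`(a ,b)")).collect();
        assert_eq!(toks.len(), 6);
        assert_eq!(toks[0].token_type, LexTokenType::QuasiQuote);
        assert_eq!(toks[1].token_type, LexTokenType::ListStart);
        assert_eq!(toks[2].token_type, LexTokenType::Symbol);
        assert_eq!(toks[3].token_type, LexTokenType::Unquote);
        assert_eq!(toks[4].token_type, LexTokenType::Symbol);
        assert_eq!(toks[5].token_type, LexTokenType::CollectionEnd);
    }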
    #[test]
    fn test_error_state_blocking() {
        let mut l = Lexer::from(Rc::from("( 1 2.2.2 valid_token"));

        assert!(l.next().is_some());
        assert!(l.next().is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
    }
}