Decomposer: fixes from found code

This commit includes a new utility, the decomposer, which has
primarily been used to test the AST against real-world Scheme code
found in the wild (collected from the internet). Decomposer will time
and test the lexing and parsing of any document full of Scheme.

This commit includes additional test cases and logical fixes for
issues found during the testing performed.

Signed-off-by: Ava Affine <ava@sunnypup.io>
This commit is contained in:
Ava Apples Affine 2025-05-21 14:48:36 -07:00
parent 86f905ba1d
commit e4c6e0924a
7 changed files with 417 additions and 40 deletions

View file

@ -234,7 +234,7 @@ impl Lexer {
}
#[inline(always)]
fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
fn match_chunk_next(&mut self, chunk: &str, peek: bool) -> Option<bool> {
let saved = self.current_index;
for i in chunk.chars() {
if let None = self.advance_char() {
@ -248,6 +248,7 @@ impl Lexer {
}
}
if peek { self.current_index = saved; }
Some(true)
}
@ -290,7 +291,15 @@ impl Lexer {
let mut base = 10;
let a = self.current_char();
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
if let Some(true) = self.match_chunk_next("inf.0", false) {
return self.cut_new_token(LexTokenType::Number)
}
if let Some(true) = self.match_chunk_next("nan.0", false) {
return self.cut_new_token(LexTokenType::Number)
}
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' || a == '+' || a == '-' {
if let None = self.advance_char() {
return Err(LexError(E_NUMBER_TRUNCATED,
self.current_token_start, self.document.clone()))
@ -311,14 +320,6 @@ impl Lexer {
}
}
if let Some(true) = self.match_chunk_next("inf.0") {
return self.cut_new_token(LexTokenType::Number)
}
if let Some(true) = self.match_chunk_next("nan.0") {
return self.cut_new_token(LexTokenType::Number)
}
let mut hasdot = false;
let mut hasslash = false;
let mut hase = false;
@ -374,7 +375,7 @@ impl Lexer {
'|' if self.advance_char().and_then(|_|
if self.current_char() == '#' {
return Some(())
} else { return None }).is_some() =>
} else { return None }).is_some() =>
return self.cut_new_token(LexTokenType::Comment),
_ => continue,
};
@ -427,7 +428,7 @@ impl Lexer {
't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
'|' => return self.seek_end_of_block_comment(),
'!' => return self.seek_end_of_line_comment(true),
'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
'u' if self.match_chunk_next("8(", false).is_some_and(|x| x) =>
return self.cut_new_token(LexTokenType::ByteVectorStart),
'(' => return self.cut_new_token(LexTokenType::VectorStart),
'\\' => self.seek_end_of_escape(false, )
@ -470,16 +471,19 @@ impl Lexer {
*/
match self.current_char() {
// char escapes
'a' if !in_string => self.match_chunk_next("larm"),
'b' if !in_string => self.match_chunk_next("ackspace"),
'd' if !in_string => self.match_chunk_next("elete"),
'e' if !in_string => self.match_chunk_next("scape"),
'n' if !in_string => self.match_chunk_next("ewline").or(
self.match_chunk_next("ull")
),
'r' if !in_string => self.match_chunk_next("eturn"),
's' if !in_string => self.match_chunk_next("pace"),
't' if !in_string => self.match_chunk_next("ab"),
'a' if !in_string => self.match_chunk_next("larm", false),
'b' if !in_string => self.match_chunk_next("ackspace", false),
'd' if !in_string => self.match_chunk_next("elete", false),
'e' if !in_string => self.match_chunk_next("scape", false),
'n' if !in_string => self.match_chunk_next("ewline", false)
.or(self.match_chunk_next("ull", false)),
'r' if !in_string => self.match_chunk_next("eturn", false),
's' if !in_string => self.match_chunk_next("pace", false),
't' if !in_string => self.match_chunk_next("ab", false),
// specifically catch a non hex 'x' character escape
'x' if self.peek_next_char()
.is_none_or(|c| TOK_DELIMITERS.contains(&c)) && !in_string
=> None,
// string escapes
'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
@ -538,7 +542,14 @@ impl Lexer {
self.current_token_start = self.current_index;
// handle syntactic sugar cases
macro_rules! numeric {
( $x:expr ) => {
$x.is_numeric() || self.match_chunk_next("inf.0", true)
.or(self.match_chunk_next("nan.0", true))
.or(Some(false))
.unwrap()
};
}
match self.current_char() {
';' => output = Some(self.seek_end_of_line_comment(false)),
'\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
@ -547,12 +558,11 @@ impl Lexer {
')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
'#' => output = Some(self.seek_end_from_hash()),
'"' => output = Some(self.seek_end_of_string()),
/* This code commented out. I dont think you can open a char without '#'
* '\\' => output = Some(self.seek_end_of_escape(false)
.and_then(|_|
self.cut_new_token(LexTokenType::Char))),*/
'|' => output = Some(self.seek_closing_pipe()),
'+' | '-' => output = Some(self.seek_end_of_number()),
'+' | '-' if self.peek_next_char()
.and_then(|x| Some(numeric!(x)))
.or(Some(false))
.unwrap() => output = Some(self.seek_end_of_number()),
_ if self.current_char().is_numeric() => output =
Some(self.seek_end_of_number()),
_ => (),
@ -583,7 +593,10 @@ impl Lexer {
if output.is_none() {
loop {
let c = self.current_char();
if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
if !c.is_alphanumeric() &&
!LEX_SPECIAL.contains(&c) &&
!TOK_DELIMITERS.contains(&c) {
output = Some(Err(LexError(E_INCOMPREHENSIBLE,
self.current_index, self.document.clone())));
break;
@ -635,7 +648,7 @@ mod tests {
/* Number Cases */ (
// HAPPY CASES
vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
"#e1e1", "#i1/4", "+inf.0", "1e1"],
"#e1e1", "#i1/4", "+inf.0", "1e1", "-1"],
// SAD CASES
vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
@ -644,7 +657,7 @@ mod tests {
/* Char Cases */ (
// HAPPY CASES
vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
"#\\alarm", "#\\s", "#\\x20"],
"#\\alarm", "#\\s", "#\\x20", "#\\x", "#\\\\"],
// SAD CASES
vec!["\\c", "\\x20"]
@ -654,7 +667,8 @@ mod tests {
// HAPPY CASES
vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
"list->vector", "|two words|", "|two\nwords|",
"the-word-recursion-has-many-meanings"],
"the-word-recursion-has-many-meanings", "+", "-",
"slatex.*slatex*"],
// SAD CASES
vec!["|\"\"|", "|(|", "|valid"]
@ -858,4 +872,29 @@ mod tests {
assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
}
#[test]
fn num_lex_plusnum_case() {
let mut res = vec![];
Lexer::from(Rc::from("+1"))
.into_iter()
.collect_into(&mut res);
assert_eq!(res.len(), 1);
assert_eq!(res[0].token_type, LexTokenType::Number);
}
#[test]
fn char_lex_xchar_case() {
let mut res = vec![];
Lexer::from(Rc::from("#\\x)"))
.into_iter()
.collect_into(&mut res);
assert_eq!(res.len(), 2);
assert_eq!(res[0].token_type, LexTokenType::Char);
assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "#\\x");
assert_eq!(res[1].token_type, LexTokenType::CollectionEnd);
assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], ")");
}
}