Decomposer: fixes from found code
This commit adds a new utility, the decomposer, which has primarily been used to test the AST against Scheme code found in the wild (on the internet). The decomposer times and tests the lexing and parsing of any document containing Scheme. This commit also includes additional test cases, along with fixes for logic issues uncovered during that testing. Signed-off-by: Ava Affine <ava@sunnypup.io>
This commit is contained in:
parent
86f905ba1d
commit
e4c6e0924a
7 changed files with 417 additions and 40 deletions
|
|
@ -234,7 +234,7 @@ impl Lexer {
|
|||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
|
||||
fn match_chunk_next(&mut self, chunk: &str, peek: bool) -> Option<bool> {
|
||||
let saved = self.current_index;
|
||||
for i in chunk.chars() {
|
||||
if let None = self.advance_char() {
|
||||
|
|
@ -248,6 +248,7 @@ impl Lexer {
|
|||
}
|
||||
}
|
||||
|
||||
if peek { self.current_index = saved; }
|
||||
Some(true)
|
||||
}
|
||||
|
||||
|
|
@ -290,7 +291,15 @@ impl Lexer {
|
|||
let mut base = 10;
|
||||
let a = self.current_char();
|
||||
|
||||
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
|
||||
if let Some(true) = self.match_chunk_next("inf.0", false) {
|
||||
return self.cut_new_token(LexTokenType::Number)
|
||||
}
|
||||
|
||||
if let Some(true) = self.match_chunk_next("nan.0", false) {
|
||||
return self.cut_new_token(LexTokenType::Number)
|
||||
}
|
||||
|
||||
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' || a == '+' || a == '-' {
|
||||
if let None = self.advance_char() {
|
||||
return Err(LexError(E_NUMBER_TRUNCATED,
|
||||
self.current_token_start, self.document.clone()))
|
||||
|
|
@ -311,14 +320,6 @@ impl Lexer {
|
|||
}
|
||||
}
|
||||
|
||||
if let Some(true) = self.match_chunk_next("inf.0") {
|
||||
return self.cut_new_token(LexTokenType::Number)
|
||||
}
|
||||
|
||||
if let Some(true) = self.match_chunk_next("nan.0") {
|
||||
return self.cut_new_token(LexTokenType::Number)
|
||||
}
|
||||
|
||||
let mut hasdot = false;
|
||||
let mut hasslash = false;
|
||||
let mut hase = false;
|
||||
|
|
@ -374,7 +375,7 @@ impl Lexer {
|
|||
'|' if self.advance_char().and_then(|_|
|
||||
if self.current_char() == '#' {
|
||||
return Some(())
|
||||
} else { return None }).is_some() =>
|
||||
} else { return None }).is_some() =>
|
||||
return self.cut_new_token(LexTokenType::Comment),
|
||||
_ => continue,
|
||||
};
|
||||
|
|
@ -427,7 +428,7 @@ impl Lexer {
|
|||
't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
|
||||
'|' => return self.seek_end_of_block_comment(),
|
||||
'!' => return self.seek_end_of_line_comment(true),
|
||||
'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
|
||||
'u' if self.match_chunk_next("8(", false).is_some_and(|x| x) =>
|
||||
return self.cut_new_token(LexTokenType::ByteVectorStart),
|
||||
'(' => return self.cut_new_token(LexTokenType::VectorStart),
|
||||
'\\' => self.seek_end_of_escape(false, )
|
||||
|
|
@ -470,16 +471,19 @@ impl Lexer {
|
|||
*/
|
||||
match self.current_char() {
|
||||
// char escapes
|
||||
'a' if !in_string => self.match_chunk_next("larm"),
|
||||
'b' if !in_string => self.match_chunk_next("ackspace"),
|
||||
'd' if !in_string => self.match_chunk_next("elete"),
|
||||
'e' if !in_string => self.match_chunk_next("scape"),
|
||||
'n' if !in_string => self.match_chunk_next("ewline").or(
|
||||
self.match_chunk_next("ull")
|
||||
),
|
||||
'r' if !in_string => self.match_chunk_next("eturn"),
|
||||
's' if !in_string => self.match_chunk_next("pace"),
|
||||
't' if !in_string => self.match_chunk_next("ab"),
|
||||
'a' if !in_string => self.match_chunk_next("larm", false),
|
||||
'b' if !in_string => self.match_chunk_next("ackspace", false),
|
||||
'd' if !in_string => self.match_chunk_next("elete", false),
|
||||
'e' if !in_string => self.match_chunk_next("scape", false),
|
||||
'n' if !in_string => self.match_chunk_next("ewline", false)
|
||||
.or(self.match_chunk_next("ull", false)),
|
||||
'r' if !in_string => self.match_chunk_next("eturn", false),
|
||||
's' if !in_string => self.match_chunk_next("pace", false),
|
||||
't' if !in_string => self.match_chunk_next("ab", false),
|
||||
// specifically catch a non hex 'x' character escape
|
||||
'x' if self.peek_next_char()
|
||||
.is_none_or(|c| TOK_DELIMITERS.contains(&c)) && !in_string
|
||||
=> None,
|
||||
|
||||
// string escapes
|
||||
'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
|
||||
|
|
@ -538,7 +542,14 @@ impl Lexer {
|
|||
|
||||
self.current_token_start = self.current_index;
|
||||
|
||||
// handle syntactic sugar cases
|
||||
macro_rules! numeric {
|
||||
( $x:expr ) => {
|
||||
$x.is_numeric() || self.match_chunk_next("inf.0", true)
|
||||
.or(self.match_chunk_next("nan.0", true))
|
||||
.or(Some(false))
|
||||
.unwrap()
|
||||
};
|
||||
}
|
||||
match self.current_char() {
|
||||
';' => output = Some(self.seek_end_of_line_comment(false)),
|
||||
'\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
|
||||
|
|
@ -547,12 +558,11 @@ impl Lexer {
|
|||
')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
|
||||
'#' => output = Some(self.seek_end_from_hash()),
|
||||
'"' => output = Some(self.seek_end_of_string()),
|
||||
/* This code commented out. I dont think you can open a char without '#'
|
||||
* '\\' => output = Some(self.seek_end_of_escape(false)
|
||||
.and_then(|_|
|
||||
self.cut_new_token(LexTokenType::Char))),*/
|
||||
'|' => output = Some(self.seek_closing_pipe()),
|
||||
'+' | '-' => output = Some(self.seek_end_of_number()),
|
||||
'+' | '-' if self.peek_next_char()
|
||||
.and_then(|x| Some(numeric!(x)))
|
||||
.or(Some(false))
|
||||
.unwrap() => output = Some(self.seek_end_of_number()),
|
||||
_ if self.current_char().is_numeric() => output =
|
||||
Some(self.seek_end_of_number()),
|
||||
_ => (),
|
||||
|
|
@ -583,7 +593,10 @@ impl Lexer {
|
|||
if output.is_none() {
|
||||
loop {
|
||||
let c = self.current_char();
|
||||
if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
|
||||
if !c.is_alphanumeric() &&
|
||||
!LEX_SPECIAL.contains(&c) &&
|
||||
!TOK_DELIMITERS.contains(&c) {
|
||||
|
||||
output = Some(Err(LexError(E_INCOMPREHENSIBLE,
|
||||
self.current_index, self.document.clone())));
|
||||
break;
|
||||
|
|
@ -635,7 +648,7 @@ mod tests {
|
|||
/* Number Cases */ (
|
||||
// HAPPY CASES
|
||||
vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
|
||||
"#e1e1", "#i1/4", "+inf.0", "1e1"],
|
||||
"#e1e1", "#i1/4", "+inf.0", "1e1", "-1"],
|
||||
|
||||
// SAD CASES
|
||||
vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
|
||||
|
|
@ -644,7 +657,7 @@ mod tests {
|
|||
/* Char Cases */ (
|
||||
// HAPPY CASES
|
||||
vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
|
||||
"#\\alarm", "#\\s", "#\\x20"],
|
||||
"#\\alarm", "#\\s", "#\\x20", "#\\x", "#\\\\"],
|
||||
|
||||
// SAD CASES
|
||||
vec!["\\c", "\\x20"]
|
||||
|
|
@ -654,7 +667,8 @@ mod tests {
|
|||
// HAPPY CASES
|
||||
vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
|
||||
"list->vector", "|two words|", "|two\nwords|",
|
||||
"the-word-recursion-has-many-meanings"],
|
||||
"the-word-recursion-has-many-meanings", "+", "-",
|
||||
"slatex.*slatex*"],
|
||||
|
||||
// SAD CASES
|
||||
vec!["|\"\"|", "|(|", "|valid"]
|
||||
|
|
@ -858,4 +872,29 @@ mod tests {
|
|||
assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
|
||||
assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn num_lex_plusnum_case() {
|
||||
let mut res = vec![];
|
||||
Lexer::from(Rc::from("+1"))
|
||||
.into_iter()
|
||||
.collect_into(&mut res);
|
||||
assert_eq!(res.len(), 1);
|
||||
assert_eq!(res[0].token_type, LexTokenType::Number);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn char_lex_xchar_case() {
|
||||
let mut res = vec![];
|
||||
Lexer::from(Rc::from("#\\x)"))
|
||||
.into_iter()
|
||||
.collect_into(&mut res);
|
||||
assert_eq!(res.len(), 2);
|
||||
|
||||
assert_eq!(res[0].token_type, LexTokenType::Char);
|
||||
assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "#\\x");
|
||||
|
||||
assert_eq!(res[1].token_type, LexTokenType::CollectionEnd);
|
||||
assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], ")");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue