This commit adds a parser, complete with tests. The parser implements
an iterator which returns Datum. It wraps around a Lexer and uses the
Lexer's iterator interface to consume lexemes. It may return an error
that wraps either a LexError or a fully lexed lexeme.

In the implementation of the Parser bugs were found in the lexer
package. This resulted in the lexing tests being extended as well as
several small logic updates.

The number package has had slight tweaks to make number representations
less cumbersome.

Finally, the Datum display logic in the sexpr package has also been updated.

Signed-off-by: Ava Affine <ava@sunnypup.io>
This commit is contained in:
Ava Apples Affine 2025-05-19 14:38:11 -07:00
parent a48fc52fab
commit 86f905ba1d
5 changed files with 632 additions and 29 deletions

View file

@ -67,7 +67,7 @@ impl fmt::Display for LexError {
} else {
let mut idx = self.1;
while self.1 - idx > 25 {
while self.1 - idx < 25 {
idx -= 1;
if self.2[idx..]
.char_indices()
@ -107,8 +107,11 @@ impl fmt::Display for LexError {
}
};
write!(f, "Error when lexing document here:\n\n")?;
write!(f, " {}\n", &self.2[err_snippet_start()..err_snippet_end()])?;
write!(f, "Error when lexing document here: (idx: {})\n", self.1)?;
let s = err_snippet_start();
let st = self.1 - err_snippet_start();
write!(f, " {}\n", &self.2[s..err_snippet_end()])?;
write!(f, " {}^\n", " ".repeat(st))?;
write!(f, "Error: {}\n", self.0)
}
}
@ -132,7 +135,7 @@ pub enum LexTokenType {
Quote,
QuasiQuote,
Unquote,
UnquoteSpliceTemplate,
UnquoteSplice,
NumTypes,
}
@ -506,10 +509,11 @@ impl Lexer {
return Ok(())
}
// make sure next character is a proper delimiter
adv!().and_then(|_| if !delim(self.current_char()) {
return Err(LexError(E_UNDELIMITED_ESC, self.current_index,
self.document.clone()))
} else { if in_string {self.current_index = saved_idx }; Ok(()) })
} else { self.current_index = saved_idx; Ok(()) })
}
/* Called to output a token by the iterator implementation
@ -565,7 +569,8 @@ impl Lexer {
if self.current_char() == ',' {
if let Some(x) = self.peek_next_char() && x == '@'{
output = Some(self.cut_new_token(LexTokenType::UnquoteSpliceTemplate));
self.advance_char();
output = Some(self.cut_new_token(LexTokenType::UnquoteSplice));
} else {
output = Some(self.cut_new_token(LexTokenType::Unquote));
}
@ -578,7 +583,7 @@ impl Lexer {
if output.is_none() {
loop {
let c = self.current_char();
if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
output = Some(Err(LexError(E_INCOMPREHENSIBLE,
self.current_index, self.document.clone())));
break;
@ -638,7 +643,8 @@ mod tests {
/* Char Cases */ (
// HAPPY CASES
vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space", "#\\x20"],
vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
"#\\alarm", "#\\s", "#\\x20"],
// SAD CASES
vec!["\\c", "\\x20"]
@ -743,9 +749,9 @@ mod tests {
vec![]
),
/* UnquoteSpliceTemplate cases */ (
/* UnquoteSplice cases */ (
// HAPPY CASES
vec![",@x", ",@(", ",@"],
vec![",@x", ",@(", ",@", ",@(two)"],
// SAD CASES
vec![]
@ -755,7 +761,7 @@ mod tests {
let no_subtoken_check_cases = [
LexTokenType::Dot as u8,
LexTokenType::Unquote as u8,
LexTokenType::UnquoteSpliceTemplate as u8
LexTokenType::UnquoteSplice as u8
];
cases.iter().enumerate().for_each(|(idx, case)| {
@ -834,4 +840,22 @@ mod tests {
assert!(l.next().is_none());
assert!(l.has_error_state.is_some());
}
#[test]
// Regression test: a char literal immediately followed by a closing paren
// must lex as three tokens — '(' , the char, ')' — instead of the lexer
// consuming or rejecting the ')' after the escape sequence (the delimiter
// check fixed earlier in this commit).
fn char_lex_with_close() {
let mut res = vec![];
// NOTE(review): collect_into is a nightly-only Iterator method
// (feature iter_collect_into) — confirm the crate enables it.
Lexer::from(Rc::from("(#\\a)"))
.into_iter()
.collect_into(&mut res);
// Exactly three tokens, each checked for type and for the exact source
// slice it covers in the original document.
assert_eq!(res.len(), 3);
assert_eq!(res[0].token_type, LexTokenType::ListStart);
assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");
assert_eq!(res[1].token_type, LexTokenType::Char);
assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "#\\a");
assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
}
}