This commit adds a parser, complete with tests. The parser implements
an iterator which returns Datum. It wraps around a Lexer and uses the
Lexer's iterator interface to consume lexemes. It may return an error
that wraps either a LexError or a fully lexed lexeme.

In the implementation of the Parser bugs were found in the lexer
package. This resulted in the lexing tests being extended as well as
several small logic updates.

The number package has had slight tweaks to make number representations
less cumbersome.

Finally, the Datum display logic in the sexpr package has also been updated.

Signed-off-by: Ava Affine <ava@sunnypup.io>
This commit is contained in:
Ava Apples Affine 2025-05-19 14:38:11 -07:00
parent a48fc52fab
commit 86f905ba1d
5 changed files with 632 additions and 29 deletions

View file

@ -67,7 +67,7 @@ impl fmt::Display for LexError {
} else {
let mut idx = self.1;
while self.1 - idx > 25 {
while self.1 - idx < 25 {
idx -= 1;
if self.2[idx..]
.char_indices()
@ -107,8 +107,11 @@ impl fmt::Display for LexError {
}
};
write!(f, "Error when lexing document here:\n\n")?;
write!(f, " {}\n", &self.2[err_snippet_start()..err_snippet_end()])?;
write!(f, "Error when lexing document here: (idx: {})\n", self.1)?;
let s = err_snippet_start();
let st = self.1 - err_snippet_start();
write!(f, " {}\n", &self.2[s..err_snippet_end()])?;
write!(f, " {}^\n", " ".repeat(st))?;
write!(f, "Error: {}\n", self.0)
}
}
@ -132,7 +135,7 @@ pub enum LexTokenType {
Quote,
QuasiQuote,
Unquote,
UnquoteSpliceTemplate,
UnquoteSplice,
NumTypes,
}
@ -506,10 +509,11 @@ impl Lexer {
return Ok(())
}
// make sure next character is a proper delimiter
adv!().and_then(|_| if !delim(self.current_char()) {
return Err(LexError(E_UNDELIMITED_ESC, self.current_index,
self.document.clone()))
} else { if in_string {self.current_index = saved_idx }; Ok(()) })
} else { self.current_index = saved_idx; Ok(()) })
}
/* Called to output a token by the iterator implementation
@ -565,7 +569,8 @@ impl Lexer {
if self.current_char() == ',' {
if let Some(x) = self.peek_next_char() && x == '@'{
output = Some(self.cut_new_token(LexTokenType::UnquoteSpliceTemplate));
self.advance_char();
output = Some(self.cut_new_token(LexTokenType::UnquoteSplice));
} else {
output = Some(self.cut_new_token(LexTokenType::Unquote));
}
@ -578,7 +583,7 @@ impl Lexer {
if output.is_none() {
loop {
let c = self.current_char();
if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
output = Some(Err(LexError(E_INCOMPREHENSIBLE,
self.current_index, self.document.clone())));
break;
@ -638,7 +643,8 @@ mod tests {
/* Char Cases */ (
// HAPPY CASES
vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space", "#\\x20"],
vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
"#\\alarm", "#\\s", "#\\x20"],
// SAD CASES
vec!["\\c", "\\x20"]
@ -743,9 +749,9 @@ mod tests {
vec![]
),
/* UnquoteSpliceTemplate cases */ (
/* UnquoteSplice cases */ (
// HAPPY CASES
vec![",@x", ",@(", ",@"],
vec![",@x", ",@(", ",@", ",@(two)"],
// SAD CASES
vec![]
@ -755,7 +761,7 @@ mod tests {
let no_subtoken_check_cases = [
LexTokenType::Dot as u8,
LexTokenType::Unquote as u8,
LexTokenType::UnquoteSpliceTemplate as u8
LexTokenType::UnquoteSplice as u8
];
cases.iter().enumerate().for_each(|(idx, case)| {
@ -834,4 +840,22 @@ mod tests {
assert!(l.next().is_none());
assert!(l.has_error_state.is_some());
}
#[test]
// Regression test: a char literal immediately followed by a closing paren
// must lex as three tokens — '(' , the char, ')' — instead of the lexer
// consuming or rejecting the ')' after the escape sequence (the delimiter
// check fixed earlier in this commit).
fn char_lex_with_close() {
let mut res = vec![];
// NOTE(review): collect_into is a nightly-only Iterator method
// (feature iter_collect_into) — confirm the crate enables it.
Lexer::from(Rc::from("(#\\a)"))
.into_iter()
.collect_into(&mut res);
// Exactly three tokens, each checked for type and for the exact source
// slice it covers in the original document.
assert_eq!(res.len(), 3);
assert_eq!(res[0].token_type, LexTokenType::ListStart);
assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");
assert_eq!(res[1].token_type, LexTokenType::Char);
assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "#\\a");
assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
}
}