Decomposer: fixes from found code

This commit includes a new utility, the decomposer, which has
primarily been used to test the AST against real-world Scheme code
found in the wild (collected from the internet). Decomposer will time
and test the lexing and parsing of any document full of Scheme.

This commit includes additional test cases and logical fixes for
issues found during the testing performed.

Signed-off-by: Ava Affine <ava@sunnypup.io>
This commit is contained in:
Ava Apples Affine 2025-05-21 14:48:36 -07:00
parent 86f905ba1d
commit e4c6e0924a
7 changed files with 417 additions and 40 deletions

View file

@ -234,7 +234,7 @@ impl Lexer {
}
#[inline(always)]
fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
fn match_chunk_next(&mut self, chunk: &str, peek: bool) -> Option<bool> {
let saved = self.current_index;
for i in chunk.chars() {
if let None = self.advance_char() {
@ -248,6 +248,7 @@ impl Lexer {
}
}
if peek { self.current_index = saved; }
Some(true)
}
@ -290,7 +291,15 @@ impl Lexer {
let mut base = 10;
let a = self.current_char();
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
if let Some(true) = self.match_chunk_next("inf.0", false) {
return self.cut_new_token(LexTokenType::Number)
}
if let Some(true) = self.match_chunk_next("nan.0", false) {
return self.cut_new_token(LexTokenType::Number)
}
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' || a == '+' || a == '-' {
if let None = self.advance_char() {
return Err(LexError(E_NUMBER_TRUNCATED,
self.current_token_start, self.document.clone()))
@ -311,14 +320,6 @@ impl Lexer {
}
}
if let Some(true) = self.match_chunk_next("inf.0") {
return self.cut_new_token(LexTokenType::Number)
}
if let Some(true) = self.match_chunk_next("nan.0") {
return self.cut_new_token(LexTokenType::Number)
}
let mut hasdot = false;
let mut hasslash = false;
let mut hase = false;
@ -374,7 +375,7 @@ impl Lexer {
'|' if self.advance_char().and_then(|_|
if self.current_char() == '#' {
return Some(())
} else { return None }).is_some() =>
} else { return None }).is_some() =>
return self.cut_new_token(LexTokenType::Comment),
_ => continue,
};
@ -427,7 +428,7 @@ impl Lexer {
't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
'|' => return self.seek_end_of_block_comment(),
'!' => return self.seek_end_of_line_comment(true),
'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
'u' if self.match_chunk_next("8(", false).is_some_and(|x| x) =>
return self.cut_new_token(LexTokenType::ByteVectorStart),
'(' => return self.cut_new_token(LexTokenType::VectorStart),
'\\' => self.seek_end_of_escape(false, )
@ -470,16 +471,19 @@ impl Lexer {
*/
match self.current_char() {
// char escapes
'a' if !in_string => self.match_chunk_next("larm"),
'b' if !in_string => self.match_chunk_next("ackspace"),
'd' if !in_string => self.match_chunk_next("elete"),
'e' if !in_string => self.match_chunk_next("scape"),
'n' if !in_string => self.match_chunk_next("ewline").or(
self.match_chunk_next("ull")
),
'r' if !in_string => self.match_chunk_next("eturn"),
's' if !in_string => self.match_chunk_next("pace"),
't' if !in_string => self.match_chunk_next("ab"),
'a' if !in_string => self.match_chunk_next("larm", false),
'b' if !in_string => self.match_chunk_next("ackspace", false),
'd' if !in_string => self.match_chunk_next("elete", false),
'e' if !in_string => self.match_chunk_next("scape", false),
'n' if !in_string => self.match_chunk_next("ewline", false)
.or(self.match_chunk_next("ull", false)),
'r' if !in_string => self.match_chunk_next("eturn", false),
's' if !in_string => self.match_chunk_next("pace", false),
't' if !in_string => self.match_chunk_next("ab", false),
// specifically catch a non hex 'x' character escape
'x' if self.peek_next_char()
.is_none_or(|c| TOK_DELIMITERS.contains(&c)) && !in_string
=> None,
// string escapes
'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
@ -538,7 +542,14 @@ impl Lexer {
self.current_token_start = self.current_index;
// handle syntactic sugar cases
macro_rules! numeric {
( $x:expr ) => {
$x.is_numeric() || self.match_chunk_next("inf.0", true)
.or(self.match_chunk_next("nan.0", true))
.or(Some(false))
.unwrap()
};
}
match self.current_char() {
';' => output = Some(self.seek_end_of_line_comment(false)),
'\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
@ -547,12 +558,11 @@ impl Lexer {
')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
'#' => output = Some(self.seek_end_from_hash()),
'"' => output = Some(self.seek_end_of_string()),
/* This code commented out. I dont think you can open a char without '#'
* '\\' => output = Some(self.seek_end_of_escape(false)
.and_then(|_|
self.cut_new_token(LexTokenType::Char))),*/
'|' => output = Some(self.seek_closing_pipe()),
'+' | '-' => output = Some(self.seek_end_of_number()),
'+' | '-' if self.peek_next_char()
.and_then(|x| Some(numeric!(x)))
.or(Some(false))
.unwrap() => output = Some(self.seek_end_of_number()),
_ if self.current_char().is_numeric() => output =
Some(self.seek_end_of_number()),
_ => (),
@ -583,7 +593,10 @@ impl Lexer {
if output.is_none() {
loop {
let c = self.current_char();
if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
if !c.is_alphanumeric() &&
!LEX_SPECIAL.contains(&c) &&
!TOK_DELIMITERS.contains(&c) {
output = Some(Err(LexError(E_INCOMPREHENSIBLE,
self.current_index, self.document.clone())));
break;
@ -635,7 +648,7 @@ mod tests {
/* Number Cases */ (
// HAPPY CASES
vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
"#e1e1", "#i1/4", "+inf.0", "1e1"],
"#e1e1", "#i1/4", "+inf.0", "1e1", "-1"],
// SAD CASES
vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
@ -644,7 +657,7 @@ mod tests {
/* Char Cases */ (
// HAPPY CASES
vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
"#\\alarm", "#\\s", "#\\x20"],
"#\\alarm", "#\\s", "#\\x20", "#\\x", "#\\\\"],
// SAD CASES
vec!["\\c", "\\x20"]
@ -654,7 +667,8 @@ mod tests {
// HAPPY CASES
vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
"list->vector", "|two words|", "|two\nwords|",
"the-word-recursion-has-many-meanings"],
"the-word-recursion-has-many-meanings", "+", "-",
"slatex.*slatex*"],
// SAD CASES
vec!["|\"\"|", "|(|", "|valid"]
@ -858,4 +872,29 @@ mod tests {
assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
}
#[test]
fn num_lex_plusnum_case() {
let mut res = vec![];
Lexer::from(Rc::from("+1"))
.into_iter()
.collect_into(&mut res);
assert_eq!(res.len(), 1);
assert_eq!(res[0].token_type, LexTokenType::Number);
}
#[test]
fn char_lex_xchar_case() {
let mut res = vec![];
Lexer::from(Rc::from("#\\x)"))
.into_iter()
.collect_into(&mut res);
assert_eq!(res.len(), 2);
assert_eq!(res[0].token_type, LexTokenType::Char);
assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "#\\x");
assert_eq!(res[1].token_type, LexTokenType::CollectionEnd);
assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], ")");
}
}