Decomposer: fixes from found code
This commit adds a new utility, the decomposer, which has primarily been used to test the AST against Scheme code found in the wild (on the internet). The decomposer times and tests the lexing and parsing of any document full of Scheme. This commit also includes additional test cases and fixes for logic issues found during that testing.

Signed-off-by: Ava Affine <ava@sunnypup.io>
This commit is contained in:
  parent 86f905ba1d
  commit e4c6e0924a

7 changed files with 417 additions and 40 deletions
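The decomposer utility itself is not among the hunks shown below, but its flow can be pictured as a small timing harness around the lexer. The sketch below is an illustrative assumption, not the committed code: it reuses the `Lexer::from(Rc<str>)` token iterator exercised by this commit's tests, while the harness name, timing, and printing are hypothetical.

    // A minimal sketch of a decomposer-style pass, assuming the Lexer API
    // visible in this diff (Lexer::from(Rc<str>) yields an iterator of
    // lex tokens). The function name and timing harness are illustrative.
    use std::rc::Rc;
    use std::time::Instant;

    fn time_lex(source: &str) {
        let start = Instant::now();
        let tokens: Vec<_> = Lexer::from(Rc::from(source)).into_iter().collect();
        println!("lexed {} tokens in {:?}", tokens.len(), start.elapsed());
    }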
@@ -234,7 +234,7 @@ impl Lexer {
     }
 
     #[inline(always)]
-    fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
+    fn match_chunk_next(&mut self, chunk: &str, peek: bool) -> Option<bool> {
+        let saved = self.current_index;
         for i in chunk.chars() {
             if let None = self.advance_char() {
@@ -248,6 +248,7 @@ impl Lexer {
             }
         }
 
+        if peek { self.current_index = saved; }
         Some(true)
     }
 
@@ -290,7 +291,15 @@ impl Lexer {
         let mut base = 10;
         let a = self.current_char();
 
-        if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
+        if let Some(true) = self.match_chunk_next("inf.0", false) {
+            return self.cut_new_token(LexTokenType::Number)
+        }
+
+        if let Some(true) = self.match_chunk_next("nan.0", false) {
+            return self.cut_new_token(LexTokenType::Number)
+        }
+
+        if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' || a == '+' || a == '-' {
             if let None = self.advance_char() {
                 return Err(LexError(E_NUMBER_TRUNCATED,
                     self.current_token_start, self.document.clone()))
@@ -311,14 +320,6 @@ impl Lexer {
             }
         }
 
-        if let Some(true) = self.match_chunk_next("inf.0") {
-            return self.cut_new_token(LexTokenType::Number)
-        }
-
-        if let Some(true) = self.match_chunk_next("nan.0") {
-            return self.cut_new_token(LexTokenType::Number)
-        }
-
         let mut hasdot = false;
         let mut hasslash = false;
         let mut hase = false;
@@ -374,7 +375,7 @@ impl Lexer {
             '|' if self.advance_char().and_then(|_|
                 if self.current_char() == '#' {
                     return Some(())
-                } else { return None }).is_some() =>
+                } else { return None }).is_some() =>
                 return self.cut_new_token(LexTokenType::Comment),
             _ => continue,
         };
@@ -427,7 +428,7 @@ impl Lexer {
             't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
             '|' => return self.seek_end_of_block_comment(),
             '!' => return self.seek_end_of_line_comment(true),
-            'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
+            'u' if self.match_chunk_next("8(", false).is_some_and(|x| x) =>
                 return self.cut_new_token(LexTokenType::ByteVectorStart),
             '(' => return self.cut_new_token(LexTokenType::VectorStart),
             '\\' => self.seek_end_of_escape(false, )
@@ -470,16 +471,19 @@ impl Lexer {
         */
         match self.current_char() {
             // char escapes
-            'a' if !in_string => self.match_chunk_next("larm"),
-            'b' if !in_string => self.match_chunk_next("ackspace"),
-            'd' if !in_string => self.match_chunk_next("elete"),
-            'e' if !in_string => self.match_chunk_next("scape"),
-            'n' if !in_string => self.match_chunk_next("ewline").or(
-                self.match_chunk_next("ull")
-            ),
-            'r' if !in_string => self.match_chunk_next("eturn"),
-            's' if !in_string => self.match_chunk_next("pace"),
-            't' if !in_string => self.match_chunk_next("ab"),
+            'a' if !in_string => self.match_chunk_next("larm", false),
+            'b' if !in_string => self.match_chunk_next("ackspace", false),
+            'd' if !in_string => self.match_chunk_next("elete", false),
+            'e' if !in_string => self.match_chunk_next("scape", false),
+            'n' if !in_string => self.match_chunk_next("ewline", false)
+                .or(self.match_chunk_next("ull", false)),
+            'r' if !in_string => self.match_chunk_next("eturn", false),
+            's' if !in_string => self.match_chunk_next("pace", false),
+            't' if !in_string => self.match_chunk_next("ab", false),
             // specifically catch a non hex 'x' character escape
             'x' if self.peek_next_char()
                 .is_none_or(|c| TOK_DELIMITERS.contains(&c)) && !in_string
                 => None,
 
             // string escapes
             'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
@@ -538,7 +542,14 @@ impl Lexer {
         self.current_token_start = self.current_index;
 
         // handle syntactic sugar cases
+        macro_rules! numeric {
+            ( $x:expr ) => {
+                $x.is_numeric() || self.match_chunk_next("inf.0", true)
+                    .or(self.match_chunk_next("nan.0", true))
+                    .or(Some(false))
+                    .unwrap()
+            };
+        }
         match self.current_char() {
             ';' => output = Some(self.seek_end_of_line_comment(false)),
             '\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
@@ -547,12 +558,11 @@ impl Lexer {
             ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
             '#' => output = Some(self.seek_end_from_hash()),
             '"' => output = Some(self.seek_end_of_string()),
             /* This code commented out. I dont think you can open a char without '#'
              * '\\' => output = Some(self.seek_end_of_escape(false)
              *     .and_then(|_|
              *         self.cut_new_token(LexTokenType::Char))),*/
             '|' => output = Some(self.seek_closing_pipe()),
-            '+' | '-' => output = Some(self.seek_end_of_number()),
+            '+' | '-' if self.peek_next_char()
+                .and_then(|x| Some(numeric!(x)))
+                .or(Some(false))
+                .unwrap() => output = Some(self.seek_end_of_number()),
             _ if self.current_char().is_numeric() => output =
                 Some(self.seek_end_of_number()),
             _ => (),
@@ -583,7 +593,10 @@ impl Lexer {
         if output.is_none() {
             loop {
                 let c = self.current_char();
-                if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
+                if !c.is_alphanumeric() &&
+                    !LEX_SPECIAL.contains(&c) &&
+                    !TOK_DELIMITERS.contains(&c) {
+
                     output = Some(Err(LexError(E_INCOMPREHENSIBLE,
                         self.current_index, self.document.clone())));
                     break;
@@ -635,7 +648,7 @@ mod tests {
        /* Number Cases */ (
            // HAPPY CASES
            vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
-                "#e1e1", "#i1/4", "+inf.0", "1e1"],
+                "#e1e1", "#i1/4", "+inf.0", "1e1", "-1"],
 
            // SAD CASES
            vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
@@ -644,7 +657,7 @@ mod tests {
        /* Char Cases */ (
            // HAPPY CASES
            vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
-                "#\\alarm", "#\\s", "#\\x20"],
+                "#\\alarm", "#\\s", "#\\x20", "#\\x", "#\\\\"],
 
            // SAD CASES
            vec!["\\c", "\\x20"]
@@ -654,7 +667,8 @@ mod tests {
            // HAPPY CASES
            vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
                "list->vector", "|two words|", "|two\nwords|",
-                "the-word-recursion-has-many-meanings"],
+                "the-word-recursion-has-many-meanings", "+", "-",
+                "slatex.*slatex*"],
 
            // SAD CASES
            vec!["|\"\"|", "|(|", "|valid"]
@@ -858,4 +872,29 @@ mod tests {
         assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
         assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
     }
+
+    #[test]
+    fn num_lex_plusnum_case() {
+        let mut res = vec![];
+        Lexer::from(Rc::from("+1"))
+            .into_iter()
+            .collect_into(&mut res);
+        assert_eq!(res.len(), 1);
+        assert_eq!(res[0].token_type, LexTokenType::Number);
+    }
+
+    #[test]
+    fn char_lex_xchar_case() {
+        let mut res = vec![];
+        Lexer::from(Rc::from("#\\x)"))
+            .into_iter()
+            .collect_into(&mut res);
+        assert_eq!(res.len(), 2);
+
+        assert_eq!(res[0].token_type, LexTokenType::Char);
+        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "#\\x");
+
+        assert_eq!(res[1].token_type, LexTokenType::CollectionEnd);
+        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], ")");
+    }
 }
@@ -19,7 +19,7 @@ use alloc::string::String;
 use alloc::format;
 use alloc::fmt::Debug;
 use core::{cmp::Ordering, f64, ops::{Add, Div, Mul, Sub}, str::FromStr};
-use num::{integer::{gcd}, pow::{self, Pow}};
+use num::{integer::{gcd}, pow::Pow};
 
 pub const E_INCOMPREHENSIBLE: &str = "could not comprehend number literal";
 pub const E_BASE_PARSE_FAIL: &str = "failed to parse explicit base literal";
@@ -62,7 +62,7 @@ pub struct ParseError(pub &'static str, pub Option<Result<LexToken, LexError>>);
 impl Display for ParseError {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         let err_snippet_start = |t: &LexToken| -> usize {
-            /* backtrack from current index until we either hit
+            /* backtrack from current index until we either hit
              * - beginning of line
              * - 25 characters ago
              * - the doc Start
@@ -187,7 +187,7 @@ fn read_number(token: LexToken) -> Result<Number, ParseError> {
 }
 
 fn read_char(token: LexToken) -> Result<u8, ParseError> {
-    if token.end_idx - token.start_idx < 2 {
+    if token.end_idx - token.start_idx < 3 {
         return Err(ParseError(E_CHAR_TRUNCATED, Some(Ok(token))))
     }
 
@@ -202,7 +202,7 @@ fn read_char(token: LexToken) -> Result<u8, ParseError> {
        "space" => Ok(32),
        "tab" => Ok(11),
        _ if token.source_doc[token.start_idx + 2..].starts_with('x') &&
-            token.end_idx - token.start_idx > 2 => {
+            token.end_idx - token.start_idx > 3 => {
            if token.end_idx - token.start_idx > 5 {
                return Err(ParseError(E_CHAR_TOO_LONG, Some(Ok(token))))
            }
@@ -327,7 +327,10 @@ impl Parser {
         loop {
             let next_tok = self.lexer.next();
             if let None = next_tok {
-                return Err(ParseError(E_COLLECTION_TRUNC, None))
+                if let Some(e) = &self.lexer.has_error_state {
+                    return Err(ParseError(E_LEX_ERROR, Some(Err(e.clone()))))
+                }
+                return Err(ParseError(E_COLLECTION_TRUNC, Some(Ok(token))))
             }
 
             let tok = next_tok.unwrap();
@@ -476,7 +479,7 @@ impl Parser {
         }
 
         // Lexer error
-        } else if self.lexer.has_error_state.is_some() {
+        } else if let Some(e) = &self.lexer.has_error_state && e.0 != E_END_OF_DOCUMENT {
             Err(ParseError(E_LEX_ERROR,
                 Some(Err(self.lexer.has_error_state.clone().unwrap()))))
@@ -511,7 +514,10 @@ mod tests {
        ("(hello \"world\")", "(hello \"world\")"),
        ("; big doc string\n(one two)", "(one two)"),
        ("(list #(vect 2 3 #u8(0 0)))", "(list #(vect 2 3 #u8(0 0)))"),
-        ("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)")
+        ("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)"),
+        ("(- q 1)", "(- q 1)"),
+        ("(+ q 1)", "(+ q 1)"),
+        ("(#\\x)", "(#\\x)"),
    ];
 
    let sad_cases = vec![