Lexer supports hexadecimal escape sequences and hexadecimal number literals

This commit updates the Lexer to fully support character and string
escaping, as well as hexadecimal notation in number literals. Updates to
lexing were performed according to the R7RS small specification. Test
cases were extended to cover the new support, and to mirror additional
number cases from the parsing logic of the number package.

Signed-off-by: Ava Affine <ava@sunnypup.io>
This commit is contained in:
Ava Apples Affine 2025-05-15 15:55:05 -07:00
parent 41216d3526
commit 3174494001

View file

@ -21,11 +21,14 @@ use alloc::rc::Rc;
pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/', pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/',
':', '<', '=', '>', '?', '@', '^', '_', '~', '.']; ':', '<', '=', '>', '?', '@', '^', '_', '~', '.'];
pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r']; pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r'];
pub const NUMERICAL_EXTRA: [char; 3] = ['.', 'i', 'e']; pub const NUMERICAL_EXTRA: [char; 4] = ['.', 'i', 'e', '/'];
pub const NUMERICAL_BASE: [char; 3] = ['d', 'o', 'b']; pub const NUMERICAL_BASE: [char; 4] = ['d', 'o', 'b', 'x'];
pub const TOK_DELIMITERS: [char; 5] = [')', ' ', '\t', '\n', '\r'];
pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote"; pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote";
pub const E_TOO_MANY_DECIMALS: &str = "number can only have one of {i e .}"; pub const E_TOO_MANY_DECIMALS: &str = "number can only have one dot";
pub const E_TOO_MANY_SLASH: &str = "number can only have one slash";
pub const E_TOO_MANY_E: &str = "number can only have one e";
pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren"; pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren";
pub const E_UNCLOSED_COMMENT: &str = "block comment has no end"; pub const E_UNCLOSED_COMMENT: &str = "block comment has no end";
pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe"; pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe";
@ -33,11 +36,13 @@ pub const E_NO_END_TO_HASH: &str = "expected more input after hash";
pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated"; pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated"; pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_STRING_TRUNCATED: &str = "string literal is truncated"; pub const E_STRING_TRUNCATED: &str = "string literal is truncated";
pub const E_UNDELIMITED_ESC: &str = "char escape is not delimited";
pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis"; pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis";
pub const E_UNIMPLEMENTED_HEX: &str = "hexadecimal literals not supported"; pub const E_CHAR_TOO_LONG: &str = "character literal is too long";
pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base"; pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base";
pub const E_UNSUPPORTED_ESC: &str = "unsupported escape"; pub const E_UNSUPPORTED_ESC: &str = "unsupported escape";
pub const E_BAD_DOT: &str = "expected space after dot in dotted notation"; pub const E_BAD_DOT: &str = "expected space after dot in dotted notation";
pub const E_BAD_HEX: &str = "character is not valid hexadecimal notation";
pub const E_INCOMPREHENSIBLE: &str = "token does not lex"; pub const E_INCOMPREHENSIBLE: &str = "token does not lex";
pub const E_END_OF_DOCUMENT: &str = "no additional input left in document"; pub const E_END_OF_DOCUMENT: &str = "no additional input left in document";
@ -208,6 +213,10 @@ impl Lexer {
#[inline(always)] #[inline(always)]
fn advance_char(&mut self) -> Option<()> { fn advance_char(&mut self) -> Option<()> {
self.current_index += 1; self.current_index += 1;
if self.current_index >= self.document.len() {
return None
}
if let Some((idx, _)) = self.document[self.current_index..] if let Some((idx, _)) = self.document[self.current_index..]
.char_indices() .char_indices()
.next() { .next() {
@ -223,9 +232,15 @@ impl Lexer {
#[inline(always)] #[inline(always)]
fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> { fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
let saved = self.current_index;
for i in chunk.chars() { for i in chunk.chars() {
self.advance_char()?; if let None = self.advance_char() {
self.current_index = saved;
return None
}
if i != self.current_char() { if i != self.current_char() {
self.current_index = saved;
return Some(false) return Some(false)
} }
} }
@ -233,26 +248,6 @@ impl Lexer {
Some(true) Some(true)
} }
/* TODO
* I figured this function would be useful for supporting hexadec encoding
* later down the line. We can use this instead of the base check in the
* number function.
#[inline(always)]
fn next_chars_allowed(&mut self, len: usize, allowed: &str) -> Option<bool> {
let mut i = len;
while i < 0 {
if !allowed.contains(self.current_char()) {
return Some(false)
}
i -= 1;
self.advance_char()?;
}
Some(true)
}
*/
#[inline(always)] #[inline(always)]
fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> { fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> {
let next_idx = self.advance_char() let next_idx = self.advance_char()
@ -273,11 +268,14 @@ impl Lexer {
#[inline(always)] #[inline(always)]
fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> { fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> {
// TODO: support escaped quotes
loop { loop {
if let None = self.advance_char() { if let None = self.advance_char() {
return Err(LexError(E_NO_MATCHING_QUOTE, return Err(LexError(E_NO_MATCHING_QUOTE,
self.current_token_start, self.document.clone())) self.current_token_start, self.document.clone()))
} else if self.current_char() == '\\' {
self.seek_end_of_escape(true)?;
} else if self.current_char() == '"' { } else if self.current_char() == '"' {
return self.cut_new_token(LexTokenType::String) return self.cut_new_token(LexTokenType::String)
} }
@ -288,39 +286,68 @@ impl Lexer {
fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> { fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> {
let mut base = 10; let mut base = 10;
let a = self.current_char(); let a = self.current_char();
if NUMERICAL_BASE.contains(&a) {
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
if let None = self.advance_char() { if let None = self.advance_char() {
return Err(LexError(E_NUMBER_TRUNCATED, return Err(LexError(E_NUMBER_TRUNCATED,
self.current_token_start, self.document.clone())) self.current_token_start, self.document.clone()))
// someday rust will get its shit together and if let chaining will be adequate
} else if TOK_DELIMITERS.contains(&a) {
return Err(LexError(E_NUMBER_TRUNCATED,
self.current_token_start, self.document.clone()))
} }
match a { match a {
'x' => base = 16,
'd' => base = 10, 'd' => base = 10,
'o' => base = 8, 'o' => base = 8,
'b' => base = 2, 'b' => base = 2,
// ignore i or e, number parsers will handle that
_ => (), _ => (),
} }
} }
let mut hasdot = false; if let Some(true) = self.match_chunk_next("inf.0") {
return self.cut_new_token(LexTokenType::Number)
}
if let Some(true) = self.match_chunk_next("nan.0") {
return self.cut_new_token(LexTokenType::Number)
}
let mut hasdot = false;
let mut hasslash = false;
let mut hase = false;
loop { loop {
let a = self.current_char(); let a = self.current_char();
if NUMERICAL_EXTRA.contains(&a) { if a == '.' {
if hasdot || base < 10 { if hasdot || base < 10 {
return Err(LexError(E_TOO_MANY_DECIMALS, return Err(LexError(E_TOO_MANY_DECIMALS,
self.current_token_start, self.document.clone())) self.current_token_start, self.document.clone()))
} }
hasdot = true; hasdot = true;
} else if a == ' ' || a == ')' { } else if a == '/' {
if hasslash || base < 10 {
return Err(LexError(E_TOO_MANY_SLASH,
self.current_token_start, self.document.clone()))
}
hasslash = true;
} else if a == 'e' {
if hase || base < 10 {
return Err(LexError(E_TOO_MANY_E,
self.current_token_start, self.document.clone()))
}
hase = true
} else if TOK_DELIMITERS.contains(&a) {
// back up one // back up one
self.current_index -= 1; self.current_index -= 1;
return self.cut_new_token(LexTokenType::Number) return self.cut_new_token(LexTokenType::Number)
} else if !a.is_numeric() { } else if let None = a.to_digit(base) {
return Err(LexError(E_INCOMPREHENSIBLE,
self.current_token_start, self.document.clone()))
} else if a.to_digit(10).unwrap() >= base {
return Err(LexError(E_NUMER_BASE_ERR, return Err(LexError(E_NUMER_BASE_ERR,
self.current_token_start, self.document.clone())) self.current_token_start, self.document.clone()))
} }
@ -400,11 +427,10 @@ impl Lexer {
'u' if self.match_chunk_next("8(").is_some_and(|x| x) => 'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
return self.cut_new_token(LexTokenType::ByteVectorStart), return self.cut_new_token(LexTokenType::ByteVectorStart),
'(' => return self.cut_new_token(LexTokenType::VectorStart), '(' => return self.cut_new_token(LexTokenType::VectorStart),
'\\' => self.seek_end_of_escape(false) '\\' => self.seek_end_of_escape(false, )
.and_then(|_| self.cut_new_token(LexTokenType::Char)), .and_then(|_| self.cut_new_token(LexTokenType::Char)),
'x' => return Err(LexError(E_UNIMPLEMENTED_HEX,
self.current_index, self.document.clone())),
_ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(), _ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(),
'i' | 'e' => return self.seek_end_of_number(),
_ => return Err(LexError(E_INCOMPREHENSIBLE, _ => return Err(LexError(E_INCOMPREHENSIBLE,
self.current_token_start, self.document.clone())), self.current_token_start, self.document.clone())),
} }
@ -417,25 +443,73 @@ impl Lexer {
// only the caller knows what actually needs to be returned // only the caller knows what actually needs to be returned
#[inline(always)] #[inline(always)]
fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> { fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> {
//let delim = if in_string { ';' } else { ' ' }; // little helper to deduplicate logic for advancing characters
// Delim and the arg to this function will be useful once we support hexadecimal encoding macro_rules! adv {
if let None = self.advance_char() { () => {
let mut error_msg = E_CHAR_TRUNCATED; if let None = self.advance_char() {
if in_string { error_msg = E_STRING_TRUNCATED; } let mut error_msg = E_CHAR_TRUNCATED;
return Err(LexError(error_msg, self.current_token_start, self.document.clone())) if in_string { error_msg = E_STRING_TRUNCATED; }
Err(LexError(error_msg, self.current_token_start,
self.document.clone()))
} else { Ok(()) }
};
} }
let delim = |x| -> bool {
in_string || TOK_DELIMITERS.contains(&x)
};
// advance char once
adv!()?;
/* if match_chunk_next fails then the index is unmoved
* allowing us to treat this like a single char escape
*/
match self.current_char() { match self.current_char() {
// eat an escaped whitespace or delim // char escapes
' ' | 'n' | 'r' | 't' | '|' | '\\' | '"' => { () }, 'a' if !in_string => self.match_chunk_next("larm"),
'x' => return Err(LexError(E_UNIMPLEMENTED_HEX, 'b' if !in_string => self.match_chunk_next("ackspace"),
self.current_token_start, self.document.clone())), 'd' if !in_string => self.match_chunk_next("elete"),
_ if self.current_char().is_alphabetic() => { () }, 'e' if !in_string => self.match_chunk_next("scape"),
_ => return Err(LexError(E_UNSUPPORTED_ESC, 'n' if !in_string => self.match_chunk_next("ewline").or(
self.current_index, self.document.clone())), self.match_chunk_next("ull")
),
'r' if !in_string => self.match_chunk_next("eturn"),
's' if !in_string => self.match_chunk_next("pace"),
't' if !in_string => self.match_chunk_next("ab"),
// string escapes
'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
// both
'x' => {
// we look for TWO hex digits
adv!()?;
self.current_char().to_digit(16)
.ok_or(LexError(E_BAD_HEX, self.current_index,
self.document.clone()))?;
adv!()?;
self.current_char().to_digit(16)
.ok_or(LexError(E_BAD_HEX, self.current_index,
self.document.clone()))?;
None
},
// catchalls
_ if !in_string => None,
_ => return Err(LexError(E_UNSUPPORTED_ESC, self.current_index,
self.document.clone())),
};
let saved_idx = self.current_index;
if saved_idx == self.document.len() - 1 {
return Ok(())
} }
return Ok(()) adv!().and_then(|_| if !delim(self.current_char()) {
return Err(LexError(E_UNDELIMITED_ESC, self.current_index,
self.document.clone()))
} else { if in_string {self.current_index = saved_idx }; Ok(()) })
} }
/* Called to output a token by the iterator implementation /* Called to output a token by the iterator implementation
@ -469,10 +543,12 @@ impl Lexer {
')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)), ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
'#' => output = Some(self.seek_end_from_hash()), '#' => output = Some(self.seek_end_from_hash()),
'"' => output = Some(self.seek_end_of_string()), '"' => output = Some(self.seek_end_of_string()),
'\\' => output = Some(self.seek_end_of_escape(false) /* This code commented out. I dont think you can open a char without '#'
* '\\' => output = Some(self.seek_end_of_escape(false)
.and_then(|_| .and_then(|_|
self.cut_new_token(LexTokenType::Char))), self.cut_new_token(LexTokenType::Char))),*/
'|' => output = Some(self.seek_closing_pipe()), '|' => output = Some(self.seek_closing_pipe()),
'+' | '-' => output = Some(self.seek_end_of_number()),
_ if self.current_char().is_numeric() => output = _ if self.current_char().is_numeric() => output =
Some(self.seek_end_of_number()), Some(self.seek_end_of_number()),
_ => (), _ => (),
@ -545,7 +621,7 @@ mod tests {
// HAPPY CASES // HAPPY CASES
vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"", vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"",
"\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"", "\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"",
"\"\""], "\"\"", "\"\\\" \\\"\""],
// SAD CASES // SAD CASES
vec!["\"sdf"] vec!["\"sdf"]
@ -553,24 +629,25 @@ mod tests {
/* Number Cases */ ( /* Number Cases */ (
// HAPPY CASES // HAPPY CASES
vec!["1", "1.0", "#d1.1", "#o1423", "#b11"], vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
"#e1e1", "#i1/4", "+inf.0", "1e1"],
// SAD CASES // SAD CASES
vec!["1.1.1", "#o9", "#b1.01", "#xADADAD"] vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
), ),
/* Char Cases */ ( /* Char Cases */ (
// HAPPY CASES // HAPPY CASES
vec!["\\a", "\\t", "\\\"", "#\\t"], vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space", "#\\x20"],
// SAD CASES // SAD CASES
vec!["\\x20"] vec!["\\c", "\\x20"]
), ),
/* Identifier Cases */ ( /* Identifier Cases */ (
// HAPPY CASES // HAPPY CASES
vec!["...", "+", "+soup+", "<=?", "V17a", "->string", "a34kTMNs", vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
"lambda", "q", "list->vector", "|two words|", "|two\nwords|", "list->vector", "|two words|", "|two\nwords|",
"the-word-recursion-has-many-meanings"], "the-word-recursion-has-many-meanings"],
// SAD CASES // SAD CASES