/* Mycelium Scheme
 * Copyright (C) 2025 Ava Affine
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

use alloc::rc::Rc;
pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/',
                                     ':', '<', '=', '>', '?', '@', '^', '_', '~', '.'];
pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r'];
pub const NUMERICAL_EXTRA: [char; 3] = ['.', 'i', 'e'];
pub const NUMERICAL_BASE: [char; 3] = ['d', 'o', 'b'];

pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote";
pub const E_TOO_MANY_DECIMALS: &str = "number can only have one of {i e .}";
pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren";
pub const E_UNCLOSED_COMMENT: &str = "block comment has no end";
pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe";
pub const E_NO_END_TO_HASH: &str = "expected more input after hash";
pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_STRING_TRUNCATED: &str = "string literal is truncated";
pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis";
pub const E_UNIMPLEMENTED_HEX: &str = "hexadecimal literals not supported";
pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base";
pub const E_UNSUPPORTED_ESC: &str = "unsupported escape";
pub const E_BAD_DOT: &str = "expected space after dot in dotted notation";
pub const E_NO_SPLICE_TEMPL: &str = "expected more input after unquote splicing";
pub const E_INCOMPREHENSIBLE: &str = "token does not lex";
pub const E_END_OF_DOCUMENT: &str = "no additional input left in document";
/* LexError
 * 0: error string
 * 1: index into document
 */
#[derive(Clone)]
pub struct LexError(pub &'static str, pub usize);
#[repr(u8)]
#[derive(Debug, PartialEq)]
pub enum LexTokenType {
    String = 0,
    Number,
    Char,
    Symbol,
    VectorStart,
    ByteVectorStart,
    ListStart,
    CollectionEnd,
    Boolean,
    Dot,
    Comment,
    Directive,
    Quote,
    QuasiQuote,
    Unquote,
    UnquoteSpliceTemplate,
    NumTypes,
}
impl TryFrom<u8> for LexTokenType {
    type Error = &'static str;
    fn try_from(u: u8) -> Result<LexTokenType, Self::Error> {
        if u >= LexTokenType::NumTypes as u8 {
            Err("out of token type range")
        } else {
            // SAFETY: LexTokenType is repr(u8) and u is bounds-checked
            // against NumTypes above, so the transmute stays in range.
            unsafe { Ok(core::mem::transmute(u)) }
        }
    }
}
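
// For illustration, the mapping this yields (given the variant order above):
//     LexTokenType::try_from(0u8)  -> Ok(LexTokenType::String)
//     LexTokenType::try_from(16u8) -> Err("out of token type range")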
pub struct LexToken {
    token_type: LexTokenType,
    start_idx: usize,
    end_idx: usize,
    source_doc: Rc<str>,
}
pub struct Lexer {
    document: Rc<str>,
    current_index: usize,
    current_token_start: usize,
    has_error_state: Option<LexError>,
}
impl From<Rc<str>> for Lexer {
    fn from(s: Rc<str>) -> Lexer {
        Lexer {
            document: s,
            current_index: 0,
            current_token_start: 0,
            has_error_state: None,
        }
    }
}
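
// A minimal usage sketch (mirrors the tests at the bottom of this file):
// the lexer is an Iterator, so a parser can simply pull tokens from it.
//
//     let mut lx = Lexer::from(Rc::from("(+ 1 2)"));
//     while let Some(tok) = lx.next() {
//         /* hand tok to the parser */
//     }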
impl Iterator for Lexer {
    type Item = LexToken;

    fn next(&mut self) -> Option<Self::Item> {
        if self.has_error_state.is_some() {
            return None;
        }

        let res = self.seek_next_token();
        if let Err(ref e) = res {
            self.has_error_state = Some(e.clone());
        }

        res.ok()
    }
}
impl Lexer {
    // I just didn't want to write and rewrite this...
    // NOTE: byte-indexed, so this assumes the current character is ASCII.
    #[inline(always)]
    fn current_char(&self) -> char {
        self.document.as_bytes()[self.current_index] as char
    }

    #[inline(always)]
    fn peek_next_char(&self) -> Option<char> {
        self.document[self.current_index + 1..].chars().next()
    }
    #[inline(always)]
    fn advance_char(&mut self) -> Option<()> {
        self.current_index += 1;
        if let Some((idx, _)) = self.document[self.current_index..]
            .char_indices()
            .next() {
            self.current_index += idx;
            Some(())
        } else {
            self.current_index = self.document.len();
            None
        }
    }
    #[inline(always)]
    fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
        for expected in chunk.chars() {
            self.advance_char()?;
            if expected != self.current_char() {
                return Some(false)
            }
        }

        Some(true)
    }
    /* TODO
     * I figured this function would be useful for supporting hexadecimal
     * encoding later down the line. We can use this instead of the base
     * check in the number function.
    #[inline(always)]
    fn next_chars_allowed(&mut self, len: usize, allowed: &str) -> Option<bool> {
        let mut i = len;
        while i > 0 {
            if !allowed.contains(self.current_char()) {
                return Some(false)
            }

            i -= 1;
            self.advance_char()?;
        }

        Some(true)
    }
    */
    #[inline(always)]
    fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> {
        // the token's end index is the start of the next character, or the
        // end of the document when input is exhausted
        let next_idx = self.advance_char()
            .map(|_| self.current_index)
            .unwrap_or(self.document.len());

        let l = LexToken {
            token_type: t,
            start_idx: self.current_token_start,
            end_idx: next_idx,
            source_doc: self.document.clone(),
        };

        self.current_token_start = 0;
        Ok(l)
    }
    #[inline(always)]
    fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> {
        // TODO: support escaped quotes
        loop {
            if self.advance_char().is_none() {
                return Err(LexError(E_NO_MATCHING_QUOTE, self.current_token_start))
            } else if self.current_char() == '"' {
                return self.cut_new_token(LexTokenType::String)
            }
        }
    }
    #[inline(always)]
    fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> {
        let mut base = 10;
        let a = self.current_char();
        if NUMERICAL_BASE.contains(&a) {
            if self.advance_char().is_none() {
                return Err(LexError(E_NUMBER_TRUNCATED, self.current_token_start))
            }
            match a {
                'd' => base = 10,
                'o' => base = 8,
                'b' => base = 2,
                _ => (),
            }
        }

        let mut hasdot = false;
        loop {
            let a = self.current_char();
            if NUMERICAL_EXTRA.contains(&a) {
                if hasdot || base < 10 {
                    return Err(LexError(E_TOO_MANY_DECIMALS, self.current_token_start))
                }
                hasdot = true;

            } else if a == ' ' || a == ')' {
                // back up one so the delimiter is not re-consumed
                // TODO: '\n' and '\t' do not terminate a number yet
                self.current_index -= 1;
                return self.cut_new_token(LexTokenType::Number)

            } else if !a.is_numeric() {
                return Err(LexError(E_INCOMPREHENSIBLE, self.current_token_start))

            } else if a.to_digit(10).unwrap() >= base {
                return Err(LexError(E_NUMER_BASE_ERR, self.current_token_start))
            }

            if self.advance_char().is_none() {
                self.current_index = self.document.len() - 1;
                return self.cut_new_token(LexTokenType::Number)
            }
        }
    }
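
    // A quick map of the paths above (cf. the number cases in the tests):
    //     "1"     -> Number, base 10
    //     "#o17"  -> Number, base 8 (the 'o' was consumed before the digits)
    //     "#b2"   -> E_NUMER_BASE_ERR, since 2 is not a base-2 digit
    //     "#o1.1" -> E_TOO_MANY_DECIMALS, '.' is only legal in base 10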
    #[inline(always)]
    fn seek_end_of_block_comment(&mut self) -> Result<LexToken, LexError> {
        loop {
            if self.advance_char().is_none() {
                return Err(LexError(E_UNCLOSED_COMMENT, self.current_token_start))
            }

            match self.current_char() {
                // a '|' immediately followed by '#' closes the comment
                '|' if self.advance_char().is_some() && self.current_char() == '#' =>
                    return self.cut_new_token(LexTokenType::Comment),
                _ => continue,
            };
        }
    }
    #[inline(always)]
    fn seek_end_of_line_comment(&mut self, directive: bool) -> Result<LexToken, LexError> {
        loop {
            if self.advance_char().is_none() {
                return Err(LexError(E_UNCLOSED_COMMENT, self.current_token_start))
            }

            match self.current_char() {
                '\n' if !directive => return self.cut_new_token(LexTokenType::Comment),
                '\n' if directive => return self.cut_new_token(LexTokenType::Directive),
                _ => continue,
            };
        }
    }
    #[inline(always)]
    fn seek_closing_pipe(&mut self) -> Result<LexToken, LexError> {
        loop {
            if self.advance_char().is_none() {
                return Err(LexError(E_NO_CLOSING_PIPE, self.current_token_start));
            }

            let c = self.current_char();
            match c {
                '\\' => self.seek_end_of_escape(false)?,
                '|' => return self.cut_new_token(LexTokenType::Symbol),
                _ if c.is_alphanumeric() => continue,
                _ if LEX_SPECIAL.contains(&c) => continue,
                _ if c == ' ' || c == '\n' => continue,
                // quote case caught here
                _ => return Err(LexError(E_INCOMPREHENSIBLE, self.current_token_start)),
            };
        }
    }
    #[inline(always)]
    fn seek_end_from_hash(&mut self) -> Result<LexToken, LexError> {
        let c = self.advance_char().map(|_| self.current_char());
        if let Some(ch) = c {
            match ch {
                't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
                '|' => return self.seek_end_of_block_comment(),
                '!' => return self.seek_end_of_line_comment(true),
                'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
                    return self.cut_new_token(LexTokenType::ByteVectorStart),
                '(' => return self.cut_new_token(LexTokenType::VectorStart),
                '\\' => self.seek_end_of_escape(false)
                    .and_then(|_| self.cut_new_token(LexTokenType::Char)),
                'x' => return Err(LexError(E_UNIMPLEMENTED_HEX, self.current_index)),
                _ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(),
                _ => return Err(LexError(E_INCOMPREHENSIBLE, self.current_token_start)),
            }
        } else {
            Err(LexError(E_NO_END_TO_HASH, self.current_token_start))
        }
    }
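
    // Dispatch summary for the hash prefix (a reference sketch; cf. the
    // hash-flavored cases in the tests):
    //     "#t" / "#f" -> Boolean        "#u8("    -> ByteVectorStart
    //     "#("        -> VectorStart    "#|...|#" -> Comment
    //     "#d" / "#o" / "#b" -> Number with explicit base; "#x" errors for now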
    // DOES NOT RETURN A TOKEN:
    // only the caller knows what actually needs to be returned
    #[inline(always)]
    fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> {
        //let delim = if in_string { ';' } else { ' ' };
        // delim and the in_string argument will be useful once we support
        // hexadecimal encoding
        if self.advance_char().is_none() {
            let mut error_msg = E_CHAR_TRUNCATED;
            if in_string { error_msg = E_STRING_TRUNCATED; }
            return Err(LexError(error_msg, self.current_token_start))
        }

        match self.current_char() {
            // eat an escaped whitespace or delim
            ' ' | 'n' | 'r' | 't' | '|' | '\\' | '"' => (),
            'x' => return Err(LexError(E_UNIMPLEMENTED_HEX, self.current_token_start)),
            _ if self.current_char().is_alphabetic() => (),
            _ => return Err(LexError(E_UNSUPPORTED_ESC, self.current_index)),
        }

        Ok(())
    }
    /* Called to output a token by the iterator implementation.
     * I don't think this has to be inlined. The other ones are inlined to
     * prevent the process of parsing a token from being slowed down by
     * so many stack frames. This one is called once per token.
     */
    fn seek_next_token(&mut self) -> Result<LexToken, LexError> {
        let mut output: Option<Result<LexToken, LexError>> = None;

        if self.current_index >= self.document.len() {
            return Err(LexError(E_END_OF_DOCUMENT, self.document.len()));
        }

        while LEX_WHITESPACE.contains(&self.current_char()) {
            if self.advance_char().is_none() {
                return Err(LexError(E_END_OF_DOCUMENT, self.document.len()));
            }
        }

        self.current_token_start = self.current_index;
        // handle syntactic sugar cases
        match self.current_char() {
            ';' => output = Some(self.seek_end_of_line_comment(false)),
            '\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
            '`' => output = Some(self.cut_new_token(LexTokenType::QuasiQuote)),
            '(' => output = Some(self.cut_new_token(LexTokenType::ListStart)),
            ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
            '#' => output = Some(self.seek_end_from_hash()),
            '"' => output = Some(self.seek_end_of_string()),
            '\\' => output = Some(self.seek_end_of_escape(false)
                .and_then(|_| self.cut_new_token(LexTokenType::Char))),
            '|' => output = Some(self.seek_closing_pipe()),
            _ if self.current_char().is_numeric() =>
                output = Some(self.seek_end_of_number()),
            _ => (),
        }
        if output.is_none() {
            if self.current_char() == '.' {
                if let Some(x) = self.peek_next_char() && x == ' ' {
                    output = Some(self.cut_new_token(LexTokenType::Dot));
                } /* else {
                    output = Some(Err(LexError(E_BAD_DOT, self.current_index)));
                } SYKE! It could be a symbol... */
            }

            if self.current_char() == ',' {
                if let Some(x) = self.peek_next_char() && x == '@' {
                    // consume the '@' so the splice token covers ",@" and
                    // the '@' cannot leak into the next token
                    self.advance_char();
                    output = Some(self.cut_new_token(LexTokenType::UnquoteSpliceTemplate));
                } else {
                    output = Some(self.cut_new_token(LexTokenType::Unquote));
                }
            }
        }
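
        // e.g. ". " cuts a Dot above, while ".5" or ".foo" falls through to
        // the symbol loop below; ",@x" yields a ",@" splice token and "x"
        // then lexes separately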
        /* Broken out into a separate case to maintain precedence of the
         * unquote syntax and dotted notation.
         */
        if output.is_none() {
            loop {
                let c = self.current_char();
                if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
                    output = Some(Err(LexError(E_INCOMPREHENSIBLE, self.current_index)));
                    break;
                }

                if let Some(c) = self.peek_next_char() {
                    if c == ' ' || c == ')' {
                        output = Some(self.cut_new_token(LexTokenType::Symbol));
                        break;
                    }

                    self.advance_char().unwrap();
                } else {
                    output = Some(self.cut_new_token(LexTokenType::Symbol));
                    break;
                }
            }
        }
        if let Some(ref res) = output {
            if let Err(ref e) = res {
                self.has_error_state = Some(e.clone());
            }
        }

        output.unwrap()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_evaluations() {
        // indexed by LexTokenType
        let cases: [(Vec<&str>, Vec<&str>); LexTokenType::NumTypes as usize] = [
            /* String Cases */ (
                // HAPPY CASES
                vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"",
                     "\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\""],

                // SAD CASES
                vec!["\"sdf"]
            ),

            /* Number Cases */ (
                // HAPPY CASES
                vec!["1", "1.0", "#d1.1", "#o1423", "#b11"],

                // SAD CASES
                vec!["1.1.1", "#o9", "#b1.01", "#xADADAD"]
            ),

            /* Char Cases */ (
                // HAPPY CASES
                vec!["\\a", "\\t", "\\\"", "#\\t"],

                // SAD CASES
                vec!["\\x20"]
            ),

            /* Identifier Cases */ (
                // HAPPY CASES
                vec!["...", "+", "+soup+", "<=?", "V17a", "->string", "a34kTMNs",
                     "lambda", "q", "list->vector", "|two words|", "|two\nwords|",
                     "the-word-recursion-has-many-meanings"],

                // SAD CASES
                vec!["|\"\"|", "|(|", "|valid"]
            ),
            /* Vector Start Cases */ (
                // HAPPY CASES
                vec!["#("],

                // SAD CASES
                vec![]
            ),

            /* Byte Vector Cases */ (
                // HAPPY CASES
                vec!["#u8("],

                // SAD CASES
                vec!["#u8", "#u9", "#u("]
            ),

            /* List Start Cases */ (
                // HAPPY CASES
                vec!["("],

                // SAD CASES
                vec![]
            ),

            /* Collection End Cases */ (
                // HAPPY CASES
                vec![")"],

                // SAD CASES
                vec![]
            ),

            /* Boolean Cases */ (
                // HAPPY CASES
                vec!["#t", "#f"],

                // SAD CASES
                vec![]
            ),
            /* Dot Cases */ (
                // HAPPY CASES
                vec![" . "],

                // SAD CASES
                vec![]
            ),

            /* Comment cases */ (
                // HAPPY CASES
                vec!["; (\n", "; #\n", ";\"\n", "#| ; ( \" |#"],

                // SAD CASES
                vec!["#|", "; "]
            ),

            /* Directive cases */ (
                // HAPPY CASES
                vec!["#!test-directive\n"],

                // SAD CASES
                vec!["#!test-directive"]
            ),

            /* Quote cases */ (
                // HAPPY CASES
                vec!["'"],

                // SAD CASES
                vec![]
            ),

            /* QuasiQuote cases */ (
                // HAPPY CASES
                vec!["`"],

                // SAD CASES
                vec![]
            ),

            /* Unquote cases */ (
                // HAPPY CASES
                vec![",x", ","],

                // SAD CASES
                vec![]
            ),

            /* UnquoteSpliceTemplate cases */ (
                // HAPPY CASES
                vec![",@x", ",@(", ",@"],

                // SAD CASES
                // a bare "," lexes successfully as Unquote (see the happy
                // cases above), so it cannot serve as a sad case here
                vec![]
            ),
        ];
        let no_subtoken_check_cases = [
            LexTokenType::Dot as u8,
            LexTokenType::Unquote as u8,
            LexTokenType::UnquoteSpliceTemplate as u8
        ];
        cases.iter().enumerate().for_each(|(idx, case)| {
            println!("+ Testing {:#?} Cases...", LexTokenType::try_from(idx as u8).unwrap());

            case.0.iter()
                .for_each(|subcase| {
                    println!("  - happy case: {}", subcase);
                    let token = Lexer::from(Rc::from(*subcase))
                        .next()
                        .unwrap();
                    assert_eq!(token.token_type,
                               LexTokenType::try_from(idx as u8)
                                   .unwrap());
                    if no_subtoken_check_cases.contains(&(idx as u8)) {
                        /* DO NOTHING: these subcase strings are supersets of
                         * the actual token substring, so the slice check
                         * below would not apply
                         */
                    } else {
                        assert_eq!(&token.source_doc[token.start_idx..token.end_idx],
                                   *subcase)
                    }
                });

            case.1.iter()
                .for_each(|subcase| {
                    println!("  - sad case: {}", subcase);
                    assert!(Lexer::from(Rc::from(*subcase)).next().is_none())
                });
        });
    }
    #[test]
    fn test_multi_token_iter() {
        // plain collect() instead of the nightly-only collect_into()
        let res: Vec<LexToken> = Lexer::from(Rc::from("( one two three )"))
            .collect();
        assert_eq!(res.len(), 5);

        assert_eq!(res[0].token_type, LexTokenType::ListStart);
        assert_eq!(res[0].start_idx, 0);
        assert_eq!(res[0].end_idx, 1);
        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");

        assert_eq!(res[1].token_type, LexTokenType::Symbol);
        assert_eq!(res[1].start_idx, 2);
        assert_eq!(res[1].end_idx, 5);
        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "one");

        assert_eq!(res[2].token_type, LexTokenType::Symbol);
        assert_eq!(res[2].start_idx, 6);
        assert_eq!(res[2].end_idx, 9);
        assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], "two");

        assert_eq!(res[3].token_type, LexTokenType::Symbol);
        assert_eq!(res[3].start_idx, 10);
        assert_eq!(res[3].end_idx, 15);
        assert_eq!(&res[3].source_doc[res[3].start_idx..res[3].end_idx], "three");

        assert_eq!(res[4].token_type, LexTokenType::CollectionEnd);
        assert_eq!(res[4].start_idx, 16);
        assert_eq!(res[4].end_idx, 17);
        assert_eq!(&res[4].source_doc[res[4].start_idx..res[4].end_idx], ")");
    }
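
    // A small end-to-end sketch: the sugar tokens interleave with ordinary
    // tokens in a quasiquote template exactly like the single-token cases.
    #[test]
    fn test_quasiquote_sugar_sketch() {
        let toks: Vec<LexToken> = Lexer::from(Rc::from("`(a ,b)")).collect();
        assert_eq!(toks.len(), 6);
        assert_eq!(toks[0].token_type, LexTokenType::QuasiQuote);
        assert_eq!(toks[1].token_type, LexTokenType::ListStart);
        assert_eq!(toks[2].token_type, LexTokenType::Symbol);
        assert_eq!(toks[3].token_type, LexTokenType::Unquote);
        assert_eq!(toks[4].token_type, LexTokenType::Symbol);
        assert_eq!(toks[5].token_type, LexTokenType::CollectionEnd);
    }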
    #[test]
    fn test_error_state_blocking() {
        let mut l = Lexer::from(Rc::from("( 1 2.2.2 valid_token"));

        assert!(l.next().is_some());
        assert!(l.next().is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
    }
}