All checks were successful
per-push tests / build (push) Successful in 38s
per-push tests / test-frontend (push) Successful in 31s
per-push tests / test-utility (push) Successful in 34s
per-push tests / timed-decomposer-parse (push) Successful in 26s
per-push tests / test-backend (push) Successful in 30s
Signed-off-by: Ava Affine <ava@sunnypup.io>
900 lines
30 KiB
Rust
/* Mycelium Scheme
 * Copyright (C) 2025 Ava Affine
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

use core::fmt;
use alloc::rc::Rc;

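/* Character classes used by the lexer:
 * - LEX_SPECIAL: non-alphanumeric characters permitted in symbols
 * - LEX_WHITESPACE: skipped between tokens
 * - NUMERICAL_EXTRA: non-digit characters that may appear inside a number
 *   literal (unreferenced in this file; presumably kept public for the parser)
 * - NUMERICAL_BASE: radix prefix characters that can follow '#'
 * - TOK_DELIMITERS: characters that end the current token
 */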
pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/',
                                     ':', '<', '=', '>', '?', '@', '^', '_', '~', '.'];
pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r'];
pub const NUMERICAL_EXTRA: [char; 4] = ['.', 'i', 'e', '/'];
pub const NUMERICAL_BASE: [char; 4] = ['d', 'o', 'b', 'x'];
pub const TOK_DELIMITERS: [char; 5] = [')', ' ', '\t', '\n', '\r'];

pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote";
pub const E_TOO_MANY_DECIMALS: &str = "number can only have one dot";
pub const E_TOO_MANY_SLASH: &str = "number can only have one slash";
pub const E_TOO_MANY_E: &str = "number can only have one e";
pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren";
pub const E_UNCLOSED_COMMENT: &str = "block comment has no end";
pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe";
pub const E_NO_END_TO_HASH: &str = "expected more input after hash";
pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_STRING_TRUNCATED: &str = "string literal is truncated";
pub const E_UNDELIMITED_ESC: &str = "char escape is not delimited";
pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis";
pub const E_CHAR_TOO_LONG: &str = "character literal is too long";
pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base";
pub const E_UNSUPPORTED_ESC: &str = "unsupported escape";
pub const E_BAD_DOT: &str = "expected space after dot in dotted notation";
pub const E_BAD_HEX: &str = "character is not valid hexadecimal notation";
pub const E_INCOMPREHENSIBLE: &str = "token does not lex";
pub const E_END_OF_DOCUMENT: &str = "no additional input left in document";

/* LexError
 * 0: error string
 * 1: index into document
 * 2: document in question
 */
#[derive(Clone)]
pub struct LexError(pub &'static str, pub usize, pub Rc<str>);

impl fmt::Display for LexError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let err_snippet_start = || -> usize {
            /* backtrack from the current index until we hit either
             * - the beginning of the line
             * - 25 characters back
             * - the start of the document
             */
            if self.1 < 25 {
                0
            } else {
                let mut idx = self.1;
                while self.1 - idx < 25 {
                    idx -= 1;
                    // the first char of the slice is the char at idx
                    if self.2[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        idx += 1;
                        break;
                    }
                }

                idx
            }
        };

        let err_snippet_end = || -> usize {
            /* read through the document until we hit either
             * - the end of the line
             * - 25 characters forward
             * - the end of the document
             */
            if self.2.len() - self.1 < 25 {
                self.2.len()
            } else {
                let mut idx = self.1;
                while idx - self.1 < 25 {
                    idx += 1;
                    if self.2[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        break;
                    }
                }

                idx
            }
        };

        write!(f, "Error when lexing document here: (idx: {})\n", self.1)?;
        let s = err_snippet_start();
        let st = self.1 - s;
        write!(f, " {}\n", &self.2[s..err_snippet_end()])?;
        write!(f, " {}^\n", " ".repeat(st))?;
        write!(f, "Error: {}\n", self.0)
    }
}

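/* Illustrative rendering via Display (assuming the document `(foo "bar`
 * failed with E_NO_MATCHING_QUOTE at index 5):
 *
 *     Error when lexing document here: (idx: 5)
 *      (foo "bar
 *           ^
 *     Error: couldn't find matching quote
 */
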
#[repr(u8)]
#[derive(Debug, PartialEq, Clone)]
pub enum LexTokenType {
    String = 0,
    Number,
    Char,
    Symbol,
    VectorStart,
    ByteVectorStart,
    ListStart,
    CollectionEnd,
    Boolean,
    Dot,
    Comment,
    Directive,
    Quote,
    QuasiQuote,
    Unquote,
    UnquoteSplice,
    NumTypes,
}

impl TryFrom<u8> for LexTokenType {
    type Error = &'static str;
    fn try_from(u: u8) -> Result<LexTokenType, Self::Error> {
        if u >= LexTokenType::NumTypes as u8 {
            Err("out of token type range")
        } else {
            // SAFETY: LexTokenType is repr(u8) with contiguous discriminants
            // starting at 0, and u was checked against NumTypes above
            unsafe { Ok(core::mem::transmute(u)) }
        }
    }
}

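/* Illustrative round-trip through the discriminant (Symbol = 3):
 *
 *     assert_eq!(LexTokenType::try_from(3u8), Ok(LexTokenType::Symbol));
 *     assert!(LexTokenType::try_from(LexTokenType::NumTypes as u8).is_err());
 */
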
/* A token is a typed span: source_doc[start_idx..end_idx] is the lexeme. */
#[derive(Clone)]
pub struct LexToken {
    pub token_type: LexTokenType,
    pub start_idx: usize,
    pub end_idx: usize,
    pub source_doc: Rc<str>,
}

pub struct Lexer {
    document: Rc<str>,
    current_index: usize,
    current_token_start: usize,
    pub has_error_state: Option<LexError>,
}

impl From<Rc<str>> for Lexer {
    fn from(s: Rc<str>) -> Lexer {
        Lexer {
            document: s,
            current_index: 0,
            current_token_start: 0,
            has_error_state: None,
        }
    }
}

impl Iterator for Lexer {
    type Item = LexToken;

    fn next(&mut self) -> Option<Self::Item> {
        // once an error (including plain end-of-document) has been
        // recorded, the iterator is fused: it yields None forever after
        if self.has_error_state.is_some() {
            return None;
        }

        let res = self.seek_next_token();
        if let Err(e) = &res {
            self.has_error_state = Some(e.clone());
        }

        res.ok()
    }
}

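/* Illustrative usage (mirrors the tests at the bottom of this file):
 *
 *     let mut lexer = Lexer::from(Rc::from("( one two )"));
 *     for tok in &mut lexer {
 *         let lexeme = &tok.source_doc[tok.start_idx..tok.end_idx];
 *         // dispatch on tok.token_type / lexeme here
 *     }
 *     // after iteration stops, has_error_state says why:
 *     // E_END_OF_DOCUMENT for clean exhaustion, a real lex error otherwise
 */
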
impl Lexer {
    // I just didn't want to write and rewrite this...
    // NOTE: reads a single byte, so it is only correct for ASCII documents
    #[inline(always)]
    fn current_char(&mut self) -> char {
        self.document.as_bytes()[self.current_index] as char
    }

    #[inline(always)]
    fn peek_next_char(&mut self) -> Option<char> {
        self.document[self.current_index + 1..]
            .chars()
            .next()
    }

    #[inline(always)]
    fn advance_char(&mut self) -> Option<()> {
        self.current_index += 1;
        if self.current_index >= self.document.len() {
            return None
        }

        if let Some((idx, _)) = self.document[self.current_index..]
            .char_indices()
            .next() {

            self.current_index = idx + self.current_index;
            Some(())

        } else {
            self.current_index = self.document.len();
            None
        }
    }

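    /* Attempt to match every character of `chunk` immediately after the
     * current character. The contract, as used throughout this file:
     *   - Some(true): matched; the index rests on the last matched char
     *     (restored to the start when `peek` is set)
     *   - Some(false): mismatch; the index is restored
     *   - None: the document ran out; the index is restored
     */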
    #[inline(always)]
    fn match_chunk_next(&mut self, chunk: &str, peek: bool) -> Option<bool> {
        let saved = self.current_index;
        for i in chunk.chars() {
            if let None = self.advance_char() {
                self.current_index = saved;
                return None
            }

            if i != self.current_char() {
                self.current_index = saved;
                return Some(false)
            }
        }

        if peek { self.current_index = saved; }
        Some(true)
    }

    #[inline(always)]
    fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> {
        // consume the final character of the token; end_idx is exclusive,
        // so a failed advance means the token runs to the end of the doc
        let next_idx = self.advance_char()
            .map(|_| self.current_index)
            .unwrap_or(self.document.len());

        let l = LexToken {
            token_type: t,
            start_idx: self.current_token_start,
            end_idx: next_idx,
            source_doc: self.document.clone(),
        };

        self.current_token_start = 0;
        Ok(l)
    }

    #[inline(always)]
    fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_NO_MATCHING_QUOTE,
                        self.current_token_start, self.document.clone()))

            } else if self.current_char() == '\\' {
                self.seek_end_of_escape(true)?;

            } else if self.current_char() == '"' {
                return self.cut_new_token(LexTokenType::String)
            }
        }
    }

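    /* Number shapes accepted here (see the test cases at the bottom):
     * "1", "1.0", "1e1", "-1", "#d1.1", "#o1423", "#b11", "#xDF",
     * "#e1e1", "#i1/4", "+inf.0". A dot, slash, or exponent in a base
     * below 10 is an error, as is more than one of any of them.
     */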
    #[inline(always)]
    fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> {
        let mut base = 10;
        let a = self.current_char();

        if let Some(true) = self.match_chunk_next("inf.0", false) {
            return self.cut_new_token(LexTokenType::Number)
        }

        if let Some(true) = self.match_chunk_next("nan.0", false) {
            return self.cut_new_token(LexTokenType::Number)
        }

        if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' || a == '+' || a == '-' {
            if let None = self.advance_char() {
                return Err(LexError(E_NUMBER_TRUNCATED,
                        self.current_token_start, self.document.clone()))

            // a prefix or sign followed immediately by a delimiter is a
            // truncated number ("#x" or "+" with nothing after it)
            } else if TOK_DELIMITERS.contains(&self.current_char()) {
                return Err(LexError(E_NUMBER_TRUNCATED,
                        self.current_token_start, self.document.clone()))
            }

            match a {
                'x' => base = 16,
                'd' => base = 10,
                'o' => base = 8,
                'b' => base = 2,
                // ignore i or e, number parsers will handle that
                _ => (),
            }
        }

        let mut hasdot = false;
        let mut hasslash = false;
        let mut hase = false;
        loop {
            let a = self.current_char();
            if a == '.' {
                if hasdot || base < 10 {
                    return Err(LexError(E_TOO_MANY_DECIMALS,
                            self.current_token_start, self.document.clone()))
                }
                hasdot = true;

            } else if a == '/' {
                if hasslash || base < 10 {
                    return Err(LexError(E_TOO_MANY_SLASH,
                            self.current_token_start, self.document.clone()))
                }
                hasslash = true;

            // in base 16, 'e' is an ordinary digit, not an exponent marker
            } else if a == 'e' && base != 16 {
                if hase || base < 10 {
                    return Err(LexError(E_TOO_MANY_E,
                            self.current_token_start, self.document.clone()))
                }
                hase = true

            } else if TOK_DELIMITERS.contains(&a) {
                // back up one
                self.current_index -= 1;
                return self.cut_new_token(LexTokenType::Number)

            } else if let None = a.to_digit(base) {
                return Err(LexError(E_NUMER_BASE_ERR,
                        self.current_token_start, self.document.clone()))
            }

            if let None = self.advance_char() {
                self.current_index = self.document.len() - 1;
                return self.cut_new_token(LexTokenType::Number)
            }
        }
    }

    #[inline(always)]
    fn seek_end_of_block_comment(&mut self) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_UNCLOSED_COMMENT,
                        self.current_token_start, self.document.clone()))
            }

            match self.current_char() {
                // peek rather than advance so that a '|' not followed by
                // '#' can still start the next candidate terminator
                '|' if self.peek_next_char() == Some('#') => {
                    self.advance_char();
                    return self.cut_new_token(LexTokenType::Comment)
                },
                _ => continue,
            };
        }
    }

    #[inline(always)]
    fn seek_end_of_line_comment(&mut self, directive: bool) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_UNCLOSED_COMMENT,
                        self.current_token_start, self.document.clone()))
            }

            match self.current_char() {
                '\n' if !directive => return self.cut_new_token(LexTokenType::Comment),
                '\n' if directive => return self.cut_new_token(LexTokenType::Directive),
                _ => continue,
            };
        }
    }

    #[inline(always)]
    fn seek_closing_pipe(&mut self) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_NO_CLOSING_PIPE,
                        self.current_token_start, self.document.clone()));
            }

            let c = self.current_char();
            match c {
                '\\' => self.seek_end_of_escape(false)?,
                '|' => return self.cut_new_token(LexTokenType::Symbol),
                _ if c.is_alphanumeric() => continue,
                _ if LEX_SPECIAL.contains(&c) => continue,
                _ if c == ' ' || c == '\n' => continue,
                // quote case caught here
                _ => return Err(LexError(E_INCOMPREHENSIBLE,
                        self.current_token_start, self.document.clone())),
            };
        }
    }

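    /* '#' begins many forms; dispatch is on the character following it:
     *   #t / #f  -> boolean          #|   -> block comment
     *   #!       -> directive        #u8( -> bytevector start
     *   #(       -> vector start     #\   -> character
     *   #b #o #d #x #i #e -> number prefixes
     */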
    #[inline(always)]
    fn seek_end_from_hash(&mut self) -> Result<LexToken, LexError> {
        let c = self.advance_char().map(|_| self.current_char());
        if let Some(ch) = c {
            match ch {
                't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
                '|' => return self.seek_end_of_block_comment(),
                '!' => return self.seek_end_of_line_comment(true),
                'u' if self.match_chunk_next("8(", false).is_some_and(|x| x) =>
                    return self.cut_new_token(LexTokenType::ByteVectorStart),
                '(' => return self.cut_new_token(LexTokenType::VectorStart),
                '\\' => self.seek_end_of_escape(false)
                    .and_then(|_| self.cut_new_token(LexTokenType::Char)),
                _ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(),
                'i' | 'e' => return self.seek_end_of_number(),
                _ => return Err(LexError(E_INCOMPREHENSIBLE,
                        self.current_token_start, self.document.clone())),
            }
        } else {
            Err(LexError(E_NO_END_TO_HASH, self.current_token_start, self.document.clone()))
        }
    }

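    /* Escape forms recognized below:
     *   in character literals: #\alarm #\backspace #\delete #\escape
     *     #\newline #\null #\return #\space #\tab, plus single characters
     *   in strings: \a \b \t \n \r \" \\
     *   in both: \xHH with exactly two hex digits
     */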
    // DOES NOT RETURN A TOKEN.......
    // only the caller knows what actually needs to be returned
    #[inline(always)]
    fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> {
        // little helper to deduplicate logic for advancing characters
        macro_rules! adv {
            () => {
                if let None = self.advance_char() {
                    let mut error_msg = E_CHAR_TRUNCATED;
                    if in_string { error_msg = E_STRING_TRUNCATED; }
                    Err(LexError(error_msg, self.current_token_start,
                            self.document.clone()))
                } else { Ok(()) }
            };
        }

        let delim = |x| -> bool {
            in_string || TOK_DELIMITERS.contains(&x)
        };

        // advance char once
        adv!()?;

        /* if match_chunk_next fails then the index is unmoved,
         * allowing us to treat this like a single char escape
         */
        match self.current_char() {
            // char escapes
            'a' if !in_string => self.match_chunk_next("larm", false),
            'b' if !in_string => self.match_chunk_next("ackspace", false),
            'd' if !in_string => self.match_chunk_next("elete", false),
            'e' if !in_string => self.match_chunk_next("scape", false),
            // try "newline" first and only fall back to "null" when it did
            // not match (a plain `.or` would also run eagerly and discard
            // the fallback on Some(false))
            'n' if !in_string => match self.match_chunk_next("ewline", false) {
                Some(true) => Some(true),
                _ => self.match_chunk_next("ull", false),
            },
            'r' if !in_string => self.match_chunk_next("eturn", false),
            's' if !in_string => self.match_chunk_next("pace", false),
            't' if !in_string => self.match_chunk_next("ab", false),
            // specifically catch a non hex 'x' character escape
            'x' if self.peek_next_char()
                .is_none_or(|c| TOK_DELIMITERS.contains(&c)) && !in_string
                => None,

            // string escapes
            'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,

            // both
            'x' => {
                // we look for TWO hex digits
                adv!()?;
                self.current_char().to_digit(16)
                    .ok_or(LexError(E_BAD_HEX, self.current_index,
                            self.document.clone()))?;
                adv!()?;
                self.current_char().to_digit(16)
                    .ok_or(LexError(E_BAD_HEX, self.current_index,
                            self.document.clone()))?;
                None
            },

            // catchalls
            _ if !in_string => None,
            _ => return Err(LexError(E_UNSUPPORTED_ESC, self.current_index,
                    self.document.clone())),
        };

        let saved_idx = self.current_index;
        if saved_idx == self.document.len() - 1 {
            return Ok(())
        }

        // make sure next character is a proper delimiter
        adv!().and_then(|_| if !delim(self.current_char()) {
            Err(LexError(E_UNDELIMITED_ESC, self.current_index,
                    self.document.clone()))
        } else { self.current_index = saved_idx; Ok(()) })
    }

    /* Called to output a token by the iterator implementation.
     * I don't think this has to be inlined. The other ones are inlined to
     * prevent the process of parsing a token from being slowed down by
     * so many stack frames. This one is called once per token.
     */
    fn seek_next_token(&mut self) -> Result<LexToken, LexError> {
        let mut output: Option<Result<LexToken, LexError>> = None;

        if self.current_index >= self.document.len() {
            return Err(LexError(E_END_OF_DOCUMENT,
                    self.document.len(), self.document.clone()));
        }

        while LEX_WHITESPACE.contains(&self.current_char()) {
            if let None = self.advance_char() {
                return Err(LexError(E_END_OF_DOCUMENT,
                        self.document.len(), self.document.clone()));
            }
        }

        self.current_token_start = self.current_index;

        // true when $x starts a number: a digit, or the body of an
        // "inf.0" / "nan.0" literal (checked without moving the index)
        macro_rules! numeric {
            ( $x:expr ) => {
                $x.is_numeric()
                    || self.match_chunk_next("inf.0", true) == Some(true)
                    || self.match_chunk_next("nan.0", true) == Some(true)
            };
        }

        match self.current_char() {
            ';' => output = Some(self.seek_end_of_line_comment(false)),
            '\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
            '`' => output = Some(self.cut_new_token(LexTokenType::QuasiQuote)),
            '(' => output = Some(self.cut_new_token(LexTokenType::ListStart)),
            ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
            '#' => output = Some(self.seek_end_from_hash()),
            '"' => output = Some(self.seek_end_of_string()),
            '|' => output = Some(self.seek_closing_pipe()),
            '+' | '-' if self.peek_next_char()
                .is_some_and(|x| numeric!(x)) => output = Some(self.seek_end_of_number()),
            _ if self.current_char().is_numeric() => output =
                Some(self.seek_end_of_number()),
            _ => (),
        }

        if output.is_none() {
            if self.current_char() == '.' {
                if let Some(x) = self.peek_next_char() && x == ' ' {
                    output = Some(self.cut_new_token(LexTokenType::Dot));
                } /* else {
                    output = Some(Err(LexError(E_BAD_DOT, self.current_index)));
                } SYKE! It could be a symbol... */
            }

            if self.current_char() == ',' {
                if let Some(x) = self.peek_next_char() && x == '@' {
                    self.advance_char();
                    output = Some(self.cut_new_token(LexTokenType::UnquoteSplice));
                } else {
                    output = Some(self.cut_new_token(LexTokenType::Unquote));
                }
            }
        }

        /* Broken out into a separate case to maintain precedence of the
         * unquote syntax and dotted notation.
         */
        if output.is_none() {
            loop {
                let c = self.current_char();
                if !c.is_alphanumeric() &&
                    !LEX_SPECIAL.contains(&c) &&
                    !TOK_DELIMITERS.contains(&c) {

                    output = Some(Err(LexError(E_INCOMPREHENSIBLE,
                            self.current_index, self.document.clone())));
                    break;
                }

                if let Some(c) = self.peek_next_char() {
                    if TOK_DELIMITERS.contains(&c) {
                        output = Some(self.cut_new_token(LexTokenType::Symbol));
                        break;
                    }

                    self.advance_char().unwrap();
                } else {
                    output = Some(self.cut_new_token(LexTokenType::Symbol));
                    break;
                }
            }
        }

        if let Some(ref res) = output {
            if let Err(e) = &res {
                self.has_error_state = Some(e.clone());
            }
        }

        output.unwrap()
    }
}

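/* The tests below drive the lexer over happy and sad inputs for every
 * token type. Note: they use `collect_into`, which is unstable at the
 * time of writing (`iter_collect_into`), so the suite assumes a nightly
 * toolchain.
 */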
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_evaluations() {
        // indexed by LexTokenType
        let cases: [(Vec<&str>, Vec<&str>); LexTokenType::NumTypes as usize] = [
            /* String Cases */ (
                // HAPPY CASES
                vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"",
                    "\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"",
                    "\"\"", "\"\\\" \\\"\""],

                // SAD CASES
                vec!["\"sdf"]
            ),

            /* Number Cases */ (
                // HAPPY CASES
                vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
                    "#e1e1", "#i1/4", "+inf.0", "1e1", "-1"],

                // SAD CASES
                vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
            ),

            /* Char Cases */ (
                // HAPPY CASES
                vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
                    "#\\alarm", "#\\s", "#\\x20", "#\\x", "#\\\\"],

                // SAD CASES
                vec!["\\c", "\\x20"]
            ),

            /* Identifier Cases */ (
                // HAPPY CASES
                vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
                    "list->vector", "|two words|", "|two\nwords|",
                    "the-word-recursion-has-many-meanings", "+", "-",
                    "slatex.*slatex*"],

                // SAD CASES
                vec!["|\"\"|", "|(|", "|valid"]
            ),

            /* Vector Start Cases */ (
                // HAPPY CASES
                vec!["#("],

                // SAD CASES
                vec![]
            ),

            /* Byte Vector Cases */ (
                // HAPPY CASES
                vec!["#u8("],

                // SAD CASES
                vec!["#u8", "#u9", "#u("]
            ),

            /* List Start Cases */ (
                // HAPPY CASES
                vec!["("],

                // SAD CASES
                vec![]
            ),

            /* Collection End Cases */ (
                // HAPPY CASES
                vec![")"],

                // SAD CASES
                vec![]
            ),

            /* Boolean Cases */ (
                // HAPPY CASES
                vec!["#t", "#f"],

                // SAD CASES
                vec![]
            ),

            /* Dot Cases */ (
                // HAPPY CASES
                vec![" . "],

                // SAD CASES
                vec![]
            ),

            /* Comment Cases */ (
                // HAPPY CASES
                vec!["; (\n", "; #\n", ";\"\n", "#| ; ( \" |#"],

                // SAD CASES
                vec!["#|", "; "]
            ),

            /* Directive Cases */ (
                // HAPPY CASES
                vec!["#!test-directive\n"],

                // SAD CASES
                vec!["#!test-directive"]
            ),

            /* Quote Cases */ (
                // HAPPY CASES
                vec!["'"],

                // SAD CASES
                vec![]
            ),

            /* QuasiQuote Cases */ (
                // HAPPY CASES
                vec!["`"],

                // SAD CASES
                vec![]
            ),

            /* Unquote Cases */ (
                // HAPPY CASES
                vec![",x", ","],

                // SAD CASES
                vec![]
            ),

            /* UnquoteSplice Cases */ (
                // HAPPY CASES
                vec![",@x", ",@(", ",@", ",@(two)"],

                // SAD CASES
                vec![]
            ),
        ];

        let no_subtoken_check_cases = [
            LexTokenType::Dot as u8,
            LexTokenType::Unquote as u8,
            LexTokenType::UnquoteSplice as u8
        ];

        cases.iter().enumerate().for_each(|(idx, case)| {
            println!("+ Testing {:#?} Cases...", LexTokenType::try_from(idx as u8).unwrap());

            case.0.iter()
                .for_each(|subcase| {
                    println!("  - happy case: {}", subcase);
                    let token = Lexer::from(Rc::from(*subcase))
                        .next()
                        .unwrap();
                    assert_eq!(token.token_type,
                            LexTokenType::try_from(idx as u8)
                                .unwrap());
                    if no_subtoken_check_cases.contains(&(idx as u8)) {
                        /* DO NOTHING: for these token types the subcase
                         * string is a superset of the actual token
                         * substring, so there is nothing to compare
                         */
                    } else {
                        assert_eq!(&token.source_doc[token.start_idx..token.end_idx],
                                *subcase)
                    }
                });

            case.1.iter()
                .for_each(|subcase| {
                    println!("  - sad case: {}", subcase);
                    assert!(Lexer::from(Rc::from(*subcase)).next().is_none())
                });
        });
    }

    #[test]
    fn test_multi_token_iter() {
        let mut res = vec![];
        Lexer::from(Rc::from("( one two three )"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 5);

        assert_eq!(res[0].token_type, LexTokenType::ListStart);
        assert_eq!(res[0].start_idx, 0);
        assert_eq!(res[0].end_idx, 1);
        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");

        assert_eq!(res[1].token_type, LexTokenType::Symbol);
        assert_eq!(res[1].start_idx, 2);
        assert_eq!(res[1].end_idx, 5);
        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "one");

        assert_eq!(res[2].token_type, LexTokenType::Symbol);
        assert_eq!(res[2].start_idx, 6);
        assert_eq!(res[2].end_idx, 9);
        assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], "two");

        assert_eq!(res[3].token_type, LexTokenType::Symbol);
        assert_eq!(res[3].start_idx, 10);
        assert_eq!(res[3].end_idx, 15);
        assert_eq!(&res[3].source_doc[res[3].start_idx..res[3].end_idx], "three");

        assert_eq!(res[4].token_type, LexTokenType::CollectionEnd);
        assert_eq!(res[4].start_idx, 16);
        assert_eq!(res[4].end_idx, 17);
        assert_eq!(&res[4].source_doc[res[4].start_idx..res[4].end_idx], ")");
    }

    #[test]
    fn test_error_state_blocking() {
        let mut l = Lexer::from(Rc::from("( 1 2.2.2 valid_token"))
            .into_iter();

        assert!(l.next().is_some());
        assert!(l.next().is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
    }

    #[test]
    fn char_lex_with_close() {
        let mut res = vec![];
        Lexer::from(Rc::from("(#\\a)"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 3);

        assert_eq!(res[0].token_type, LexTokenType::ListStart);
        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");

        assert_eq!(res[1].token_type, LexTokenType::Char);
        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "#\\a");

        assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
        assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
    }

    #[test]
    fn num_lex_plusnum_case() {
        let mut res = vec![];
        Lexer::from(Rc::from("+1"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 1);
        assert_eq!(res[0].token_type, LexTokenType::Number);
    }

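    // An added sketch exercising the signed "nan.0"/"inf.0" guard in
    // seek_next_token (companion to num_lex_plusnum_case above)
    #[test]
    fn num_lex_signed_nan_case() {
        let mut res = vec![];
        Lexer::from(Rc::from("-nan.0"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 1);
        assert_eq!(res[0].token_type, LexTokenType::Number);
    }
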
    #[test]
    fn char_lex_xchar_case() {
        let mut res = vec![];
        Lexer::from(Rc::from("#\\x)"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 2);

        assert_eq!(res[0].token_type, LexTokenType::Char);
        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "#\\x");

        assert_eq!(res[1].token_type, LexTokenType::CollectionEnd);
        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], ")");
    }
}