/*  Mycelium Scheme
 *  Copyright (C) 2025 Ava Affine
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

use core::fmt;
use alloc::rc::Rc;

pub const LEX_SPECIAL:     [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/',
     ':', '<', '=', '>', '?', '@', '^', '_', '~', '.'];
pub const LEX_WHITESPACE:  [char; 4] = [' ', '\n', '\t', '\r'];
pub const NUMERICAL_EXTRA: [char; 3] = ['.', 'i', 'e'];
pub const NUMERICAL_BASE:  [char; 3] = ['d', 'o', 'b'];

pub const E_NO_MATCHING_QUOTE:  &str = "couldn't find matching quote";
pub const E_TOO_MANY_DECIMALS:  &str = "number can only have one of {i e .}";
pub const E_NO_MATCHING_PAREN:  &str = "couldn't find matching paren";
pub const E_UNCLOSED_COMMENT:   &str = "block comment has no end";
pub const E_NO_CLOSING_PIPE:    &str = "expected a closing pipe";
pub const E_NO_END_TO_HASH:     &str = "expected more input after hash";
pub const E_NUMBER_TRUNCATED:   &str = "number literal is truncated";
pub const E_CHAR_TRUNCATED:     &str = "character literal is truncated";
pub const E_STRING_TRUNCATED:   &str = "string literal is truncated";
pub const E_EXTRA_CLOSE:        &str = "extra closing parenthesis";
pub const E_UNIMPLEMENTED_HEX:  &str = "hexadecimal literals not supported";
pub const E_NUMER_BASE_ERR:     &str = "digit in number exceeds specified base";
pub const E_UNSUPPORTED_ESC:    &str = "unsupported escape";
pub const E_BAD_DOT:            &str = "expected space after dot in dotted notation";
pub const E_INCOMPREHENSIBLE:   &str = "token does not lex";
pub const E_END_OF_DOCUMENT:    &str = "no additional input left in document";

/* LexError
 * 0: error string
 * 1: index into document
 * 2: document in question
 */
#[derive(Clone)]
pub struct LexError(pub &'static str, pub usize, pub Rc<str>);

impl fmt::Display for LexError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let err_snippet_start = || -> usize {
            /* backtrack from current index until we either hit 
             *   - beginning of line
             *   - 25 characters ago
             *   - the doc Start
             */
            if self.2.len() < 25 {
                0

            } else {
                let mut idx = self.1;
                while self.1 - idx > 25 {
                    idx -= 1;
                    if self.2[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(i, x)| x == '\n' && i == idx) {
                            idx += 1;
                            break;
                        }
                }

                idx
            }
        };

        let err_snippet_end = || -> usize {
            /* read through document until we either hit
             *   - end of line
             *   - 25 characters forward
             *   - the doc end
             */
            if self.2.len() - self.1 < 25 {
                self.2.len()

            } else {
                let mut idx = self.1;
                while idx - self.1 < 25 {
                    idx += 1;
                    if self.2[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(i, x)| x == '\n' && i == idx) {
                            break;
                    }
                }

                idx
            }
        };

        write!(f, "Error when lexing document here:\n\n")?;
        write!(f, "    {}\n", &self.2[err_snippet_start()..err_snippet_end()])?;
        write!(f, "Error: {}\n", self.0)
    }
}


#[repr(u8)]
#[derive(Debug, PartialEq, Clone)]
pub enum LexTokenType {
    String = 0,
    Number,
    Char,
    Symbol,
    VectorStart,
    ByteVectorStart,
    ListStart,
    CollectionEnd,
    Boolean,
    Dot,
    Comment,
    Directive,
    Quote,
    QuasiQuote,
    Unquote,
    UnquoteSpliceTemplate,
    NumTypes,
}

impl TryFrom<u8> for LexTokenType {
    type Error = &'static str;
    fn try_from(u: u8) -> Result<LexTokenType, Self::Error> {
        if u >= LexTokenType::NumTypes as u8 {
            Err("out of token type range")
        } else {
            unsafe { Ok(core::mem::transmute(u)) }
        }
    }
}


#[derive(Clone)]
pub struct LexToken {
    pub token_type: LexTokenType,
    pub start_idx: usize,
    pub end_idx: usize,
    pub source_doc: Rc<str>,
}


pub struct Lexer {
    document: Rc<str>,
    current_index: usize,
    current_token_start: usize,
    pub has_error_state: Option<LexError>,
}

impl From<Rc<str>> for Lexer {
    fn from(s: Rc<str>) -> Lexer {
       Lexer {
            document: Rc::from(s),
            current_index: 0,
            current_token_start: 0,
            has_error_state: None,
        }
    }
}

impl Iterator for Lexer {
    type Item = LexToken;

    fn next(&mut self) -> Option<Self::Item> {
        if self.has_error_state.is_some() {
            return None;
        }

        let res = self.seek_next_token();
        if let Err(ref e) = res {
            self.has_error_state = Some(e.clone());
        }

        return res.ok()
    }
}

impl Lexer {
    // I just didnt want to write and rewrite this...
    #[inline(always)]
    fn current_char(&mut self) -> char {
        self.document.as_bytes()[self.current_index] as char
    }

    #[inline(always)]
    fn peek_next_char(&mut self) -> Option<char> {
        if let Some((_, ch)) = self.document[self.current_index+1..]
                .char_indices()
                .next() {
            Some(ch)
        } else {
            None
        }
    }

    #[inline(always)]
    fn advance_char(&mut self) -> Option<()> {
        self.current_index += 1;
        if let Some((idx, _)) = self.document[self.current_index..]
                            .char_indices()
                            .next() {

            self.current_index = idx + self.current_index;
            Some(())

        } else {
            self.current_index = self.document.len();
            None
        }
    }

    #[inline(always)]
    fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
        for i in chunk.chars() {
            self.advance_char()?;
            if i != self.current_char() {
                return Some(false)
            }
        }

        Some(true)
    }

    /* TODO
     * I figured this function would be useful for supporting hexadec encoding
     * later down the line. We can use this instead of the base check in the
     * number function.
    #[inline(always)]
    fn next_chars_allowed(&mut self, len: usize, allowed: &str) -> Option<bool> {
        let mut i = len;
        while i < 0 {
            if !allowed.contains(self.current_char()) {
                return Some(false)
            }

            i -= 1;
            self.advance_char()?;
        }

        Some(true)
    }
    */

    #[inline(always)]
    fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> {
        let next_idx = self.advance_char()
            .and_then(|_| Some(self.current_index))
            .or(Some(self.document.len()))
            .unwrap();

        let l = LexToken{
            token_type: t,
            start_idx: self.current_token_start,
            end_idx: next_idx,
            source_doc: self.document.clone(),
        };

        self.current_token_start = 0;
        return Ok(l);
    }

    #[inline(always)]
    fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> {
        // TODO: support escaped quotes
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_NO_MATCHING_QUOTE,
                        self.current_token_start, self.document.clone()))
            } else if self.current_char() == '"' {
                return self.cut_new_token(LexTokenType::String)
            }
        }
    }

    #[inline(always)]
    fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> {
        let mut base = 10;
        let a = self.current_char();
        if NUMERICAL_BASE.contains(&a) {
            if let None = self.advance_char() {
                return Err(LexError(E_NUMBER_TRUNCATED,
                            self.current_token_start, self.document.clone()))
            }
            match a {
                'd' => base = 10,
                'o' => base = 8,
                'b' => base = 2,
                 _ => (),
            }
        }

        let mut hasdot = false;
        loop {
            let a = self.current_char();
            if NUMERICAL_EXTRA.contains(&a) {
                if hasdot || base < 10 {
                    return Err(LexError(E_TOO_MANY_DECIMALS, 
                                self.current_token_start, self.document.clone()))
                }
                hasdot = true;

            } else if a == ' ' || a == ')' {
                // back up one
                self.current_index -= 1;
                return self.cut_new_token(LexTokenType::Number)

            } else if !a.is_numeric() {
                return Err(LexError(E_INCOMPREHENSIBLE,
                            self.current_token_start, self.document.clone()))

            } else if a.to_digit(10).unwrap() >= base {
                return Err(LexError(E_NUMER_BASE_ERR,
                            self.current_token_start, self.document.clone()))
            }

            if let None = self.advance_char() {
                self.current_index = self.document.len() - 1;
                return self.cut_new_token(LexTokenType::Number)
            }
        }
    }

    #[inline(always)]
    fn seek_end_of_block_comment(&mut self) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_UNCLOSED_COMMENT,
                            self.current_token_start, self.document.clone()))
            }

            match self.current_char() {
                '|' if self.advance_char().and_then(|_|
                        if self.current_char() == '#' {
                            return Some(())
                        } else { return None }).is_some() => 
                            return self.cut_new_token(LexTokenType::Comment),
                 _ => continue,
            };
        }
    }

    #[inline(always)]
    fn seek_end_of_line_comment(&mut self, directive: bool) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_UNCLOSED_COMMENT,
                            self.current_token_start, self.document.clone()))
            }

            match self.current_char() {
                '\n' if !directive => return self.cut_new_token(LexTokenType::Comment),
                '\n' if  directive => return self.cut_new_token(LexTokenType::Directive),
                  _   => continue,
            };
        }
    }

    #[inline(always)]
    fn seek_closing_pipe(&mut self) -> Result<LexToken, LexError> {
        loop {
            if let None = self.advance_char() {
                return Err(LexError(E_NO_CLOSING_PIPE,
                            self.current_token_start, self.document.clone()));
            }

            let c = self.current_char();
            match self.current_char() {
                '\\' => self.seek_end_of_escape(false)?,
                '|' => return self.cut_new_token(LexTokenType::Symbol),
                 _  if c.is_alphanumeric() => continue,
                 _  if LEX_SPECIAL.contains(&c) => continue,
                 _  if c == ' ' || c == '\n' => continue,
                    // quote case caught here
                 _  => return Err(LexError(E_INCOMPREHENSIBLE,
                                    self.current_token_start, self.document.clone())),
            };
        }
    }

    #[inline(always)]
    fn seek_end_from_hash(&mut self) -> Result<LexToken, LexError> {
        let c = self.advance_char().and_then(|_| Some(self.current_char()));
        if let Some(ch) = c {
            match ch {
                't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
                '|'       => return self.seek_end_of_block_comment(),
                '!' => return self.seek_end_of_line_comment(true),
                'u' if self.match_chunk_next("8(").is_some_and(|x| x) => 
                        return self.cut_new_token(LexTokenType::ByteVectorStart),
                '(' => return self.cut_new_token(LexTokenType::VectorStart),
                '\\' => self.seek_end_of_escape(false)
                            .and_then(|_| self.cut_new_token(LexTokenType::Char)),
                'x'  => return Err(LexError(E_UNIMPLEMENTED_HEX,
                                    self.current_index, self.document.clone())),
                 _ if  NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(),
                 _ => return Err(LexError(E_INCOMPREHENSIBLE,
                                    self.current_token_start, self.document.clone())),
            }
        } else {
            Err(LexError(E_NO_END_TO_HASH, self.current_token_start, self.document.clone()))
        }
    }

    // DOES NOT RETURN A TOKEN.......
    // only the caller knows what actually needs to be returned
    #[inline(always)]
    fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> {
        //let delim = if in_string { ';' } else { ' ' };
        // Delim and the arg to this function will be useful once we support hexadecimal encoding
        if let None = self.advance_char() {
            let mut error_msg = E_CHAR_TRUNCATED;
            if in_string { error_msg = E_STRING_TRUNCATED; }
            return Err(LexError(error_msg, self.current_token_start, self.document.clone()))
        }

        match self.current_char() {
            // eat an escaped whitespace or delim
            ' ' | 'n' | 'r' | 't' | '|' | '\\' | '"' => { () },
            'x' => return Err(LexError(E_UNIMPLEMENTED_HEX,
                                self.current_token_start, self.document.clone())),
             _  if self.current_char().is_alphabetic() => { () },
             _  => return Err(LexError(E_UNSUPPORTED_ESC,
                                self.current_index, self.document.clone())),
        }

        return Ok(())
    }

    /* Called to output a token by the iterator implementation
     * I dont think this has to be inlined. The other ones are inlined to
     *     prevent the process of parsing a token from being slowed down by
     *     so many stack frames. This one is called once per token.
     */
    fn seek_next_token(&mut self) -> Result<LexToken, LexError> {
        let mut output: Option<Result<LexToken, LexError>> = None;

        if self.current_index >= self.document.len() {
            return Err(LexError(E_END_OF_DOCUMENT,
                        self.document.len(), self.document.clone()));
        }

        while LEX_WHITESPACE.contains(&self.current_char()) {
            if let None = self.advance_char() {
                return Err(LexError(E_END_OF_DOCUMENT,
                            self.document.len(), self.document.clone()));
            }
        }

        self.current_token_start = self.current_index;

        // handle syntactic sugar cases
        match self.current_char() {
            ';'  => output = Some(self.seek_end_of_line_comment(false)),
            '\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
            '`'  => output = Some(self.cut_new_token(LexTokenType::QuasiQuote)),
            '('  => output = Some(self.cut_new_token(LexTokenType::ListStart)),
            ')'  => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
            '#'  => output = Some(self.seek_end_from_hash()),
            '"'  => output = Some(self.seek_end_of_string()),
            '\\' => output = Some(self.seek_end_of_escape(false)
                                     .and_then(|_| 
                                         self.cut_new_token(LexTokenType::Char))),
            '|'  => output = Some(self.seek_closing_pipe()),
             _ if self.current_char().is_numeric() => output =
                        Some(self.seek_end_of_number()),
             _   => (),
        }

        if output.is_none() {
            if self.current_char() == '.' {
                if let Some(x) = self.peek_next_char() && x == ' ' {
                    output = Some(self.cut_new_token(LexTokenType::Dot));
                } /* else {
                    output = Some(Err(LexError(E_BAD_DOT, self.current_index)));
                } SYKE! It could be a symbol... */
            }

            if self.current_char() == ',' {
                if let Some(x) = self.peek_next_char() && x == '@'{
                    output = Some(self.cut_new_token(LexTokenType::UnquoteSpliceTemplate));
                }  else {
                    output = Some(self.cut_new_token(LexTokenType::Unquote));
                }
            }
        }

        /* Broken out into a separate case to maintain precedence of the
         *     unquote syntax and dotted notation.
         */
        if output.is_none() {
            loop {
                let c = self.current_char();
                if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) &&  c != ' ' {
                    output = Some(Err(LexError(E_INCOMPREHENSIBLE,
                                        self.current_index, self.document.clone())));
                    break;
                }

                if let Some(c) = self.peek_next_char() {
                    if c == ' ' || c == ')' {
                        output = Some(self.cut_new_token(LexTokenType::Symbol));
                        break;
                    }

                    self.advance_char().unwrap();
                } else {
                    output = Some(self.cut_new_token(LexTokenType::Symbol));
                    break;
                }
            }
        }

        if let Some(ref res) = output {
            if let Err(ref e) = res {
                self.has_error_state = Some(e.clone());
            }
        }

        return output.unwrap()
    }
}


#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_evaluations() {
        // indexed by LexTokenType
        let cases: [(Vec<&str>, Vec<&str>); LexTokenType::NumTypes as usize] = [
            /* String Cases */ (
                // HAPPY CASES
                vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"",
                     "\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"",
                     "\"\""],

                // SAD CASES
                vec!["\"sdf"]
            ),

            /* Number Cases */ (
                // HAPPY CASES
                vec!["1", "1.0", "#d1.1", "#o1423", "#b11"],

                // SAD CASES
                vec!["1.1.1", "#o9", "#b1.01", "#xADADAD"]
            ),

            /* Char Cases */ (
                // HAPPY CASES
                vec!["\\a", "\\t", "\\\"", "#\\t"],

                // SAD CASES
                vec!["\\x20"]
            ),

            /* Identifier Cases */ (
                // HAPPY CASES
                vec!["...", "+", "+soup+", "<=?", "V17a", "->string", "a34kTMNs",
                     "lambda", "q", "list->vector", "|two words|", "|two\nwords|",
                     "the-word-recursion-has-many-meanings"],

                // SAD CASES
                vec!["|\"\"|", "|(|", "|valid"]
            ),

            /* Vector Start Cases */ (
                // HAPPY CASES
                vec!["#("],

                // SAD CASES
                vec![]
            ),


            /* Byte Vector Cases */ (
                // HAPPY CASES
                vec!["#u8("],

                // SAD CASES
                vec!["#u8", "#u9", "#u("]
            ),

            /* List Start Cases */ (
                // HAPPY CASES
                vec!["("],

                // SAD CASES
                vec![]
            ),

            /* Collection End Cases */ (
                // HAPPY CASES
                vec![")"],

                // SAD CASES
                vec![]
            ),

            /* Boolean Cases */ (
                // HAPPY CASES
                vec!["#t", "#f"],

                // SAD CASES
                vec![]
            ),

            /* Dot Cases */ (
                // HAPPY CASES
                vec![" . "],

                // SAD CASES
                vec![]
            ),

            /* Comment cases */ (
                // HAPPY CASES
                vec!["; (\n", "; #\n", ";\"\n", "#| ; ( \" |#"],

                // SAD CASES
                vec!["#|", "; "]
            ),

            /* Directive cases */ (
                // HAPPY CASES
                vec!["#!test-directive\n"],

                // SAD CASES
                vec!["#!test-directive"]
            ),

            /* Quote cases */ (
                // HAPPY CASES
                vec!["'"],

                // SAD CASES
                vec![]
            ),

            /* QuasiQuote cases */ (
                // HAPPY CASES
                vec!["`"],

                // SAD CASES
                vec![]
            ),

            /* Unquote cases */ (
                // HAPPY CASES
                vec![",x", ","],

                // SAD CASES
                vec![]
            ),

            /* UnquoteSpliceTemplate cases */ (
                // HAPPY CASES
                vec![",@x", ",@(", ",@"],

                // SAD CASES
                vec![]
            ),
        ];

        let no_subtoken_check_cases = [
            LexTokenType::Dot as u8,
            LexTokenType::Unquote as u8,
            LexTokenType::UnquoteSpliceTemplate as u8
        ];

        cases.iter().enumerate().for_each(|(idx, case)| {
            println!("+ Testing {:#?} Cases...", LexTokenType::try_from(idx as u8).unwrap());

            case.0.iter()
                .for_each(|subcase| {
                    println!("  - happy case: {}", subcase);
                    let token = Lexer::from(Rc::from(*subcase))
                        .next()
                        .unwrap();
                    assert_eq!(token.token_type,
                               LexTokenType::try_from(idx as u8)
                                    .unwrap());
                    if no_subtoken_check_cases.contains(&(idx as u8)) {
                        /* DO NOTHING, ignore the dot case since its subcase is
                         *  a superset of the actual token substring
                         */
                    } else {
                        assert_eq!(&token.source_doc[token.start_idx..token.end_idx],
                                   *subcase)
                    }
                });

            case.1.iter()
                .for_each(|subcase| {
                    println!("  - sad case: {}", subcase);
                    assert!(Lexer::from(Rc::from(*subcase)).next().is_none())
                });
        });
    }

    #[test]
    fn test_multi_token_iter() {
        let mut res = vec![];
        Lexer::from(Rc::from("( one two three )"))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 5);

        assert_eq!(res[0].token_type, LexTokenType::ListStart);
        assert_eq!(res[0].start_idx, 0);
        assert_eq!(res[0].end_idx, 1);
        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");

        assert_eq!(res[1].token_type, LexTokenType::Symbol);
        assert_eq!(res[1].start_idx, 2);
        assert_eq!(res[1].end_idx, 5);
        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "one");

        assert_eq!(res[2].token_type, LexTokenType::Symbol);
        assert_eq!(res[2].start_idx, 6);
        assert_eq!(res[2].end_idx, 9);
        assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], "two");

        assert_eq!(res[3].token_type, LexTokenType::Symbol);
        assert_eq!(res[3].start_idx, 10);
        assert_eq!(res[3].end_idx, 15);
        assert_eq!(&res[3].source_doc[res[3].start_idx..res[3].end_idx], "three");

        assert_eq!(res[4].token_type, LexTokenType::CollectionEnd);
        assert_eq!(res[4].start_idx, 16);
        assert_eq!(res[4].end_idx, 17);
        assert_eq!(&res[4].source_doc[res[4].start_idx..res[4].end_idx], ")");
    }

    #[test]
    fn test_error_state_blocking() {
        let mut l = Lexer::from(Rc::from("( 1 2.2.2 valid_token"))
            .into_iter();

        assert!(l.next().is_some());
        assert!(l.next().is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
        assert!(l.next().is_none());
        assert!(l.has_error_state.is_some());
    }
}