Mycelium/mycelium/src/parser.rs
/* Mycelium Scheme
* Copyright (C) 2025 Ava Affine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use core::fmt::Display;
use core::cell::RefCell;
use crate::lexer::{
LexError,
LexToken,
LexTokenType,
Lexer,
E_CHAR_TOO_LONG,
E_END_OF_DOCUMENT
};
use organelle::{Number, Numeric};
use crate::sexpr::{Datum, Ast};
use alloc::vec::Vec;
use alloc::vec;
use alloc::rc::Rc;
use alloc::string::String;
pub const E_LEX_ERROR: &str = "error in lexing document";
pub const E_EXTRA_CLOSE: &str = "closing parenthesis closes nothing";
pub const E_TERRIBLE: &str = "something has gone terribly wrong....";
pub const E_VECTOR_DOT: &str = "dotted notation not valid in vectors";
pub const E_DOT_NO_LIST: &str = "dotted notation used outside of list";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_CHAR_HEX_PARSE: &str = "hexadecimal character literal failed to parse";
pub const E_COLLECTION_TRUNC: &str = "collection is truncated";
pub const E_BV_BADBYTE: &str = "number provided is not a real byte";
pub const E_BV_NONBYTE: &str = "bytevector elements must all be bytes";
pub const E_TOO_MANY_DOT: &str = "valid dot notation only includes one dot";
pub const E_DOT_IDX: &str = "dot should precede only the last element in a list";
pub const E_DOT_EMPTY: &str = "cannot apply dotted notation to otherwise empty list";
pub const E_UNQUOTE_NONQQ: &str = "unquote must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_NONQQ: &str = "unquote-splicing must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_COLL: &str = "expected list or vector after unquote-splicing";
/* ParseError
* 0: error string
* 1: either problematic lexing token, or a lexing error
*/
#[derive(Clone)]
pub struct ParseError(pub &'static str, pub Option<Result<LexToken, LexError>>);
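/* For example (illustrative, drawn from constructions used later in
 * this file): a truncated collection is reported as
 *   ParseError(E_COLLECTION_TRUNC, Some(Ok(offending_token)))
 * while a failure inside the lexer is wrapped as
 *   ParseError(E_LEX_ERROR, Some(Err(lex_error)))
 */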
impl Display for ParseError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let err_snippet_start = |t: &LexToken| -> usize {
            /* backtrack from the token's start until we hit one of:
             * - the beginning of the line
             * - 25 characters back
             * - the start of the document
             */
            if t.start_idx < 25 {
                0
            } else {
                let mut idx = t.start_idx;
                while t.start_idx - idx < 25 {
                    idx -= 1;
                    // stop just past a newline: char_indices on the
                    // subslice yields the char at `idx` first
                    if t.source_doc[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        idx += 1;
                        break;
                    }
                }
                idx
            }
        };
        let err_snippet_end = |t: &LexToken| -> usize {
            /* read forward through the document until we hit one of:
             * - the end of the line
             * - 25 characters forward
             * - the end of the document
             */
            if t.source_doc.len() - t.end_idx < 25 {
                t.source_doc.len()
            } else {
                let mut idx = t.end_idx;
                while idx - t.end_idx < 25 {
                    idx += 1;
                    if t.source_doc[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        break;
                    }
                }
                idx
            }
        };
        if let Some(frag) = &self.1 {
            match frag {
                Ok(token) => {
                    write!(f, "Error parsing syntax: {}\n", self.0)?;
                    write!(f, "  problematic token: {}\n",
                        &token.source_doc[token.start_idx..token.end_idx])?;
                    write!(f, "  {}\n",
                        &token.source_doc[err_snippet_start(token)..err_snippet_end(token)])?;
                    // the report is complete; don't fall through and
                    // print the header a second time
                    return Ok(());
                },
                Err(e) => {
                    return e.fmt(f);
                }
            }
        }
write!(f, "Error parsing syntax: {}\n", self.0)
}
}
pub struct Parser {
lexer: Lexer,
pub has_error_state: Option<ParseError>,
delayed: Vec<Rc<Datum>>,
quasiquoted: bool,
}
/* The From and Iterator traits serve as the primary
 * interface to the parser. The expected flow is to make
 * a Lexer first, convert it into a Parser with From/Into,
 * and then iterate to produce the final AST, which we can
 * convert into a VM image once the compile step is
 * finished.
 */
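/* A minimal usage sketch (mirroring the test module below, where a
 * Lexer is built from an Rc<str>):
 *
 *     let mut parser = Parser::from(Lexer::from(Rc::from("(+ 1 2)")));
 *     while let Some(datum) = parser.next() {
 *         // each `datum` is an Rc<Datum> holding one top-level form
 *     }
 *     if let Some(e) = parser.has_error_state {
 *         // iteration stopped early on a lex or parse error
 *     }
 */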
impl From<Lexer> for Parser {
fn from(l: Lexer) -> Parser {
Parser {
lexer: l,
has_error_state: None,
delayed: vec![],
quasiquoted: false
}
}
}
impl Iterator for Parser {
type Item = Rc<Datum>;
fn next(&mut self) -> Option<Self::Item> {
if self.has_error_state.is_some() {
return None;
}
        if !self.delayed.is_empty() {
return self.delayed.pop()
}
let res = self.get_next_datum();
if let Err(ref e) = res {
self.has_error_state = Some(e.clone());
}
return res.ok()
}
}
fn read_number(token: LexToken) -> Result<Number, ParseError> {
    match token.source_doc[token.start_idx..token.end_idx].parse::<Number>() {
Ok(num) => Ok(num),
Err(e) => Err(ParseError(e, Some(Ok(token)))),
}
}
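/* read_char parses the text of a character token: a single character
 * ("#\a"), a named literal ("#\space" -> 32), or a hex escape
 * ("#\x41" -> 0x41). The examples are illustrative; the named
 * literals are exactly those matched in the table below.
 */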
fn read_char(token: LexToken) -> Result<u8, ParseError> {
if token.end_idx - token.start_idx < 3 {
return Err(ParseError(E_CHAR_TRUNCATED, Some(Ok(token))))
}
match &token.source_doc[token.start_idx + 2..token.end_idx] {
"alarm" => Ok(7),
"backspace" => Ok(8),
"delete" => Ok(127),
"escape" => Ok(33),
"newline" => Ok('\n' as u8),
"null" => Ok(0),
"return" => Ok(13),
"space" => Ok(32),
"tab" => Ok(11),
_ if token.source_doc[token.start_idx + 2..].starts_with('x') &&
token.end_idx - token.start_idx > 3 => {
if token.end_idx - token.start_idx > 5 {
return Err(ParseError(E_CHAR_TOO_LONG, Some(Ok(token))))
}
match u8::from_str_radix(
&token.source_doc[token.start_idx + 3..token.end_idx],
16) {
Ok(u) => Ok(u),
Err(_) => Err(ParseError(E_CHAR_HEX_PARSE, Some(Ok(token))))
}
},
_ => Ok(token.source_doc.as_bytes()[token.start_idx + 2])
}
}
fn read_bool(token: LexToken) -> bool {
match &token.source_doc[token.start_idx..token.end_idx] {
"#t" => true,
"#f" => false,
_ => panic!("impossible boolean")
}
}
fn read_string(token: LexToken) -> Vec<u8> {
if token.end_idx - token.start_idx < 3 {
// empty string other than delimiters
Vec::default()
} else {
token.source_doc[token.start_idx + 1..token.end_idx - 1]
.as_bytes()
.to_vec()
}
}
impl Parser {
/* Rules we must mind:
* 0. at this stage, drop and ignore comments, directives
* 1. quote, quasiquote, unquote, and unquote splicing
* all require another input after them (excluding
* collection end)
     * 2. unquote-splicing requires a spliceable form: a list,
     *    vector, or symbol (enforced in complete_unquote_splicing
     *    below).
* 3. vectors, lists, may have nested collections in them
* so track collection state in the parser's stack.
* 4. list dotted notation needs next datum put in cdr.
* 5. bytevectors can only have numbers from 0-255 in them.
*/
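    /* A sketch of the internal shape rule 4 produces (illustrative,
     * not verbatim Display output):
     *   (a b . c)  =>  List(Ast(a, List(Ast(b, c))))
     *   (a b)      =>  List(Ast(a, List(Ast(b, None))))
     */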
fn complete_quote(&mut self) -> Result<Rc<Datum>, ParseError> {
let next = self.get_next_datum()?;
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("quote"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_unquote_splicing(&mut self, tok: LexToken) -> Result<Rc<Datum>, ParseError> {
let next = self.get_next_datum()?;
match *next {
Datum::List(_) | Datum::Vector(_) | Datum::Symbol(_) => (),
_ => return Err(ParseError(E_UNQUOTE_SPL_COLL, Some(Ok(tok))))
}
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("unquote-splicing"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_unquote(&mut self) -> Result<Rc<Datum>, ParseError> {
let next = self.get_next_datum()?;
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("unquote"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_quasiquote(&mut self) -> Result<Rc<Datum>, ParseError> {
let prev = self.quasiquoted; // handle nesting appropriately
self.quasiquoted = true;
let next = self.get_next_datum()?;
self.quasiquoted = prev;
Ok(Rc::from(Datum::List(Rc::from(Ast(
Rc::from(Datum::Symbol(String::from("quasiquote"))),
Rc::from(Datum::List(Rc::from(Ast(
next,
Rc::from(Datum::None)
))))
)))))
}
fn complete_collection(&mut self, token: LexToken) -> Result<Rc<Datum>, ParseError> {
        let is_bv = matches!(token.token_type, LexTokenType::ByteVectorStart);
let mut lex_stack = vec![];
let mut bv_stack = vec![];
/* counting indexes helps greatly with calculating position dependent
* syntax rules like dot notation in lists
*/
let mut iter_count = 0;
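        /* dot_idx tracks dotted notation:
         *   .0 = iteration index at which the dot appeared
         *   .1 = the dot's token, kept for error reporting
         *   .2 = the datum captured immediately after the dot (the cdr)
         */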
let mut dot_idx = (None, None, None);
loop {
            let Some(tok) = self.lexer.next() else {
                if let Some(e) = &self.lexer.has_error_state {
                    return Err(ParseError(E_LEX_ERROR, Some(Err(e.clone()))));
                }
                return Err(ParseError(E_COLLECTION_TRUNC, Some(Ok(token))));
            };
match tok.token_type {
// Universal cases
LexTokenType::Comment | LexTokenType::Directive => continue,
LexTokenType::NumTypes =>
return Err(ParseError(E_TERRIBLE, Some(Ok(tok)))),
                LexTokenType::Unquote if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(tok)))),
                LexTokenType::UnquoteSplice if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(tok)))),
// CollectionEnd must take precedence over the dot notation case
LexTokenType::CollectionEnd => break,
                // only one datum may follow the dot before the collection ends
                _ if let Some(idx) = dot_idx.0 && iter_count - idx > 1 =>
                    return Err(ParseError(E_DOT_IDX, Some(Ok(dot_idx.1.unwrap())))),
LexTokenType::Dot if token.token_type != LexTokenType::ListStart =>
return Err(ParseError(E_VECTOR_DOT, Some(Ok(tok)))),
// List, Vector cases
LexTokenType::ListStart | LexTokenType::VectorStart |
LexTokenType::ByteVectorStart if !is_bv =>
lex_stack.push(self.complete_collection(tok)?),
LexTokenType::String if !is_bv =>
lex_stack.push(Rc::from(Datum::String(read_string(tok)))),
LexTokenType::Number if !is_bv =>
lex_stack.push(Rc::from(Datum::Number(read_number(tok)?))),
LexTokenType::Char if !is_bv =>
lex_stack.push(Rc::from(Datum::Char(read_char(tok)?))),
LexTokenType::Boolean if !is_bv =>
lex_stack.push(Rc::from(Datum::Bool(read_bool(tok)))),
LexTokenType::Symbol if !is_bv =>
lex_stack.push(Rc::from(Datum::Symbol(
String::from(&tok.source_doc[tok.start_idx..tok.end_idx])))),
LexTokenType::Quote if !is_bv =>
lex_stack.push(self.complete_quote()?),
LexTokenType::QuasiQuote if !is_bv =>
lex_stack.push(self.complete_quasiquote()?),
LexTokenType::Unquote if !is_bv && self.quasiquoted =>
lex_stack.push(self.complete_unquote()?),
LexTokenType::UnquoteSplice if !is_bv && self.quasiquoted =>
lex_stack.push(self.complete_unquote_splicing(tok)?),
// List only cases
                LexTokenType::Dot => if dot_idx.0.is_some() {
                    return Err(ParseError(E_TOO_MANY_DOT, Some(Ok(tok))))
                } else {
                    dot_idx = (Some(iter_count), Some(tok), None)
                },
// ByteVector cases
LexTokenType::Number if is_bv => {
let n = read_number(tok.clone())?
.make_inexact();
if n.0 < 0.0 || n.0 > 255.0 || n.0.fract() != 0.0 {
return Err(ParseError(E_BV_BADBYTE, Some(Ok(tok))))
}
bv_stack.push(n.0 as u8);
},
_ if is_bv => return Err(ParseError(E_BV_NONBYTE, Some(Ok(tok)))),
                // every token type is handled above; this arm is unreachable
                _ => unreachable!("impossible case in parser::complete_collection"),
}
if let Some(idx) = dot_idx.0 && iter_count == idx + 1 {
dot_idx.2 = Some(lex_stack.pop());
}
iter_count += 1;
}
if is_bv {
return Ok(Rc::from(Datum::ByteVector(RefCell::from(bv_stack))))
}
if token.token_type == LexTokenType::VectorStart {
return Ok(Rc::from(Datum::Vector(RefCell::from(lex_stack))))
}
// handle an empty list
        if lex_stack.is_empty() {
// dont try to do something like "( . 'thing)"
if let (_, Some(node), _) = dot_idx {
return Err(ParseError(E_DOT_EMPTY, Some(Ok(node))))
}
return Ok(Rc::from(Datum::List(Rc::from(Ast(Rc::from(Datum::None),
Rc::from(Datum::None))))))
}
        let mut from_rear = if let (_, _, Some(node)) = dot_idx {
            Rc::from(Ast(lex_stack.pop().unwrap(), node.unwrap()))
        } else {
            Rc::from(Ast(lex_stack.pop().unwrap(), Rc::from(Datum::None)))
        };
lex_stack.iter()
.rev()
.for_each(|x| {
from_rear = Rc::from(Ast(x.clone(), Rc::from(Datum::List(from_rear.clone()))));
});
Ok(Rc::from(Datum::List(from_rear)))
}
fn get_next_datum(&mut self) -> Result<Rc<Datum>, ParseError> {
if let Some(token) = self.lexer.next() {
match token.token_type {
// normal paths:
LexTokenType::String => Ok(Rc::from(Datum::String(read_string(token)))),
LexTokenType::Number => Ok(Rc::from(Datum::Number(read_number(token)?))),
LexTokenType::Char => Ok(Rc::from(Datum::Char(read_char(token)?))),
LexTokenType::Symbol => Ok(Rc::from(Datum::Symbol(String::from(
&token.source_doc[token.start_idx..token.end_idx])))),
LexTokenType::Boolean => Ok(Rc::from(Datum::Bool(read_bool(token)))),
LexTokenType::VectorStart | LexTokenType::ListStart |
LexTokenType::ByteVectorStart => self.complete_collection(token),
LexTokenType::Quote => self.complete_quote(),
LexTokenType::QuasiQuote => self.complete_quasiquote(),
LexTokenType::Unquote if self.quasiquoted => self.complete_unquote(),
LexTokenType::UnquoteSplice if self.quasiquoted =>
self.complete_unquote_splicing(token),
// immediate errors:
LexTokenType::CollectionEnd => Err(ParseError(E_EXTRA_CLOSE, Some(Ok(token)))),
LexTokenType::NumTypes => Err(ParseError(E_TERRIBLE, Some(Ok(token)))),
LexTokenType::Dot => Err(ParseError(E_DOT_NO_LIST, Some(Ok(token)))),
LexTokenType::Unquote if !self.quasiquoted =>
Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(token)))),
LexTokenType::UnquoteSplice if !self.quasiquoted =>
Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(token)))),
// ignore comment, directive:
_ => self.get_next_datum(),
}
// Lexer error
        } else if let Some(e) = &self.lexer.has_error_state && e.0 != E_END_OF_DOCUMENT {
            Err(ParseError(E_LEX_ERROR, Some(Err(e.clone()))))
// End of document
} else {
Err(ParseError(E_END_OF_DOCUMENT, None))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_cases() {
let happy_cases = vec![
// case, result
("\"test\"", "\"test\""),
("test", "test"),
("(1 2 3)", "(1 2 3)"),
("'test", "(quote test)"),
("`test", "(quasiquote test)"),
("`(,one)", "(quasiquote ((unquote one)))"),
("`(test ,@(two))", "(quasiquote (test (unquote-splicing (two))))"),
("#u8(0 14 249)", "#u8(0 14 249)"),
("(nested lists (are pretty cool))", "(nested lists (are pretty cool))"),
("((nested) lists (are (pretty) cool))", "((nested) lists (are (pretty) cool))"),
("(dotted . notation)", "(dotted . notation)"),
("(longer dotted . notation)", "(longer dotted . notation)"),
("(hello \"world\")", "(hello \"world\")"),
("; big doc string\n(one two)", "(one two)"),
("(list #(vect 2 3 #u8(0 0)))", "(list #(vect 2 3 #u8(0 0)))"),
("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)"),
("(- q 1)", "(- q 1)"),
("(+ q 1)", "(+ q 1)"),
("(#\\x)", "(#\\x)"),
];
let sad_cases = vec![
"(",
"( one two ",
"( one two three ( four )",
")",
"#(st",
"#u8(0 ",
"#u8(256)",
"#u8(two)",
"(one two ,three)",
"(one two ,@three)",
"`(one two ,@4.0)",
"(. two)",
"(one . two . three)",
];
println!("+ Testing Happy Cases...");
happy_cases.iter()
.for_each(|(case, result)| {
println!(" - case: {}", *case);
let mut p = Parser::from(Lexer::from(Rc::from(*case)));
let res = p.next();
                if res.is_none() {
println!("{}", p.has_error_state.unwrap());
}
assert_eq!(
format!("{}", res.unwrap()),
format!("{}", result)
);
});
println!("+ Testing Sad Cases...");
sad_cases.iter()
.for_each(|case| {
println!(" - case: {}", *case);
let mut p = Parser::from(Lexer::from(Rc::from(*case)));
assert!(p.next().is_none() && p.has_error_state.is_some())
});
}
}