flesh/src/lex.rs

/* relish: highly versatile lisp interpreter
 * Copyright (C) 2021 Aidan Hahn
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

use crate::segment::{Ctr, Seg};

const UNMATCHED_STR_DELIM: &str = "Unmatched string delimiter in input";
const UNMATCHED_LIST_DELIM: &str = "Unmatched list delimiter in input";

/* takes a line of user input
 * returns an unsimplified tree of tokens.
 */
pub fn lex(document: &String) -> Result<Box<Seg>, String> {
    if !document.is_ascii() {
        return Err("document may only contain ascii characters".to_string());
    }

    // finish a singlet token, or do nothing
    let document_normal = document.clone() + " ";
    let tree = process(&document_normal);

    // TODO: Make multiple forms of Ok()
    // To represent the multiple passable outcomes
    return match tree {
        Err(e) => Err(format!("Problem lexing document: {:?}", e)),
        Ok(t) => Ok(t),
    };
}

/* The logic used in lex
 * Returns Ok(Rc<Seg>) if lexing passes
 * Returns Err(String) if an error occurs
 */
fn process(document: &String) -> Result<Box<Seg>, String> {
    let doc_len = document.len();

    if doc_len == 0 {
        return Err("Empty document".to_string());
    }

    /* State variables
     * TODO: describe all of them
     */
    let mut is_str = false;
    let mut ign = false;
    let mut token = String::new();
    let mut delim_stack = Vec::new();
    let mut ref_stack = vec![];

    /* Iterate over document
     * Manage currently sought delimiter
     */
    for c in document.chars() {
        let mut needs_alloc = false;
        let mut alloc_list = false;
        let delim: char;
        if let Some(d) = delim_stack.last() {
            delim = *d;

            if delim == '*' {
                token.push(c);
                delim_stack.pop();
                continue;

            // normal delimiter cases
            } else if c == delim {
                needs_alloc = true;
                // reset comment line status
                if delim == '\n' {
                    delim_stack.pop();
                    ign = false;
                    continue;
                }

                // catch too many list end
                // set alloc_list
                if delim == ')' {
                    alloc_list = true;
                    if ref_stack.is_empty() {
                        return Err("too many end parens".to_string());
                    }
                }
                delim_stack.pop();

            // if we are in a commented out space, skip this char
            } else if ign {
                continue;
            }
        }
        // try to generalize all whitespace
        if !needs_alloc && char::is_whitespace(c) && !is_str {
            // dont make empty tokens just because the document has consecutive whitespace
            if token.is_empty() {
                continue;
            }
            needs_alloc = true;
        }
        // match a delimiter
        if !needs_alloc {
            match c {
                // add a new Seg reference to the stack
                '(' => {
                    if is_str {
                        token.push(c);
                        continue;
                    }

                    if !token.is_empty() {
                        return Err("list started in middle of another token".to_string());
                    }

                    ref_stack.push(Seg::new());
                    delim_stack.push(')');
                }
                // begin parsing a string
                '"' | '\'' | '`' if !is_str => {
                    is_str = true;
                    delim_stack.push(c);
                }
                // eat the whole line
                '#' | ';'  => {
                    ign = true;
                    delim_stack.push('\n');
                }
                // escape next char
                '\\' => {
                    delim_stack.push('*');
                }
                // add to token
                _ => {
                    token.push(c);
                }
            }

        /* 1. Handle allocation of new Ctr
         * 2. Handle expansion of current list ref
         */
        } else {
            if token.is_empty() && !is_str && !alloc_list {
                return Err("Empty token".to_string());
            }

            let mut return_singlet = false;
            let mut current_seg = ref_stack.pop().unwrap_or_else(|| {
                return_singlet = true;
                Seg::new()
            });
            let obj;
            if is_str {
                obj = Box::from(Ctr::String(token));
                is_str = false;
                token = String::new();
                current_seg.append(obj);
            } else if !token.is_empty() {
                if token == "true" {
                    obj = Box::from(Ctr::Bool(true));
                } else if token == "false" {
                    obj = Box::from(Ctr::Bool(false));
                } else if let Ok(i) = token.parse::<i128>() {
                    obj = Box::from(Ctr::Integer(i));
                } else if let Ok(f) = token.parse::<f64>() {
                    obj = Box::from(Ctr::Float(f));
                } else if let Some(s) = tok_is_symbol(&token) {
                    obj = Box::from(Ctr::Symbol(s));
                } else {
                    return Err(format!("Unparsable token: {}", token));
                }

                token = String::new();
                current_seg.append(obj.clone());
            }

            if alloc_list || return_singlet {
                // return if we have finished the document
                if ref_stack.is_empty() {
                    return Ok(Box::new(current_seg));
                }

                let t = current_seg;
                current_seg = ref_stack.pop().unwrap();
                current_seg.append(Box::from(Ctr::Seg(t)));
            }

            ref_stack.push(current_seg);
        }
    }

    if is_str {
        Err(UNMATCHED_STR_DELIM.to_string())
    } else {
        Err(UNMATCHED_LIST_DELIM.to_string())
    }
}

/* Returns true if token
 *   - is all alphanumeric except dash, question, and underscore
 *   - equals is also allowed but only for shell command compatibility
 * else returns false
 */
fn tok_is_symbol(token: &str) -> Option<String> {
    for t in token.chars() {
        if !t.is_alphanumeric() &&
            t != '-' &&
            t != '_' &&
            t != '?' &&
            t != '=' &&
            t != '.' &&
            t != '/'
        {
            return None;
        }
    }

    Some(String::from(token))
}