flesh/src/lex.rs

/* relish: highly versatile lisp interpreter
 * Copyright (C) 2021 Aidan Hahn
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

use crate::segment::{Ctr, Seg};

const UNMATCHED_STR_DELIM: &str = "Unmatched string delimiter in input";
const UNMATCHED_LIST_DELIM: &str = "Unmatched list delimiter in input";

/* Entry point of the lexer: turns user input into an unsimplified
 * tree of tokens.
 *
 * Non-ascii input is rejected up front. The input is then wrapped so
 * that it begins with '(' and ends with ')' (each paren is only added
 * when it is missing) before being handed to process().
 *
 * NOTE: when a ')' has to be added and the last line of the input holds
 * a '#' comment, the added ')' ends up inside that comment and the
 * result is an unmatched list delimiter error.
 *
 * Errors from process() are returned prefixed with
 * "Problem lexing document: " and rendered with Debug formatting,
 * so the inner message appears quoted.
 */
pub fn lex(document: &String) -> Result<Box<Seg>, String> {
    if !document.is_ascii() {
        return Err("document may only contain ascii characters".to_string());
    }

    // work out which of the outer parens the caller left off
    let opener = if document.starts_with('(') { "" } else { "(" };
    let closer = if document.ends_with(')') { "" } else { ")" };
    let wrapped = format!("{}{}{}", opener, document, closer);

    // TODO: Make multiple forms of Ok()
    // To represent the multiple passable outcomes
    process(&wrapped).map_err(|e| format!("Problem lexing document: {:?}", e))
}

/* The logic used in lex
 * Returns Ok(Rc<Seg>) if lexing passes
 * Returns Err(String) if an error occurs
 */
fn process(document: &String) -> Result<Box<Seg>, String> {
    let doc_len = document.len();

    if doc_len == 0 {
        return Err("Empty document".to_string());
    }

    /* State variables
     * TODO: describe all of them
     */
    let mut is_str = false;
    let mut ign = false;
    let mut token = String::new();
    let mut delim_stack = Vec::new();
    let mut ref_stack = vec![];

    /* Iterate over document
     * Manage currently sought delimiter
     */
    for c in document.chars() {
        let mut needs_alloc = false;
        let mut alloc_list = false;
        let delim: char;
        if let Some(d) = delim_stack.last() {
            delim = *d;

            if delim == '*' {
                token.push(c);
                delim_stack.pop();
                continue;

            // normal delimiter cases
            } else if c == delim {
                needs_alloc = true;
                // reset comment line status
                if delim == '\n' {
                    delim_stack.pop();
                    ign = false;
                    continue;
                }

                // catch too many list end
                // set alloc_list
                if delim == ')' {
                    alloc_list = true;
                    if ref_stack.is_empty() {
                        return Err("too many end parens".to_string());
                    }
                }
                delim_stack.pop();

            // if we are in a commented out space, skip this char
            } else if ign {
                continue;
            }
        }
        // try to generalize all whitespace
        if !needs_alloc && char::is_whitespace(c) && !is_str {
            // dont make empty tokens just because the document has consecutive whitespace
            if token.is_empty() {
                continue;
            }
            needs_alloc = true;
        }
        // match a delimiter
        if !needs_alloc {
            match c {
                // add a new Seg reference to the stack
                '(' => {
                    if is_str {
                        token.push(c);
                        continue;
                    }

                    if !token.is_empty() {
                        return Err("list started in middle of another token".to_string());
                    }

                    ref_stack.push(Seg::new());
                    delim_stack.push(')');
                }
                // begin parsing a string
                '"' | '\'' | '`' => {
                    is_str = true;
                    delim_stack.push(c);
                }
                // eat the whole line
                '#' => {
                    ign = true;
                    delim_stack.push('\n');
                }
                // escape next char
                '\\' => {
                    delim_stack.push('*');
                }
                // add to token
                _ => {
                    token.push(c);
                }
            }

        /* 1. Handle allocation of new Ctr
         * 2. Handle expansion of current list ref
         */
        } else {
            if token.is_empty() && !is_str && !alloc_list {
                return Err("Empty token".to_string());
            }

            let mut current_seg = ref_stack.pop().unwrap();
            let obj;
            if is_str {
                obj = Box::from(Ctr::String(token));
                is_str = false;
                token = String::new();
                current_seg.append(obj);
            } else if !token.is_empty() {
                if token == "true" {
                    obj = Box::from(Ctr::Bool(true));
                } else if token == "false" {
                    obj = Box::from(Ctr::Bool(false));
                } else if let Ok(i) = token.parse::<i128>() {
                    obj = Box::from(Ctr::Integer(i));
                } else if let Ok(f) = token.parse::<f64>() {
                    obj = Box::from(Ctr::Float(f));
                } else if let Some(s) = tok_is_symbol(&token) {
                    obj = Box::from(Ctr::Symbol(s));
                } else {
                    return Err(format!("Unparsable token: {}", token));
                }

                token = String::new();
                current_seg.append(obj.clone());
            }

            if alloc_list {
                // return if we have finished the document
                if ref_stack.is_empty() {
                    return Ok(Box::new(current_seg));
                }

                let t = current_seg;
                current_seg = ref_stack.pop().unwrap();
                current_seg.append(Box::from(Ctr::Seg(t)));
            }

            ref_stack.push(current_seg);
        }
    }

    if is_str {
        Err(UNMATCHED_STR_DELIM.to_string())
    } else {
        Err(UNMATCHED_LIST_DELIM.to_string())
    }
}

/* Checks whether token can be used as a symbol name.
 *
 * Returns Some(owned copy of token) when every char is alphanumeric,
 * a dash or an underscore (an empty token passes trivially).
 *
 * else returns None
 */
fn tok_is_symbol(token: &str) -> Option<String> {
    let is_symbol_char = |ch: char| ch.is_alphanumeric() || ch == '-' || ch == '_';

    if token.chars().all(is_symbol_char) {
        Some(token.to_owned())
    } else {
        None
    }
}
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`/* relish: highly versatile lisp interpreter`
			`* Copyright (C) 2021 Aidan Hahn`
			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, either version 3 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`use crate::segment::{Ctr, Seg};`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00
			`const UNMATCHED_STR_DELIM: &str = "Unmatched string delimiter in input";`
			`const UNMATCHED_LIST_DELIM: &str = "Unmatched list delimiter in input";`

			`/* takes a line of user input`
			`* returns an unsimplified tree of tokens.`
			`*/`
significant refactor and simplification 2023-02-17 21:00:07 -08:00			`pub fn lex(document: &String) -> Result<Box<Seg>, String> {`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`if !document.is_ascii() {`
			`return Err("document may only contain ascii characters".to_string());`
			`}`

fix the many-body-script and lex-singlet problems Signed-off-by: Ava Hahn <ava@aidanis.online> 2023-03-01 12:20:43 -08:00			`let mut document_normal = document.clone();`
			`if !document_normal.ends_with(')') {`
			`document_normal = document_normal + ")";`
			`}`
			`if !document_normal.starts_with('(') {`
			`document_normal = "(".to_string() + &document_normal;`
			`}`

			`let tree = process(&document_normal);`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00
			`// TODO: Make multiple forms of Ok()`
			`// To represent the multiple passable outcomes`
			`return match tree {`
			`Err(e) => Err(format!("Problem lexing document: {:?}", e)),`
refmt 2022-01-16 22:02:40 -08:00			`Ok(t) => Ok(t),`
			`};`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`

			`/* The logic used in lex`
Big referencing refactor - RC+RefCell pattern used... everywhere - Ast type implemented - unit tests for func_call - more changes, but this commit scope has grown significantly and I cannot list them all 2021-03-14 16:14:57 -07:00			`* Returns Ok(Rc<Seg>) if lexing passes`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`* Returns Err(String) if an error occurs`
			`*/`
significant refactor and simplification 2023-02-17 21:00:07 -08:00			`fn process(document: &String) -> Result<Box<Seg>, String> {`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`let doc_len = document.len();`

			`if doc_len == 0 {`
			`return Err("Empty document".to_string());`
			`}`

			`/* State variables`
			`* TODO: describe all of them`
			`*/`
			`let mut is_str = false;`
			`let mut ign = false;`
			`let mut token = String::new();`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`let mut delim_stack = Vec::new();`
			`let mut ref_stack = vec![];`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00
			`/* Iterate over document`
			`* Manage currently sought delimiter`
			`*/`
			`for c in document.chars() {`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`let mut needs_alloc = false;`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`let mut alloc_list = false;`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`let delim: char;`
			`if let Some(d) = delim_stack.last() {`
			`delim = *d;`

			`if delim == '*' {`
			`token.push(c);`
			`delim_stack.pop();`
			`continue;`

			`// normal delimiter cases`
			`} else if c == delim {`
			`needs_alloc = true;`
			`// reset comment line status`
			`if delim == '\n' {`
- fixed lexing of inline and postline comments 2021-01-25 20:55:16 -08:00			`delim_stack.pop();`
add more unit tests for lexing 2021-01-24 22:32:09 -08:00			`ign = false;`
			`continue;`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`}`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`// catch too many list end`
			`// set alloc_list`
			`if delim == ')' {`
			`alloc_list = true;`
significant refactor and simplification 2023-02-17 21:00:07 -08:00			`if ref_stack.is_empty() {`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`return Err("too many end parens".to_string());`
			`}`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`delim_stack.pop();`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`// if we are in a commented out space, skip this char`
			`} else if ign {`
			`continue;`
			`}`
			`}`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`// try to generalize all whitespace`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`if !needs_alloc && char::is_whitespace(c) && !is_str {`
			`// dont make empty tokens just because the document has consecutive whitespace`
significant refactor and simplification 2023-02-17 21:00:07 -08:00			`if token.is_empty() {`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`continue;`
			`}`
			`needs_alloc = true;`
			`}`
			`// match a delimiter`
			`if !needs_alloc {`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`match c {`
Big referencing refactor - RC+RefCell pattern used... everywhere - Ast type implemented - unit tests for func_call - more changes, but this commit scope has grown significantly and I cannot list them all 2021-03-14 16:14:57 -07:00			`// add a new Seg reference to the stack`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`'(' => {`
new fixes for lexing process, tests to go with them 2021-09-18 16:48:24 -07:00			`if is_str {`
			`token.push(c);`
refmt 2022-01-16 22:02:40 -08:00			`continue;`
new fixes for lexing process, tests to go with them 2021-09-18 16:48:24 -07:00			`}`

significant refactor and simplification 2023-02-17 21:00:07 -08:00			`if !token.is_empty() {`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`return Err("list started in middle of another token".to_string());`
			`}`

WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`ref_stack.push(Seg::new());`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`delim_stack.push(')');`
refmt 2022-01-16 22:02:40 -08:00			`}`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`// begin parsing a string`
			'"' \| '\'' \| '`' => {
			`is_str = true;`
			`delim_stack.push(c);`
refmt 2022-01-16 22:02:40 -08:00			`}`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`// eat the whole line`
			`'#' => {`
			`ign = true;`
			`delim_stack.push('\n');`
refmt 2022-01-16 22:02:40 -08:00			`}`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`// escape next char`
			`'\\' => {`
			`delim_stack.push('*');`
			`}`
			`// add to token`
			`_ => {`
new fixes for lexing process, tests to go with them 2021-09-18 16:48:24 -07:00			`token.push(c);`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`
			`}`

			`/* 1. Handle allocation of new Ctr`
			`* 2. Handle expansion of current list ref`
			`*/`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`} else {`
significant refactor and simplification 2023-02-17 21:00:07 -08:00			`if token.is_empty() && !is_str && !alloc_list {`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`return Err("Empty token".to_string());`
			`}`

fix the many-body-script and lex-singlet problems Signed-off-by: Ava Hahn <ava@aidanis.online> 2023-03-01 12:20:43 -08:00			`let mut current_seg = ref_stack.pop().unwrap();`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`let obj;`
new fixes for lexing process, tests to go with them 2021-09-18 16:48:24 -07:00			`if is_str {`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`obj = Box::from(Ctr::String(token));`
new fixes for lexing process, tests to go with them 2021-09-18 16:48:24 -07:00			`is_str = false;`
			`token = String::new();`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`current_seg.append(obj);`
significant refactor and simplification 2023-02-17 21:00:07 -08:00			`} else if !token.is_empty() {`
new fixes for lexing process, tests to go with them 2021-09-18 16:48:24 -07:00			`if token == "true" {`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`obj = Box::from(Ctr::Bool(true));`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`} else if token == "false" {`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`obj = Box::from(Ctr::Bool(false));`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`} else if let Ok(i) = token.parse::<i128>() {`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`obj = Box::from(Ctr::Integer(i));`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`} else if let Ok(f) = token.parse::<f64>() {`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`obj = Box::from(Ctr::Float(f));`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`} else if let Some(s) = tok_is_symbol(&token) {`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`obj = Box::from(Ctr::Symbol(s));`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`} else {`
more complex tests 2021-07-19 23:59:03 -07:00			`return Err(format!("Unparsable token: {}", token));`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`

- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`token = String::new();`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`current_seg.append(obj.clone());`
- added more unit tests for lexer - corrected defects revealed by added tests 2021-01-24 22:04:26 -08:00			`}`

			`if alloc_list {`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`// return if we have finished the document`
significant refactor and simplification 2023-02-17 21:00:07 -08:00			`if ref_stack.is_empty() {`
WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`return Ok(Box::new(current_seg));`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`

WIP commit: * Fix up project structures * combine vars and funcs table * make a place for old code that may be useful to reference * singleton pattern for sym table Commentary: When this change is finally finished I promise to use feature branches from here on out 2023-02-15 23:27:00 -08:00			`let t = current_seg;`
			`current_seg = ref_stack.pop().unwrap();`
			`current_seg.append(Box::from(Ctr::Seg(t)));`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`

big temp status Signed-off-by: Ava Hahn <ava@aidanis.online> 2023-01-27 17:45:19 -08:00			`ref_stack.push(current_seg);`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`
			`}`

			`if is_str {`
significant refactor and simplification 2023-02-17 21:00:07 -08:00			`Err(UNMATCHED_STR_DELIM.to_string())`
repl now complete Signed-off-by: Ava Hahn <ava@aidanis.online> 2023-03-01 11:14:42 -08:00			`} else {`
			`Err(UNMATCHED_LIST_DELIM.to_string())`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`
			`}`

			`/* Returns true if token`
more complex tests 2021-07-19 23:59:03 -07:00			`* - is all alphanumeric except dash and underscore`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`*`
			`* else returns false`
			`*/`
significant refactor and simplification 2023-02-17 21:00:07 -08:00			`fn tok_is_symbol(token: &str) -> Option<String> {`
			`for t in token.chars() {`
			`if !t.is_alphanumeric() && t != '-' && t != '_' {`
refmt 2022-01-16 22:02:40 -08:00			`return None;`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`
			`}`

significant refactor and simplification 2023-02-17 21:00:07 -08:00			`Some(String::from(token))`
- syntax tree datatypes - prototype lex function - a lex unit test - gitignore - library structure - license 2021-01-24 12:34:58 -08:00			`}`