- syntax tree datatypes

- prototype lex function
- a lex unit test
- gitignore
- library structure
- license
This commit is contained in:
Aidan 2021-01-24 12:34:58 -08:00
commit e4f2fbaa70
No known key found for this signature in database
GPG key ID: 327711E983899316
8 changed files with 995 additions and 0 deletions

219
src/lex.rs Normal file
View file

@ -0,0 +1,219 @@
/* relish: highly versatile lisp interpreter
* Copyright (C) 2021 Aidan Hahn
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
use std::boxed::Box;
use crate::cell::{Ctr, append, Cell};
// Error text returned by process when the document ends while a string is still open (is_str set)
const UNMATCHED_STR_DELIM: &str = "Unmatched string delimiter in input";
// Error text returned by process when the document ends without a finished list having been returned
const UNMATCHED_LIST_DELIM: &str = "Unmatched list delimiter in input";
/* Entry point of the lexer.
 *
 * Consumes one line of user input and hands it to process, yielding
 * an unsimplified tree of tokens on success.
 *
 * Input containing any non-ASCII character is rejected up front:
 * lex and process ONLY SUPPORT ASCII CHARACTERS. With Unicode (or any
 * encoding where one rune spans several indexes) part of a rune could
 * be mistaken for whitespace or another operator.
 *
 * Any error coming out of process is re-reported with a
 * "Problem lexing document" prefix.
 */
pub fn lex(document: String) -> Result<Box<Cell>, String> {
    if !document.is_ascii() {
        return Err("document may only contain ascii characters".to_string());
    }

    // TODO: Make multiple forms of Ok()
    // To represent the multiple passable outcomes
    process(document).map_err(|e| format!("Problem lexing document: {:?}", e))
}
/* The logic used in lex
 * Returns Ok(Box<Cell>) if lexing passes
 * Returns Err(String) if an error occurs
 *
 * Walks the document one char at a time. The top of delim_stack is the
 * delimiter currently being sought; when it is found (or when whitespace
 * is found while a ' ' is sought) the accumulated token is converted to
 * a Ctr and appended to the Cell on top of ref_stack.
 *
 * This function never panics on an exhausted ref_stack: that situation
 * is reported as an Err instead.
 *
 * WARNING: read docs for lex
 *
 * NOTE(review): behaviours below are kept as-is and should be confirmed:
 * - after append, the popped Cell is not pushed back onto ref_stack
 * - whitespace only ends a token while ' ' is the sought delimiter;
 *   while ')' is sought it is pushed into the token
 * - chars following '#' are still pushed into the token; only the
 *   allocation step is skipped while the comment is open
 */
fn process(document: String) -> Result<Box<Cell>, String> {
    if document.is_empty() {
        return Err("Empty document".to_string());
    }

    /* State variables
     * is_str:      set when a string opener (", ', `) is seen, cleared once
     *              a STRING has been allocated
     * ign:         set when '#' is seen, cleared when the matching '\n' is
     *              found; while set, the allocation step is skipped
     * token:       chars gathered for the token currently being read
     * delim_stack: delimiters being sought, innermost last; '*' is a
     *              sentinel meaning "the next char is escaped"
     * ref_stack:   Cells under construction; '(' pushes a fresh one
     */
    let mut is_str = false;
    let mut ign = false;
    let mut token = String::new();
    let mut delim_stack = vec![')', ' '];
    let mut ref_stack = vec![Box::new(Cell {
        car: Ctr::None,
        cdr: Ctr::None,
    })];

    /* Iterate over document
     * Manage currently sought delimiter
     */
    for c in document.chars() {
        let mut needs_alloc = true;
        let mut alloc_list = false;
        // copy the char out so the stack can be mutated freely below
        let delim = *delim_stack
            .last()
            .expect("delimiter stack should never be empty here");

        if delim == '*' {
            // escaped char: take it literally, then drop the escape
            // sentinel so only this one char is affected. An escaped
            // char is part of the token and must not end it.
            token.push(c);
            delim_stack.pop();
            needs_alloc = false;

        // normal delimiter cases
        } else if c == delim {
            // reset comment line status
            if delim == '\n' {
                ign = false;
            }

            // catch too many list end
            // set alloc_list
            if delim == ')' {
                alloc_list = true;
                if ref_stack.is_empty() {
                    return Err("too many end parens".to_string());
                }
            }
            delim_stack.pop();

        // try to generalize all whitespace
        } else if delim == ' ' && c.is_whitespace() {
            delim_stack.pop();

        // match a delimiter
        } else {
            needs_alloc = false;
            match c {
                // add a new Cell reference to the stack
                '(' => {
                    if !token.is_empty() || delim != ' ' {
                        return Err("list started in middle of another token".to_string());
                    }
                    ref_stack.push(Box::new(Cell {
                        car: Ctr::None,
                        cdr: Ctr::None,
                    }));
                    delim_stack.push(')');
                }
                // begin parsing a string; its closer is the same char
                '"' | '\'' | '`' => {
                    is_str = true;
                    delim_stack.push(c);
                }
                // eat the whole line
                '#' => {
                    ign = true;
                    delim_stack.push('\n');
                }
                // escape next char
                '\\' => delim_stack.push('*'),
                // add to token
                _ => token.push(c),
            }
        }

        if ign {
            continue;
        }

        /* 1. Handle allocation of new Ctr
         * 2. Handle expansion of current list ref
         */
        if needs_alloc {
            // always keep something to seek: fall back to whitespace
            if delim_stack.is_empty() {
                delim_stack.push(' ');
            }

            // an empty string ("") and an empty list (()) are allowed
            if token.is_empty() && !is_str && !alloc_list {
                return Err("Empty token".to_string());
            }

            // user input can exhaust ref_stack; report it rather than panic
            let mut current_cell_ref = ref_stack
                .pop()
                .ok_or_else(|| "no list available to hold token".to_string())?;

            // assigned exactly once by whichever branch below is taken
            let obj;
            if alloc_list {
                // we should never hit this but if we do I want to know
                if !token.is_empty() {
                    return Err("list/token conflict".to_string());
                }

                match ref_stack.pop() {
                    // return if we have finished the document
                    None => return Ok(current_cell_ref),
                    // otherwise the finished list becomes an element
                    // of the Cell that was beneath it on the stack
                    Some(parent) => {
                        obj = Ctr::CELL(current_cell_ref);
                        current_cell_ref = parent;
                    }
                }
            } else if is_str {
                obj = Ctr::STRING(token);
                is_str = false;
            } else if token == "true" {
                obj = Ctr::BOOL(true);
            } else if token == "false" {
                obj = Ctr::BOOL(false);
            } else if let Ok(i) = token.parse::<i128>() {
                obj = Ctr::INTEGER(i);
            } else if let Ok(f) = token.parse::<f64>() {
                obj = Ctr::FLOAT(f);
            } else if let Some(s) = tok_is_symbol(&token) {
                obj = Ctr::SYMBOL(s);
            } else {
                return Err(format!("Unparsable token: {}", token));
            }

            append(&mut current_cell_ref, obj);

            // reset token
            token = String::new();
        }
    }

    // ran out of input without returning a finished list
    if is_str {
        return Err(UNMATCHED_STR_DELIM.to_string());
    }
    Err(UNMATCHED_LIST_DELIM.to_string())
}
/* Decides whether a token may be used as a symbol.
 *
 * Returns Some(owned copy of token) when every char is alphabetic
 * or a base-10 digit, otherwise returns None.
 *
 * An empty token has no offending char and therefore yields Some("").
 */
fn tok_is_symbol(token: &str) -> Option<String> {
    let all_symbol_chars = token
        .chars()
        .all(|t| t.is_alphabetic() || t.is_ascii_digit());

    if all_symbol_chars {
        Some(token.to_string())
    } else {
        None
    }
}