Parser
This commit adds a parser, complete with tests. The parser implements an iterator that yields Datum values. It wraps a Lexer and uses the Lexer's iterator interface to consume lexemes. Parse errors carry either a LexError or the fully lexed token that caused the problem. While implementing the Parser, bugs were found in the lexer package; as a result the lexing tests were extended and several small logic fixes were made. The number package has had slight tweaks to make number representations less cumbersome. Finally, the Datum display logic in the sexpr package has also been updated.

Signed-off-by: Ava Affine <ava@sunnypup.io>
parent a48fc52fab
commit 86f905ba1d

5 changed files with 632 additions and 29 deletions
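For orientation, the whole pipeline is driven through the From and Iterator impls added below. A sketch mirroring the new tests (not part of the diff itself):

    use alloc::rc::Rc;

    // document -> Lexer -> Parser -> stream of top-level data
    let mut p = Parser::from(Lexer::from(Rc::from("(one (two) . three)")));
    while let Some(datum) = p.next() {
        println!("{}", datum); // Datum implements Display
    }
    // None means end of document or an error; the error, if any,
    // stays behind in p.has_error_state.
    if let Some(e) = p.has_error_state {
        println!("{}", e);
    }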
mycelium/src/lexer.rs

@@ -67,7 +67,7 @@ impl fmt::Display for LexError {
         } else {
             let mut idx = self.1;
-            while self.1 - idx > 25 {
+            while self.1 - idx < 25 {
                 idx -= 1;
                 if self.2[idx..]
                     .char_indices()
@@ -107,8 +107,11 @@ impl fmt::Display for LexError {
             }
         };
 
-        write!(f, "Error when lexing document here:\n\n")?;
-        write!(f, " {}\n", &self.2[err_snippet_start()..err_snippet_end()])?;
+        write!(f, "Error when lexing document here: (idx: {})\n", self.1)?;
+        let s = err_snippet_start();
+        let st = self.1 - err_snippet_start();
+        write!(f, " {}\n", &self.2[s..err_snippet_end()])?;
+        write!(f, " {}^\n", " ".repeat(st))?;
         write!(f, "Error: {}\n", self.0)
     }
 }
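The two hunks above repair the error-snippet window around a lexing failure: the backtrack loop's condition was inverted (self.1 - idx starts at 0, so a "> 25" test never ran), and the message now carries the index plus a caret line pointing at the offending spot. A standalone sketch of the corrected backtrack, with illustrative names rather than the crate's own API:

    fn snippet_start(doc: &str, err_idx: usize) -> usize {
        if err_idx < 25 {
            return 0;
        }
        let mut idx = err_idx;
        // walk back at most 25 bytes, stopping just past a newline
        while err_idx - idx < 25 {
            idx -= 1;
            if doc.as_bytes()[idx] == b'\n' {
                return idx + 1;
            }
        }
        idx
    }

    fn main() {
        let doc = "first line\nsecond line with an error";
        assert_eq!(snippet_start(doc, 30), 11); // just past the '\n'
        assert_eq!(snippet_start(doc, 7), 0);   // too close to the doc start
    }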
@@ -132,7 +135,7 @@ pub enum LexTokenType {
     Quote,
     QuasiQuote,
     Unquote,
-    UnquoteSpliceTemplate,
+    UnquoteSplice,
     NumTypes,
 }
 
@@ -506,10 +509,11 @@ impl Lexer {
             return Ok(())
         }
 
+        // make sure next character is a proper delimiter
         adv!().and_then(|_| if !delim(self.current_char()) {
             return Err(LexError(E_UNDELIMITED_ESC, self.current_index,
                                 self.document.clone()))
-        } else { if in_string { self.current_index = saved_idx }; Ok(()) })
+        } else { self.current_index = saved_idx; Ok(()) })
     }
 
     /* Called to output a token by the iterator implementation
@@ -565,7 +569,8 @@ impl Lexer {
 
         if self.current_char() == ',' {
             if let Some(x) = self.peek_next_char() && x == '@'{
-                output = Some(self.cut_new_token(LexTokenType::UnquoteSpliceTemplate));
+                self.advance_char();
+                output = Some(self.cut_new_token(LexTokenType::UnquoteSplice));
             } else {
                 output = Some(self.cut_new_token(LexTokenType::Unquote));
             }
@@ -638,7 +643,8 @@ mod tests {
 
         /* Char Cases */ (
             // HAPPY CASES
-            vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space", "#\\x20"],
+            vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
+                 "#\\alarm", "#\\s", "#\\x20"],
 
             // SAD CASES
             vec!["\\c", "\\x20"]
@@ -743,9 +749,9 @@ mod tests {
             vec![]
         ),
 
-        /* UnquoteSpliceTemplate cases */ (
+        /* UnquoteSplice cases */ (
             // HAPPY CASES
-            vec![",@x", ",@(", ",@"],
+            vec![",@x", ",@(", ",@", ",@(two)"],
 
             // SAD CASES
             vec![]
@@ -755,7 +761,7 @@ mod tests {
         let no_subtoken_check_cases = [
             LexTokenType::Dot as u8,
             LexTokenType::Unquote as u8,
-            LexTokenType::UnquoteSpliceTemplate as u8
+            LexTokenType::UnquoteSplice as u8
         ];
 
         cases.iter().enumerate().for_each(|(idx, case)| {
@@ -834,4 +840,22 @@ mod tests {
         assert!(l.next().is_none());
         assert!(l.has_error_state.is_some());
     }
+
+    #[test]
+    fn char_lex_with_close() {
+        let mut res = vec![];
+        Lexer::from(Rc::from("(#\\a)"))
+            .into_iter()
+            .collect_into(&mut res);
+        assert_eq!(res.len(), 3);
+
+        assert_eq!(res[0].token_type, LexTokenType::ListStart);
+        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "(");
+
+        assert_eq!(res[1].token_type, LexTokenType::Char);
+        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], "#\\a");
+
+        assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
+        assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
+    }
 }

mycelium/src/lib.rs
@@ -18,7 +18,7 @@
 #![cfg_attr(not(test), no_std)]
 #![feature(let_chains)]
 #![feature(iter_collect_into)]
-#![feature(impl_trait_in_assoc_type)]
+#![feature(if_let_guard)]
 
 pub mod sexpr;
 pub mod lexer;

mycelium/src/number.rs
@@ -38,7 +38,7 @@ pub const E_SCIENTIFIC_MULTI_E: &str = "scientific notation implies only a s
 pub const E_SCIENTIFIC_OPERAND: &str = "couldnt parse 32 bit float operand";
 pub const E_SCIENTIFIC_POWER: &str = "couldnt parse integer power";
 
-trait Numeric: Copy + Clone + Debug + FromStr + Into<String> {
+pub trait Numeric: Copy + Clone + Debug + FromStr + Into<String> {
     fn is_exact(&self) -> bool;
     fn make_inexact(&self) -> Float;
     fn make_exact(&self) -> Fraction;
@@ -46,7 +46,7 @@ trait Numeric: Copy + Clone + Debug + FromStr + Into<String> {
 
 
 #[derive(Copy, Clone, Debug, PartialEq)]
-pub struct ScientificNotation (f32, isize);
+pub struct ScientificNotation (pub f32, pub isize);
 
 #[derive(Copy, Clone, Debug, PartialEq)]
 pub enum SymbolicNumber {
@@ -57,10 +57,10 @@ pub enum SymbolicNumber {
 }
 
 #[derive(Copy, Clone, Debug, PartialEq)]
-pub struct Fraction (isize, isize);
+pub struct Fraction (pub isize, pub isize);
 
 #[derive(Copy, Clone, Debug, PartialEq)]
-pub struct Float (f64);
+pub struct Float (pub f64);
 
 #[derive(Copy, Clone, Debug)]
 pub enum Number {
@@ -481,7 +481,7 @@ impl FromStr for Fraction {
 
 impl Into<String> for Fraction {
     fn into(self) -> String {
-        format!("#e{}/{}", self.0, self.1)
+        format!("{}/{}", self.0, self.1)
     }
 }
 
@@ -508,9 +508,13 @@ impl FromStr for Float {
 
 impl Into<String> for Float {
     fn into(self) -> String {
-        format!("#i{}", self.0)
+        if self.is_exact() {
+            format!("{}", self.0)
+        } else {
+            format!("#i{}", self.0)
+        }
     }
 }
 
 impl Numeric for Float {
     fn is_exact(&self) -> bool {
@@ -560,7 +564,7 @@ impl FromStr for ScientificNotation {
 
 impl Into<String> for ScientificNotation {
     fn into(self) -> String {
-        format!("#{}e{}", self.0, self.1)
+        format!("{}e{}", self.0, self.1)
     }
 }
 
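In practice the trimmed representations read like plain Scheme numbers. A sketch using the tuple fields made public above (not part of the diff):

    let half: String = Fraction(1, 2).into();
    assert_eq!(half, "1/2");   // previously "#e1/2"

    let sci: String = ScientificNotation(1.5, 3).into();
    assert_eq!(sci, "1.5e3");  // previously "#1.5e3"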
mycelium/src/parser.rs (new file, 556 lines)

@@ -0,0 +1,556 @@
/* Mycelium Scheme
 * Copyright (C) 2025 Ava Affine
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

use core::fmt::Display;

use crate::lexer::{
    LexError,
    LexToken,
    LexTokenType,
    Lexer,
    E_CHAR_TOO_LONG,
    E_END_OF_DOCUMENT
};
use crate::number::{Number, Numeric};
use crate::sexpr::{Datum, Ast};

use alloc::vec::Vec;
use alloc::vec;
use alloc::rc::Rc;
use alloc::string::String;


pub const E_LEX_ERROR: &str = "error in lexing document";
pub const E_EXTRA_CLOSE: &str = "closing parenthesis closes nothing";
pub const E_TERRIBLE: &str = "something has gone terribly wrong....";
pub const E_VECTOR_DOT: &str = "dotted notation not valid in vectors";
pub const E_DOT_NO_LIST: &str = "dotted notation used outside of list";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_CHAR_HEX_PARSE: &str = "hexadecimal character literal failed to parse";
pub const E_COLLECTION_TRUNC: &str = "collection is truncated";
pub const E_BV_BADBYTE: &str = "number provided is not a real byte";
pub const E_BV_NONBYTE: &str = "bytevector elements must all be bytes";
pub const E_TOO_MANY_DOT: &str = "valid dot notation only includes one dot";
pub const E_DOT_IDX: &str = "dot should precede only last element in list";
pub const E_DOT_EMPTY: &str = "cannot apply dotted notation to otherwise empty list";
pub const E_UNQUOTE_NONQQ: &str = "unquote must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_NONQQ: &str = "unquote-splicing must be within a quasiquoted form";
pub const E_UNQUOTE_SPL_COLL: &str = "expected list or vector after unquote-splicing";
/* ParseError
 * 0: error string
 * 1: either problematic lexing token, or a lexing error
 */
#[derive(Clone)]
pub struct ParseError(pub &'static str, pub Option<Result<LexToken, LexError>>);

impl Display for ParseError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let err_snippet_start = |t: &LexToken| -> usize {
            /* backtrack from current index until we either hit
             * - beginning of line
             * - 25 characters ago
             * - the doc start
             */
            if t.start_idx < 25 {
                // too close to the doc start to backtrack a full window
                0
            } else {
                let mut idx = t.start_idx;
                while t.start_idx - idx < 25 {
                    idx -= 1;
                    // the slice's first char sits at relative index 0
                    if t.source_doc[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        idx += 1;
                        break;
                    }
                }

                idx
            }
        };

        let err_snippet_end = |t: &LexToken| -> usize {
            /* read through document until we either hit
             * - end of line
             * - 25 characters forward
             * - the doc end
             */
            if t.source_doc.len() - t.end_idx < 25 {
                t.source_doc.len()
            } else {
                let mut idx = t.end_idx;
                while idx - t.end_idx < 25 {
                    idx += 1;
                    if t.source_doc[idx..]
                        .char_indices()
                        .next()
                        .is_some_and(|(_, x)| x == '\n') {
                        break;
                    }
                }

                idx
            }
        };

        if let Some(frag) = &self.1 {
            match frag {
                Ok(token) => {
                    write!(f, "Error parsing syntax: {}\n", self.0)?;
                    write!(f, " problematic token: {}\n",
                        &token.source_doc[token.start_idx..token.end_idx])?;
                    // return here so the header is not printed twice
                    return write!(f, " {}\n",
                        &token.source_doc[err_snippet_start(token)..err_snippet_end(token)]);
                },

                Err(e) => {
                    return e.fmt(f);
                }
            }
        }

        write!(f, "Error parsing syntax: {}\n", self.0)
    }
}
pub struct Parser {
    lexer: Lexer,
    pub has_error_state: Option<ParseError>,
    delayed: Vec<Rc<Datum>>,
    quasiquoted: bool,
}

/* The From and Iterator traits serve as the primary
 * interface to work with the parser. It is expected to
 * make a Lexer first, and then use casting or type conv
 * to make it into a parser and then a final AST, which
 * we can then convert into a VM image once the compile
 * step is finished.
 */

impl From<Lexer> for Parser {
    fn from(l: Lexer) -> Parser {
        Parser {
            lexer: l,
            has_error_state: None,
            delayed: vec![],
            quasiquoted: false
        }
    }
}

impl Iterator for Parser {
    type Item = Rc<Datum>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.has_error_state.is_some() {
            return None;
        }

        if self.delayed.len() > 0 {
            return self.delayed.pop()
        }

        let res = self.get_next_datum();
        if let Err(ref e) = res {
            self.has_error_state = Some(e.clone());
        }

        return res.ok()
    }
}

fn read_number(token: LexToken) -> Result<Number, ParseError> {
    return match (&token.source_doc[token.start_idx..token.end_idx]).parse::<Number>() {
        Ok(num) => Ok(num),
        Err(e) => Err(ParseError(e, Some(Ok(token)))),
    }
}
fn read_char(token: LexToken) -> Result<u8, ParseError> {
    // a char literal is at least three chars long, e.g. #\a
    if token.end_idx - token.start_idx < 3 {
        return Err(ParseError(E_CHAR_TRUNCATED, Some(Ok(token))))
    }

    match &token.source_doc[token.start_idx + 2..token.end_idx] {
        "alarm" => Ok(7),
        "backspace" => Ok(8),
        "delete" => Ok(127),
        "escape" => Ok(27),
        "newline" => Ok('\n' as u8),
        "null" => Ok(0),
        "return" => Ok(13),
        "space" => Ok(32),
        "tab" => Ok(9),
        _ if token.source_doc[token.start_idx + 2..].starts_with('x') &&
            token.end_idx - token.start_idx > 3 => {
            if token.end_idx - token.start_idx > 5 {
                return Err(ParseError(E_CHAR_TOO_LONG, Some(Ok(token))))
            }

            match u8::from_str_radix(
                &token.source_doc[token.start_idx + 3..token.end_idx],
                16) {
                Ok(u) => Ok(u),
                Err(_) => Err(ParseError(E_CHAR_HEX_PARSE, Some(Ok(token))))
            }
        },
        _ => Ok(token.source_doc.as_bytes()[token.start_idx + 2])
    }
}
fn read_bool(token: LexToken) -> bool {
    match &token.source_doc[token.start_idx..token.end_idx] {
        "#t" => true,
        "#f" => false,
        _ => panic!("impossible boolean")
    }
}

fn read_string(token: LexToken) -> Vec<u8> {
    if token.end_idx - token.start_idx < 3 {
        // empty string other than delimiters
        Vec::default()
    } else {
        token.source_doc[token.start_idx + 1..token.end_idx - 1]
            .as_bytes()
            .to_vec()
    }
}
impl Parser {
    /* Rules we must mind:
     * 0. at this stage, drop and ignore comments, directives
     * 1. quote, quasiquote, unquote, and unquote splicing
     *    all require another input after them (excluding
     *    collection end)
     * 2. unquote-splicing explicitly requires a form I think?
     *    (verify)
     * 3. vectors, lists, may have nested collections in them
     *    so track collection state in the parser's stack.
     * 4. list dotted notation needs next datum put in cdr.
     * 5. bytevectors can only have numbers from 0-255 in them.
     */
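    /* A sketch of rules 1 and 4 in action (an illustration added in
     * editing, not part of the committed file): parsing "'(a . b)" yields
     *   (quote (a . b)) == List(Ast(Symbol("quote"),
     *                          List(Ast(List(Ast(a, b)), None))))
     * i.e. the quote wraps the single datum that follows it, and the
     * datum after the dot lands directly in the cdr of the last pair.
     */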
    fn complete_quote(&mut self) -> Result<Rc<Datum>, ParseError> {
        let next = self.get_next_datum()?;
        Ok(Rc::from(Datum::List(Rc::from(Ast(
            Rc::from(Datum::Symbol(String::from("quote"))),

            Rc::from(Datum::List(Rc::from(Ast(
                next,
                Rc::from(Datum::None)
            ))))
        )))))
    }

    fn complete_unquote_splicing(&mut self, tok: LexToken) -> Result<Rc<Datum>, ParseError> {
        let next = self.get_next_datum()?;
        match *next {
            Datum::List(_) | Datum::Vector(_) | Datum::Symbol(_) => (),
            _ => return Err(ParseError(E_UNQUOTE_SPL_COLL, Some(Ok(tok))))
        }

        Ok(Rc::from(Datum::List(Rc::from(Ast(
            Rc::from(Datum::Symbol(String::from("unquote-splicing"))),

            Rc::from(Datum::List(Rc::from(Ast(
                next,
                Rc::from(Datum::None)
            ))))
        )))))
    }

    fn complete_unquote(&mut self) -> Result<Rc<Datum>, ParseError> {
        let next = self.get_next_datum()?;
        Ok(Rc::from(Datum::List(Rc::from(Ast(
            Rc::from(Datum::Symbol(String::from("unquote"))),

            Rc::from(Datum::List(Rc::from(Ast(
                next,
                Rc::from(Datum::None)
            ))))
        )))))
    }

    fn complete_quasiquote(&mut self) -> Result<Rc<Datum>, ParseError> {
        let prev = self.quasiquoted; // handle nesting appropriately
        self.quasiquoted = true;
        let next = self.get_next_datum()?;
        self.quasiquoted = prev;

        Ok(Rc::from(Datum::List(Rc::from(Ast(
            Rc::from(Datum::Symbol(String::from("quasiquote"))),

            Rc::from(Datum::List(Rc::from(Ast(
                next,
                Rc::from(Datum::None)
            ))))
        )))))
    }
    fn complete_collection(&mut self, token: LexToken) -> Result<Rc<Datum>, ParseError> {
        let is_bv = match token.token_type {
            LexTokenType::ByteVectorStart => true,
            _ => false,
        };

        let mut lex_stack = vec![];
        let mut bv_stack = vec![];

        /* counting indexes helps greatly with calculating position dependent
         * syntax rules like dot notation in lists
         */
        let mut iter_count = 0;
        // (dot position, dot token, datum destined for the cdr)
        let mut dot_idx = (None, None, None);

        loop {
            let next_tok = self.lexer.next();
            if let None = next_tok {
                return Err(ParseError(E_COLLECTION_TRUNC, None))
            }

            let tok = next_tok.unwrap();

            match tok.token_type {
                // Universal cases
                LexTokenType::Comment | LexTokenType::Directive => continue,
                LexTokenType::NumTypes =>
                    return Err(ParseError(E_TERRIBLE, Some(Ok(tok)))),
                LexTokenType::Unquote if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(tok)))),
                LexTokenType::UnquoteSplice if !self.quasiquoted =>
                    return Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(tok)))),


                // CollectionEnd must take precedence over the dot notation case
                LexTokenType::CollectionEnd => break,
                // anything after the cdr datum (other than the closing
                // paren handled above) is invalid
                _ if let Some(idx) = dot_idx.0 && iter_count - idx >= 2 =>
                    return Err(ParseError(E_DOT_IDX, Some(Ok(dot_idx.1.unwrap())))),

                LexTokenType::Dot if token.token_type != LexTokenType::ListStart =>
                    return Err(ParseError(E_VECTOR_DOT, Some(Ok(tok)))),


                // List, Vector cases
                LexTokenType::ListStart | LexTokenType::VectorStart |
                LexTokenType::ByteVectorStart if !is_bv =>
                    lex_stack.push(self.complete_collection(tok)?),
                LexTokenType::String if !is_bv =>
                    lex_stack.push(Rc::from(Datum::String(read_string(tok)))),
                LexTokenType::Number if !is_bv =>
                    lex_stack.push(Rc::from(Datum::Number(read_number(tok)?))),
                LexTokenType::Char if !is_bv =>
                    lex_stack.push(Rc::from(Datum::Char(read_char(tok)?))),
                LexTokenType::Boolean if !is_bv =>
                    lex_stack.push(Rc::from(Datum::Bool(read_bool(tok)))),
                LexTokenType::Symbol if !is_bv =>
                    lex_stack.push(Rc::from(Datum::Symbol(
                        String::from(&tok.source_doc[tok.start_idx..tok.end_idx])))),
                LexTokenType::Quote if !is_bv =>
                    lex_stack.push(self.complete_quote()?),
                LexTokenType::QuasiQuote if !is_bv =>
                    lex_stack.push(self.complete_quasiquote()?),
                LexTokenType::Unquote if !is_bv && self.quasiquoted =>
                    lex_stack.push(self.complete_unquote()?),
                LexTokenType::UnquoteSplice if !is_bv && self.quasiquoted =>
                    lex_stack.push(self.complete_unquote_splicing(tok)?),


                // List only cases
                LexTokenType::Dot => if let Some(_) = dot_idx.0 {
                    return Err(ParseError(E_TOO_MANY_DOT, Some(Ok(tok))))
                } else {
                    dot_idx = (Some(iter_count), Some(tok), None)
                },


                // ByteVector cases
                LexTokenType::Number if is_bv => {
                    let n = read_number(tok.clone())?
                        .make_inexact();

                    if n.0 < 0.0 || n.0 > 255.0 || n.0.fract() != 0.0 {
                        return Err(ParseError(E_BV_BADBYTE, Some(Ok(tok))))
                    }

                    bv_stack.push(n.0 as u8);
                },

                _ if is_bv => return Err(ParseError(E_BV_NONBYTE, Some(Ok(tok)))),

                // This should never get touched
                _ => todo!("theoretically impossible case in parser::complete_collection"),
            }

            if let Some(idx) = dot_idx.0 && iter_count == idx + 1 {
                dot_idx.2 = Some(lex_stack.pop());
            }

            iter_count += 1;
        }

        if is_bv {
            return Ok(Rc::from(Datum::ByteVector(bv_stack)))
        }

        if token.token_type == LexTokenType::VectorStart {
            return Ok(Rc::from(Datum::Vector(lex_stack)))
        }

        // handle an empty list
        if lex_stack.len() < 1 {
            // dont try to do something like "( . 'thing)"
            if let (_, Some(node), _) = dot_idx {
                return Err(ParseError(E_DOT_EMPTY, Some(Ok(node))))
            }
            return Ok(Rc::from(Datum::List(Rc::from(Ast(Rc::from(Datum::None),
                                                        Rc::from(Datum::None))))))
        }

        let mut from_rear: Rc<Ast>;
        if let (_, _, Some(node)) = dot_idx {
            from_rear = Rc::from(Ast(lex_stack.pop().unwrap(), node.unwrap()));
        } else {
            from_rear = Rc::from(Ast(lex_stack.pop().unwrap(), Rc::from(Datum::None)));
        }

        lex_stack.iter()
            .rev()
            .for_each(|x| {
                from_rear = Rc::from(Ast(x.clone(), Rc::from(Datum::List(from_rear.clone()))));
            });

        Ok(Rc::from(Datum::List(from_rear)))
    }
    fn get_next_datum(&mut self) -> Result<Rc<Datum>, ParseError> {
        if let Some(token) = self.lexer.next() {
            match token.token_type {
                // normal paths:
                LexTokenType::String => Ok(Rc::from(Datum::String(read_string(token)))),
                LexTokenType::Number => Ok(Rc::from(Datum::Number(read_number(token)?))),
                LexTokenType::Char => Ok(Rc::from(Datum::Char(read_char(token)?))),
                LexTokenType::Symbol => Ok(Rc::from(Datum::Symbol(String::from(
                    &token.source_doc[token.start_idx..token.end_idx])))),
                LexTokenType::Boolean => Ok(Rc::from(Datum::Bool(read_bool(token)))),
                LexTokenType::VectorStart | LexTokenType::ListStart |
                LexTokenType::ByteVectorStart => self.complete_collection(token),
                LexTokenType::Quote => self.complete_quote(),
                LexTokenType::QuasiQuote => self.complete_quasiquote(),
                LexTokenType::Unquote if self.quasiquoted => self.complete_unquote(),
                LexTokenType::UnquoteSplice if self.quasiquoted =>
                    self.complete_unquote_splicing(token),

                // immediate errors:
                LexTokenType::CollectionEnd => Err(ParseError(E_EXTRA_CLOSE, Some(Ok(token)))),
                LexTokenType::NumTypes => Err(ParseError(E_TERRIBLE, Some(Ok(token)))),
                LexTokenType::Dot => Err(ParseError(E_DOT_NO_LIST, Some(Ok(token)))),
                LexTokenType::Unquote if !self.quasiquoted =>
                    Err(ParseError(E_UNQUOTE_NONQQ, Some(Ok(token)))),
                LexTokenType::UnquoteSplice if !self.quasiquoted =>
                    Err(ParseError(E_UNQUOTE_SPL_NONQQ, Some(Ok(token)))),

                // ignore comment, directive:
                _ => self.get_next_datum(),
            }

        // Lexer error
        } else if self.lexer.has_error_state.is_some() {
            Err(ParseError(E_LEX_ERROR,
                Some(Err(self.lexer.has_error_state.clone().unwrap()))))

        // End of document
        } else {
            Err(ParseError(E_END_OF_DOCUMENT, None))
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_cases() {
        let happy_cases = vec![
            // case, result
            ("\"test\"", "\"test\""),
            ("test", "test"),
            ("(1 2 3)", "(1 2 3)"),
            ("'test", "(quote test)"),
            ("`test", "(quasiquote test)"),
            ("`(,one)", "(quasiquote ((unquote one)))"),
            ("`(test ,@(two))", "(quasiquote (test (unquote-splicing (two))))"),
            ("#u8(0 14 249)", "#u8(0 14 249)"),
            ("(nested lists (are pretty cool))", "(nested lists (are pretty cool))"),
            ("((nested) lists (are (pretty) cool))", "((nested) lists (are (pretty) cool))"),
            ("(dotted . notation)", "(dotted . notation)"),
            ("(longer dotted . notation)", "(longer dotted . notation)"),
            ("(hello \"world\")", "(hello \"world\")"),
            ("; big doc string\n(one two)", "(one two)"),
            ("(list #(vect 2 3 #u8(0 0)))", "(list #(vect 2 3 #u8(0 0)))"),
            ("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)")
        ];

        let sad_cases = vec![
            "(",
            "( one two ",
            "( one two three ( four )",
            ")",
            "#(st",
            "#u8(0 ",
            "#u8(256)",
            "#u8(two)",
            "(one two ,three)",
            "(one two ,@three)",
            "`(one two ,@4.0)",
            "(. two)",
            "(one . two . three)",
        ];

        println!("+ Testing Happy Cases...");
        happy_cases.iter()
            .for_each(|(case, result)| {
                println!("  - case: {}", *case);
                let mut p = Parser::from(Lexer::from(Rc::from(*case)));
                let res = p.next();
                if let None = res {
                    println!("{}", p.has_error_state.unwrap());
                }
                assert_eq!(
                    format!("{}", res.unwrap()),
                    format!("{}", result)
                );
            });

        println!("+ Testing Sad Cases...");
        sad_cases.iter()
            .for_each(|case| {
                println!("  - case: {}", *case);
                let mut p = Parser::from(Lexer::from(Rc::from(*case)));
                assert!(p.next().is_none() && p.has_error_state.is_some())
            });
    }
}
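As a concrete companion to complete_quote above, this is the tree it builds for 'x and how the sexpr Display logic below renders it back — a sketch assuming the crate's types as shown in this diff, not part of the committed file:

    let desugared = Datum::List(Rc::from(Ast(
        Rc::from(Datum::Symbol(String::from("quote"))),
        Rc::from(Datum::List(Rc::from(Ast(
            Rc::from(Datum::Symbol(String::from("x"))),
            Rc::from(Datum::None),
        )))),
    )));
    assert_eq!(format!("{}", desugared), "(quote x)");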
mycelium/src/sexpr.rs

@@ -16,6 +16,7 @@
 */
 
 use core::fmt::{self, Formatter};
+use alloc::format;
 use alloc::rc::Rc;
 use alloc::vec::Vec;
 use alloc::string::String;
@@ -26,33 +27,51 @@ use crate::number::Number;
 pub enum Datum {
     Number(Number),
     Bool(bool),
-    List(Ast),
+    List(Rc<Ast>),
     Symbol(String),
     Char(u8),
     String(Vec<u8>),
-    Vector(Vec<Datum>),
+    Vector(Vec<Rc<Datum>>),
     ByteVector(Vec<u8>),
     #[default]
     None,
 }
 
 fn byte_to_escaped_char(b: u8) -> String {
-    unimplemented!()
+    // alarm, backspace, delete
+    match b {
+        _ if b > 31 && b < 127 => String::from(b as char),
+        _ => format!("x{:x}", b),
+    }
 }
 
+fn fmt_vec<T: fmt::Display>(v: &Vec<T>) -> String {
+    if v.len() == 0 {
+        return String::new()
+    }
+    let mut s = format!("{}", v[0]);
+    let mut i = v.iter();
+    i.next(); // discard
+    i.for_each(|e| {
+        s = format!("{} {}", s, e);
+    });
+
+    s
+}
+
 impl fmt::Display for Datum {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
         match self {
             Datum::Number(n) => write!(f, "{}", Into::<String>::into(*n)),
-            Datum::Bool(n) => write!(f, "{n}"),
+            Datum::Bool(n) => write!(f, "{}", if *n {"#t"} else {"#f"}),
             Datum::List(n) => write!(f, "{n}"),
             Datum::Symbol(n) => write!(f, "{n}"),
-            Datum::Char(n) => write!(f, "{}",
+            Datum::Char(n) => write!(f, "#\\{}",
                 byte_to_escaped_char(*n)),
             Datum::String(n) =>
                 write!(f, "\"{}\"", String::from_utf8_lossy(&*n)),
-            Datum::Vector(n) => write!(f, "#({n:?})"),
-            Datum::ByteVector(n) => write!(f, "#u8({n:?})"),
+            Datum::Vector(n) => write!(f, "#({})", fmt_vec(n)),
+            Datum::ByteVector(n) => write!(f, "#u8({})", fmt_vec(n)),
             Datum::None => Ok(())
         }
     }
@@ -68,7 +87,7 @@ impl fmt::Debug for Datum {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
         match self {
             Datum::Number(n) => write!(f, "{}", Into::<String>::into(*n)),
-            Datum::Bool(n) => write!(f, "{n}"),
+            Datum::Bool(n) => write!(f, "{}", if *n {"#t"} else {"#f"}),
             Datum::List(n) => write!(f, "{n}"),
             Datum::Char(n) => write!(f, "{}",
                 byte_to_escaped_char(*n)),
@@ -84,7 +103,7 @@ impl fmt::Debug for Datum {
 
 
 #[derive(Default, Clone)]
-pub struct Ast(Rc<Datum>, Rc<Datum>);
+pub struct Ast(pub Rc<Datum>, pub Rc<Datum>);
 
 impl Iterator for Ast {
     type Item = Rc<Datum>;
@@ -120,7 +139,7 @@ impl fmt::Display for Ast {
         if let Datum::None = &*cur.1 {
             write!(f, ")")
         } else {
-            write!(f, " {})", cur.1)
+            write!(f, " . {})", cur.1)
         }
     }
 }
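With the dot restored, improper lists round-trip through Display. A sketch, again assuming the now-public Ast fields:

    let pair = Datum::List(Rc::from(Ast(
        Rc::from(Datum::Symbol(String::from("dotted"))),
        Rc::from(Datum::Symbol(String::from("notation"))),
    )));
    assert_eq!(format!("{}", pair), "(dotted . notation)");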