Lexer and S-Expression data types

The lexer is complete with tests. It fully encapsulates the logic
of splitting an input document into a stream of tokens. It can be
instantiated from an Rc<str>, meaning no lifetimes need to be managed,
and references to the original document (like a string view) can be
passed around carelessly. The Lexer implements the Iterator
trait, which should help elegantly design REPLs, compilers, etc.

The S-Expression data type represents the parsed AST. The actual
parsing logic is yet to be added. It is intended that the AST be
the last step before compiling to bytecode. The data representation
here is cons cells of datum. Formatting is implemented.

Signed-off-by: Ava Affine <ava@sunnypup.io>
This commit is contained in:
Ava Apples Affine 2025-05-07 09:19:33 -07:00
commit 6554a0639a
10 changed files with 1533 additions and 0 deletions

128
mycelium/src/sexpr.rs Normal file
View file

@ -0,0 +1,128 @@
/* Mycelium Scheme
* Copyright (C) 2025 Ava Affine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use core::fmt::{self, Formatter};
use alloc::rc::Rc;
use alloc::vec::Vec;
use alloc::string::String;
/// A single S-expression value.
///
/// `None` is the `Default` variant. Its `Display`/`Debug` output is empty,
/// and `Ast` treats a `None` in its second field as the end of a list.
#[derive(Default, Clone)]
pub enum Datum {
/// A numeric value, stored as an `f64`.
Number(f64),
/// A boolean value.
Bool(bool),
/// A chain of cons cells (see `Ast`).
List(Ast),
/// A symbol, stored by name.
Symbol(String),
/// A string held as raw bytes; formatting decodes it as lossy UTF-8
/// and wraps it in double quotes.
String(Vec<u8>),
/// A sequence of datums, formatted with a `#(` prefix.
Vector(Vec<Datum>),
/// A sequence of bytes, formatted with a `#u8(` prefix.
ByteVector(Vec<u8>),
/// The absence of a value; formats as the empty string.
#[default]
None,
}
/// Renders a datum in its compact, source-like form.
///
/// Vectors and byte vectors are written as `#(a b c)` and `#u8(1 2 3)`:
/// elements are separated by single spaces, with no Rust-style brackets or
/// commas. `Datum::None` writes nothing.
impl fmt::Display for Datum {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // Writes `open`, then the items separated by single spaces, then `)`.
        fn spaced<T: fmt::Display>(
            f: &mut Formatter<'_>,
            open: &str,
            items: &[T],
        ) -> fmt::Result {
            f.write_str(open)?;
            for (idx, item) in items.iter().enumerate() {
                if idx > 0 {
                    f.write_str(" ")?;
                }
                write!(f, "{item}")?;
            }
            f.write_str(")")
        }

        match self {
            Datum::Number(n) => write!(f, "{n}"),
            // NOTE(review): booleans print as Rust `true`/`false`; confirm
            // whether the reader expects `#t`/`#f` instead.
            Datum::Bool(b) => write!(f, "{b}"),
            Datum::List(cells) => write!(f, "{cells}"),
            Datum::Symbol(name) => write!(f, "{name}"),
            // Strings are raw bytes; invalid UTF-8 is replaced, not rejected.
            Datum::String(bytes) => {
                write!(f, "\"{}\"", String::from_utf8_lossy(bytes))
            }
            Datum::Vector(items) => spaced(f, "#(", items),
            Datum::ByteVector(bytes) => spaced(f, "#u8(", bytes),
            Datum::None => Ok(()),
        }
    }
}
/// WARNING: this `Debug` impl is deliberately overloaded.
///
/// Instead of printing debugging information for the Rust code, it prints
/// the most expanded valid syntax for this datum.
///
/// Vectors and byte vectors are written as `#(a b c)` and `#u8(1 2 3)`:
/// elements are separated by single spaces, with no Rust-style brackets or
/// commas. Vector elements are themselves written with `Debug`.
impl fmt::Debug for Datum {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // Writes `open`, then the items separated by single spaces, then `)`.
        fn spaced<T: fmt::Debug>(
            f: &mut Formatter<'_>,
            open: &str,
            items: &[T],
        ) -> fmt::Result {
            f.write_str(open)?;
            for (idx, item) in items.iter().enumerate() {
                if idx > 0 {
                    f.write_str(" ")?;
                }
                write!(f, "{item:?}")?;
            }
            f.write_str(")")
        }

        match self {
            Datum::Number(n) => write!(f, "{n}"),
            Datum::Bool(b) => write!(f, "{b}"),
            // NOTE(review): lists go through `Ast`'s `Display` (compact)
            // form here; switch to `{:?}` if the dotted-pair form is wanted.
            Datum::List(cells) => write!(f, "{cells}"),
            Datum::Symbol(name) => write!(f, "{name}"),
            // Strings are raw bytes; invalid UTF-8 is replaced, not rejected.
            Datum::String(bytes) => {
                write!(f, "\"{}\"", String::from_utf8_lossy(bytes))
            }
            Datum::Vector(items) => spaced(f, "#(", items),
            Datum::ByteVector(bytes) => spaced(f, "#u8(", bytes),
            Datum::None => Ok(()),
        }
    }
}
/// A cons cell made of two reference-counted datums.
///
/// Field `0` holds the element at this position. Field `1` holds what
/// follows: another cell wrapped in `Datum::List`, `Datum::None` at the end
/// of a list, or any other datum as a trailing value.
///
/// The derived `Default` is a cell whose fields are both `Datum::None`;
/// `Display` renders that cell as `()`. The derived `Clone` clones the two
/// `Rc` handles rather than the datums they point to.
#[derive(Default, Clone)]
pub struct Ast(Rc<Datum>, Rc<Datum>);
/// Walks a chain of cons cells, yielding every element in order.
///
/// The iterator is the cell itself: each call to `next` hands out the
/// current element and then steps `self` forward to the following cell.
/// The head of the list is therefore yielded first, and a one-element list
/// yields exactly one item.
///
/// A trailing non-list value (as in the pair `(a . b)`) is yielded as the
/// final item. A cell whose fields are both `Datum::None` is the empty /
/// exhausted state, and `next` keeps returning `None` once it is reached.
impl Iterator for Ast {
    type Item = Rc<Datum>;

    fn next(&mut self) -> Option<Self::Item> {
        // An all-`None` cell is the empty list and also the spent iterator.
        if matches!((&*self.0, &*self.1), (Datum::None, Datum::None)) {
            return None;
        }

        let item = Rc::clone(&self.0);

        // Work out the next state before touching `self`, so the borrow of
        // the current tail has ended by the time the fields are overwritten.
        let (car, cdr) = match &*self.1 {
            // Proper continuation: step into the next cell.
            Datum::List(rest) => (Rc::clone(&rest.0), Rc::clone(&rest.1)),
            // End of a proper list: nothing is left to yield.
            Datum::None => (Rc::new(Datum::None), Rc::new(Datum::None)),
            // Trailing value: queue it up as the last item.
            _ => (Rc::clone(&self.1), Rc::new(Datum::None)),
        };
        self.0 = car;
        self.1 = cdr;

        Some(item)
    }
}
/// Renders a chain of cons cells in compact list notation.
///
/// A proper list prints as `(a b c)`. When the final cell's second field is
/// neither a list nor `Datum::None`, that value is written after a dot, as
/// in `(a b . c)`, so a pair stays distinguishable from a proper list.
impl fmt::Display for Ast {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "({}", self.0)?;

        // Follow the chain of cells, printing one element per cell.
        let mut cell = self;
        while let Datum::List(next) = &*cell.1 {
            cell = next;
            write!(f, " {}", cell.0)?;
        }

        match &*cell.1 {
            Datum::None => f.write_str(")"),
            // A non-list tail needs the dot; without it `(1 . 2)` would
            // print exactly like the two-element list `(1 2)`.
            tail => write!(f, " . {tail})"),
        }
    }
}
/// Renders a chain of cons cells in fully expanded dotted-pair notation.
///
/// Like `Datum`'s `Debug`, this prints syntax rather than Rust debugging
/// output: the list `(a b c)` is written as `(a . (b . (c . ())))` and the
/// pair `(a . b)` as `(a . b)`. A cell whose fields are both `Datum::None`
/// is written as `()`.
impl fmt::Debug for Ast {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut cell = self;
        // Number of `(` written so far that still need a matching `)`.
        let mut open = 0usize;

        loop {
            // The all-`None` cell is the empty list.
            if matches!((&*cell.0, &*cell.1), (Datum::None, Datum::None)) {
                f.write_str("()")?;
                break;
            }

            f.write_str("(")?;
            open += 1;
            match &*cell.0 {
                // Expand nested lists here so the output is fully dotted
                // regardless of how `Datum` formats its `List` variant.
                Datum::List(inner) => write!(f, "{inner:?}")?,
                car => write!(f, "{car:?}")?,
            }
            f.write_str(" . ")?;

            match &*cell.1 {
                Datum::List(next) => cell = next,
                // `Datum::None` formats as nothing, so spell out the
                // terminating empty list explicitly.
                Datum::None => {
                    f.write_str("()")?;
                    break;
                }
                tail => {
                    write!(f, "{tail:?}")?;
                    break;
                }
            }
        }

        for _ in 0..open {
            f.write_str(")")?;
        }
        Ok(())
    }
}