Decomposer: fixes from found code

This commit adds a new utility, the decomposer, which has primarily
been used to test the AST against Scheme code found in the wild (on
the internet). The decomposer times and tests the lexing and parsing
of any document full of Scheme.

It also includes additional test cases and logic fixes for issues
found during that testing.
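
For context, here is a condensed sketch of the driver loop the
decomposer runs for each script. It mirrors decomposer/src/main.rs
below; the mycelium calls are taken from that file, and the sample
Scheme string is only a placeholder.

use std::rc::Rc;
use std::time::Instant;
use mycelium::{lexer as l, parser as p};

fn time_and_parse(src: &str) {
    let start = Instant::now();
    let mut parser = p::Parser::from(l::Lexer::from(Rc::from(src)));
    // Pull parsed top-level forms until the parser is exhausted.
    while let Some(form) = parser.next() {
        println!(" > res: {}", form);
    }
    println!(" > time: {:#?}", start.elapsed());
    // Anything other than the end-of-document marker is a real failure.
    if let Some(e) = &parser.has_error_state {
        if e.0 != l::E_END_OF_DOCUMENT {
            println!(" > error!\n{}", e);
        }
    }
}

fn main() {
    time_and_parse("(define (square x) (* x x))");
}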

Signed-off-by: Ava Affine <ava@sunnypup.io>
Ava Apples Affine 2025-05-21 14:48:36 -07:00
parent 86f905ba1d
commit e4c6e0924a
7 changed files with 417 additions and 40 deletions

Cargo.lock generated

@ -6,16 +6,132 @@ version = 4
name = "amanita"
version = "0.1.0"
[[package]]
name = "anstream"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e"
dependencies = [
"anstyle",
"once_cell",
"windows-sys",
]
[[package]]
name = "autocfg"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "clap"
version = "4.5.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "decomposer"
version = "0.1.0"
dependencies = [
"clap",
"mycelium",
]
[[package]]
name = "enoki"
version = "0.1.0"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "mycelium"
version = "0.1.0"
@ -95,3 +211,129 @@ checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "proc-macro2"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "syn"
version = "2.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"


@ -1,3 +1,3 @@
[workspace]
resolver = "2"
members = [ "amanita", "enoki","mycelium"]
members = ["mycelium", "decomposer"]

decomposer/Cargo.toml Normal file

@ -0,0 +1,9 @@
[package]
name = "decomposer"
version = "0.1.0"
edition = "2021"
authors = ["Ava Affine <ava@sunnypup.io>"]
[dependencies]
clap = { version = "4.5.38", features = [ "derive" ] }
mycelium = { path = "../mycelium" }

decomposer/src/main.rs Normal file

@ -0,0 +1,81 @@
/* Mycelium Scheme
* Copyright (C) 2025 Ava Affine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#![feature(iter_collect_into)]
use mycelium::{lexer as l, parser as p};
use std::rc::Rc;
use std::path::PathBuf;
use std::fs;
use std::error::Error;
use std::time::Instant;
use clap::Parser;
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
/// display time to parse scripts
#[arg(short, long)]
time: bool,
/// output script AST once parsed
#[arg(short, long)]
debug: bool,
scripts: Vec<PathBuf>,
}
fn main() -> Result<(), Box<dyn Error>> {
let args = Args::parse();
for script in args.scripts {
println!("+ processing {:#?}", script);
let message = fs::read_to_string(script)?;
let start: Option<Instant>;
if args.time {
start = Some(Instant::now());
} else {
start = None;
}
let mut p = p::Parser::from(l::Lexer::from(Rc::from(message.as_str())));
let mut i = p.next();
while let Some(ref r) = i {
if args.debug {
println!(" > res: {}", r);
}
i = p.next();
}
if let Some(i) = start {
println!(" > time: {:#?}", i.elapsed());
}
if p.has_error_state.is_some() {
let e = p.has_error_state.unwrap();
if e.0 != l::E_END_OF_DOCUMENT {
println!(" > error!\n{}", e);
}
}
}
Ok(())
}
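
For reference, given the clap flags defined above (--time, --debug, and
a list of script paths), a typical invocation of this binary might look
roughly like: decomposer --time --debug sample.scm, where sample.scm is
a placeholder for whatever Scheme file you want to exercise.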


@ -234,7 +234,7 @@ impl Lexer {
}
#[inline(always)]
fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
fn match_chunk_next(&mut self, chunk: &str, peek: bool) -> Option<bool> {
let saved = self.current_index;
for i in chunk.chars() {
if let None = self.advance_char() {
@ -248,6 +248,7 @@ impl Lexer {
}
}
if peek { self.current_index = saved; }
Some(true)
}
@ -290,7 +291,15 @@ impl Lexer {
let mut base = 10;
let a = self.current_char();
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
if let Some(true) = self.match_chunk_next("inf.0", false) {
return self.cut_new_token(LexTokenType::Number)
}
if let Some(true) = self.match_chunk_next("nan.0", false) {
return self.cut_new_token(LexTokenType::Number)
}
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' || a == '+' || a == '-' {
if let None = self.advance_char() {
return Err(LexError(E_NUMBER_TRUNCATED,
self.current_token_start, self.document.clone()))
@ -311,14 +320,6 @@ impl Lexer {
}
}
if let Some(true) = self.match_chunk_next("inf.0") {
return self.cut_new_token(LexTokenType::Number)
}
if let Some(true) = self.match_chunk_next("nan.0") {
return self.cut_new_token(LexTokenType::Number)
}
let mut hasdot = false;
let mut hasslash = false;
let mut hase = false;
@ -374,7 +375,7 @@ impl Lexer {
'|' if self.advance_char().and_then(|_|
if self.current_char() == '#' {
return Some(())
} else { return None }).is_some() =>
} else { return None }).is_some() =>
return self.cut_new_token(LexTokenType::Comment),
_ => continue,
};
@ -427,7 +428,7 @@ impl Lexer {
't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
'|' => return self.seek_end_of_block_comment(),
'!' => return self.seek_end_of_line_comment(true),
'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
'u' if self.match_chunk_next("8(", false).is_some_and(|x| x) =>
return self.cut_new_token(LexTokenType::ByteVectorStart),
'(' => return self.cut_new_token(LexTokenType::VectorStart),
'\\' => self.seek_end_of_escape(false, )
@ -470,16 +471,19 @@ impl Lexer {
*/
match self.current_char() {
// char escapes
'a' if !in_string => self.match_chunk_next("larm"),
'b' if !in_string => self.match_chunk_next("ackspace"),
'd' if !in_string => self.match_chunk_next("elete"),
'e' if !in_string => self.match_chunk_next("scape"),
'n' if !in_string => self.match_chunk_next("ewline").or(
self.match_chunk_next("ull")
),
'r' if !in_string => self.match_chunk_next("eturn"),
's' if !in_string => self.match_chunk_next("pace"),
't' if !in_string => self.match_chunk_next("ab"),
'a' if !in_string => self.match_chunk_next("larm", false),
'b' if !in_string => self.match_chunk_next("ackspace", false),
'd' if !in_string => self.match_chunk_next("elete", false),
'e' if !in_string => self.match_chunk_next("scape", false),
'n' if !in_string => self.match_chunk_next("ewline", false)
.or(self.match_chunk_next("ull", false)),
'r' if !in_string => self.match_chunk_next("eturn", false),
's' if !in_string => self.match_chunk_next("pace", false),
't' if !in_string => self.match_chunk_next("ab", false),
// specifically catch a non hex 'x' character escape
'x' if self.peek_next_char()
.is_none_or(|c| TOK_DELIMITERS.contains(&c)) && !in_string
=> None,
// string escapes
'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
@ -538,7 +542,14 @@ impl Lexer {
self.current_token_start = self.current_index;
// handle syntactic sugar cases
macro_rules! numeric {
( $x:expr ) => {
$x.is_numeric() || self.match_chunk_next("inf.0", true)
.or(self.match_chunk_next("nan.0", true))
.or(Some(false))
.unwrap()
};
}
match self.current_char() {
';' => output = Some(self.seek_end_of_line_comment(false)),
'\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
@ -547,12 +558,11 @@ impl Lexer {
')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
'#' => output = Some(self.seek_end_from_hash()),
'"' => output = Some(self.seek_end_of_string()),
/* This code commented out. I dont think you can open a char without '#'
* '\\' => output = Some(self.seek_end_of_escape(false)
.and_then(|_|
self.cut_new_token(LexTokenType::Char))),*/
'|' => output = Some(self.seek_closing_pipe()),
'+' | '-' => output = Some(self.seek_end_of_number()),
'+' | '-' if self.peek_next_char()
.and_then(|x| Some(numeric!(x)))
.or(Some(false))
.unwrap() => output = Some(self.seek_end_of_number()),
_ if self.current_char().is_numeric() => output =
Some(self.seek_end_of_number()),
_ => (),
@ -583,7 +593,10 @@ impl Lexer {
if output.is_none() {
loop {
let c = self.current_char();
if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
if !c.is_alphanumeric() &&
!LEX_SPECIAL.contains(&c) &&
!TOK_DELIMITERS.contains(&c) {
output = Some(Err(LexError(E_INCOMPREHENSIBLE,
self.current_index, self.document.clone())));
break;
@ -635,7 +648,7 @@ mod tests {
/* Number Cases */ (
// HAPPY CASES
vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
"#e1e1", "#i1/4", "+inf.0", "1e1"],
"#e1e1", "#i1/4", "+inf.0", "1e1", "-1"],
// SAD CASES
vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
@ -644,7 +657,7 @@ mod tests {
/* Char Cases */ (
// HAPPY CASES
vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
"#\\alarm", "#\\s", "#\\x20"],
"#\\alarm", "#\\s", "#\\x20", "#\\x", "#\\\\"],
// SAD CASES
vec!["\\c", "\\x20"]
@ -654,7 +667,8 @@ mod tests {
// HAPPY CASES
vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
"list->vector", "|two words|", "|two\nwords|",
"the-word-recursion-has-many-meanings"],
"the-word-recursion-has-many-meanings", "+", "-",
"slatex.*slatex*"],
// SAD CASES
vec!["|\"\"|", "|(|", "|valid"]
@ -858,4 +872,29 @@ mod tests {
assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
}
#[test]
fn num_lex_plusnum_case() {
let mut res = vec![];
Lexer::from(Rc::from("+1"))
.into_iter()
.collect_into(&mut res);
assert_eq!(res.len(), 1);
assert_eq!(res[0].token_type, LexTokenType::Number);
}
#[test]
fn char_lex_xchar_case() {
let mut res = vec![];
Lexer::from(Rc::from("#\\x)"))
.into_iter()
.collect_into(&mut res);
assert_eq!(res.len(), 2);
assert_eq!(res[0].token_type, LexTokenType::Char);
assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "#\\x");
assert_eq!(res[1].token_type, LexTokenType::CollectionEnd);
assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], ")");
}
}
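
A note on the sign handling above: with this change a leading '+' or '-'
only starts a number when what follows looks numeric (a digit, or an
inf.0/nan.0 chunk); otherwise the character falls through to the
identifier path, which is why "+" and "-" now appear in the identifier
happy cases. A hypothetical check in the style of the tests above, not
part of this commit (it assumes the same test-module imports and the
iter_collect_into feature used there):

#[test]
fn signed_number_literals_lex_as_numbers() {
    // Each of these signed literals appears in the happy cases above and
    // should come back as exactly one Number token.
    for src in vec!["+1", "-1", "+inf.0"] {
        let mut res = vec![];
        Lexer::from(Rc::from(src))
            .into_iter()
            .collect_into(&mut res);
        assert_eq!(res.len(), 1);
        assert_eq!(res[0].token_type, LexTokenType::Number);
    }
    // A bare sign is not a number; it goes down the identifier path
    // instead (see the "+" and "-" identifier happy cases above).
    let mut res = vec![];
    Lexer::from(Rc::from("+"))
        .into_iter()
        .collect_into(&mut res);
    assert!(res.iter().all(|t| t.token_type != LexTokenType::Number));
}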


@ -19,7 +19,7 @@ use alloc::string::String;
use alloc::format;
use alloc::fmt::Debug;
use core::{cmp::Ordering, f64, ops::{Add, Div, Mul, Sub}, str::FromStr};
use num::{integer::{gcd}, pow::{self, Pow}};
use num::{integer::{gcd}, pow::Pow};
pub const E_INCOMPREHENSIBLE: &str = "could not comprehend number literal";
pub const E_BASE_PARSE_FAIL: &str = "failed to parse explicit base literal";


@ -62,7 +62,7 @@ pub struct ParseError(pub &'static str, pub Option<Result<LexToken, LexError>>);
impl Display for ParseError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let err_snippet_start = |t: &LexToken| -> usize {
/* backtrack from current index until we either hit
/* backtrack from current index until we either hit
* - beginning of line
* - 25 characters ago
* - the doc Start
@ -187,7 +187,7 @@ fn read_number(token: LexToken) -> Result<Number, ParseError> {
}
fn read_char(token: LexToken) -> Result<u8, ParseError> {
if token.end_idx - token.start_idx < 2 {
if token.end_idx - token.start_idx < 3 {
return Err(ParseError(E_CHAR_TRUNCATED, Some(Ok(token))))
}
@ -202,7 +202,7 @@ fn read_char(token: LexToken) -> Result<u8, ParseError> {
"space" => Ok(32),
"tab" => Ok(11),
_ if token.source_doc[token.start_idx + 2..].starts_with('x') &&
token.end_idx - token.start_idx > 2 => {
token.end_idx - token.start_idx > 3 => {
if token.end_idx - token.start_idx > 5 {
return Err(ParseError(E_CHAR_TOO_LONG, Some(Ok(token))))
}
@ -327,7 +327,10 @@ impl Parser {
loop {
let next_tok = self.lexer.next();
if let None = next_tok {
return Err(ParseError(E_COLLECTION_TRUNC, None))
if let Some(e) = &self.lexer.has_error_state {
return Err(ParseError(E_LEX_ERROR, Some(Err(e.clone()))))
}
return Err(ParseError(E_COLLECTION_TRUNC, Some(Ok(token))))
}
let tok = next_tok.unwrap();
@ -476,7 +479,7 @@ impl Parser {
}
// Lexer error
} else if self.lexer.has_error_state.is_some() {
} else if let Some(e) = &self.lexer.has_error_state && e.0 != E_END_OF_DOCUMENT {
Err(ParseError(E_LEX_ERROR,
Some(Err(self.lexer.has_error_state.clone().unwrap()))))
@ -511,7 +514,10 @@ mod tests {
("(hello \"world\")", "(hello \"world\")"),
("; big doc string\n(one two)", "(one two)"),
("(list #(vect 2 3 #u8(0 0)))", "(list #(vect 2 3 #u8(0 0)))"),
("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)")
("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)"),
("(- q 1)", "(- q 1)"),
("(+ q 1)", "(+ q 1)"),
("(#\\x)", "(#\\x)"),
];
let sad_cases = vec![