From e4c6e0924afc3d4979a39f4b781f4590da3be62c Mon Sep 17 00:00:00 2001 From: Ava Affine Date: Wed, 21 May 2025 14:48:36 -0700 Subject: [PATCH] Decomposer: fixes from found code This commit includes a new utility, the decomposer, which has primarily been used to test the AST against found scheme code in the wild (internet). Decomposer will time and test the lexing and parsing of any document full of scheme. This commit includes additional test cases and logical fixes for issues found during the testing performed. Signed-off-by: Ava Affine --- Cargo.lock | 242 +++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 +- decomposer/Cargo.toml | 9 ++ decomposer/src/main.rs | 81 ++++++++++++++ mycelium/src/lexer.rs | 103 ++++++++++++------ mycelium/src/number.rs | 2 +- mycelium/src/parser.rs | 18 ++- 7 files changed, 417 insertions(+), 40 deletions(-) create mode 100644 decomposer/Cargo.toml create mode 100644 decomposer/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index e251ead..75350e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,16 +6,132 @@ version = 4 name = "amanita" version = "0.1.0" +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + "once_cell", + "windows-sys", +] + [[package]] name = "autocfg" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "clap" +version = "4.5.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "decomposer" +version = "0.1.0" +dependencies = [ + "clap", + "mycelium", +] + [[package]] name = "enoki" version = "0.1.0" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "mycelium" version = "0.1.0" @@ -95,3 +211,129 @@ checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml index c90a4bb..3d4d48c 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,3 @@ [workspace] resolver = "2" -members = [ "amanita", "enoki","mycelium"] +members = ["mycelium", "decomposer"] diff --git a/decomposer/Cargo.toml b/decomposer/Cargo.toml new file mode 100644 index 0000000..60b0f8e --- /dev/null +++ b/decomposer/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "decomposer" +version = "0.1.0" +edition = "2021" +authors = ["Ava Affine "] + +[dependencies] +clap = { version = "4.5.38", features = [ "derive" ] } +mycelium = { path = "../mycelium" } diff --git a/decomposer/src/main.rs b/decomposer/src/main.rs new file mode 100644 index 0000000..345abb1 --- /dev/null +++ b/decomposer/src/main.rs @@ -0,0 +1,81 @@ +/* Mycelium Scheme + * Copyright (C) 2025 Ava Affine + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#![feature(iter_collect_into)] + +use mycelium::{lexer as l, parser as p}; + +use std::rc::Rc; +use std::path::PathBuf; +use std::fs; +use std::error::Error; +use std::time::Instant; + +use clap::Parser; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// display time to parse scripts + #[arg(short, long)] + time: bool, + + /// output script AST once parsed + #[arg(short, long)] + debug: bool, + + scripts: Vec, +} + +fn main() -> Result<(), Box> { + let args = Args::parse(); + for script in args.scripts { + println!("+ processing {:#?}", script); + let message = fs::read_to_string(script)?; + + let start: Option; + if args.time { + start = Some(Instant::now()); + } else { + start = None; + } + + let mut p = p::Parser::from(l::Lexer::from(Rc::from(message.as_str()))); + let mut i = p.next(); + while let Some(ref r) = i { + if args.debug { + println!(" > res: {}", r); + } + + i = p.next(); + } + + if let Some(i) = start { + println!(" > time: {:#?}", i.elapsed()); + } + + if p.has_error_state.is_some() { + let e = p.has_error_state.unwrap(); + if e.0 != l::E_END_OF_DOCUMENT { + println!(" > error!\n{}", e); + } + } + } + + + Ok(()) +} diff --git a/mycelium/src/lexer.rs b/mycelium/src/lexer.rs index e497f94..431054c 100644 --- a/mycelium/src/lexer.rs +++ b/mycelium/src/lexer.rs @@ -234,7 +234,7 @@ impl Lexer { } #[inline(always)] - fn match_chunk_next(&mut self, chunk: &str) -> Option { + fn match_chunk_next(&mut self, chunk: &str, peek: bool) -> Option { let saved = self.current_index; for i in chunk.chars() { if let None = self.advance_char() { @@ -248,6 +248,7 @@ impl Lexer { } } + if peek { self.current_index = saved; } Some(true) } @@ -290,7 +291,15 @@ impl Lexer { let mut base = 10; let a = self.current_char(); - if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' { + if let Some(true) = self.match_chunk_next("inf.0", false) { + return self.cut_new_token(LexTokenType::Number) + } + + if let Some(true) = self.match_chunk_next("nan.0", false) { + return self.cut_new_token(LexTokenType::Number) + } + + if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' || a == '+' || a == '-' { if let None = self.advance_char() { return Err(LexError(E_NUMBER_TRUNCATED, self.current_token_start, self.document.clone())) @@ -311,14 +320,6 @@ impl Lexer { } } - if let Some(true) = self.match_chunk_next("inf.0") { - return self.cut_new_token(LexTokenType::Number) - } - - if let Some(true) = self.match_chunk_next("nan.0") { - return self.cut_new_token(LexTokenType::Number) - } - let mut hasdot = false; let mut hasslash = false; let mut hase = false; @@ -374,7 +375,7 @@ impl Lexer { '|' if self.advance_char().and_then(|_| if self.current_char() == '#' { return Some(()) - } else { return None }).is_some() => + } else { return None }).is_some() => return self.cut_new_token(LexTokenType::Comment), _ => continue, }; @@ -427,7 +428,7 @@ impl Lexer { 't' | 'f' => return self.cut_new_token(LexTokenType::Boolean), '|' => return self.seek_end_of_block_comment(), '!' => return self.seek_end_of_line_comment(true), - 'u' if self.match_chunk_next("8(").is_some_and(|x| x) => + 'u' if self.match_chunk_next("8(", false).is_some_and(|x| x) => return self.cut_new_token(LexTokenType::ByteVectorStart), '(' => return self.cut_new_token(LexTokenType::VectorStart), '\\' => self.seek_end_of_escape(false, ) @@ -470,16 +471,19 @@ impl Lexer { */ match self.current_char() { // char escapes - 'a' if !in_string => self.match_chunk_next("larm"), - 'b' if !in_string => self.match_chunk_next("ackspace"), - 'd' if !in_string => self.match_chunk_next("elete"), - 'e' if !in_string => self.match_chunk_next("scape"), - 'n' if !in_string => self.match_chunk_next("ewline").or( - self.match_chunk_next("ull") - ), - 'r' if !in_string => self.match_chunk_next("eturn"), - 's' if !in_string => self.match_chunk_next("pace"), - 't' if !in_string => self.match_chunk_next("ab"), + 'a' if !in_string => self.match_chunk_next("larm", false), + 'b' if !in_string => self.match_chunk_next("ackspace", false), + 'd' if !in_string => self.match_chunk_next("elete", false), + 'e' if !in_string => self.match_chunk_next("scape", false), + 'n' if !in_string => self.match_chunk_next("ewline", false) + .or(self.match_chunk_next("ull", false)), + 'r' if !in_string => self.match_chunk_next("eturn", false), + 's' if !in_string => self.match_chunk_next("pace", false), + 't' if !in_string => self.match_chunk_next("ab", false), + // specifically catch a non hex 'x' character escape + 'x' if self.peek_next_char() + .is_none_or(|c| TOK_DELIMITERS.contains(&c)) && !in_string + => None, // string escapes 'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None, @@ -538,7 +542,14 @@ impl Lexer { self.current_token_start = self.current_index; - // handle syntactic sugar cases + macro_rules! numeric { + ( $x:expr ) => { + $x.is_numeric() || self.match_chunk_next("inf.0", true) + .or(self.match_chunk_next("nan.0", true)) + .or(Some(false)) + .unwrap() + }; + } match self.current_char() { ';' => output = Some(self.seek_end_of_line_comment(false)), '\'' => output = Some(self.cut_new_token(LexTokenType::Quote)), @@ -547,12 +558,11 @@ impl Lexer { ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)), '#' => output = Some(self.seek_end_from_hash()), '"' => output = Some(self.seek_end_of_string()), - /* This code commented out. I dont think you can open a char without '#' - * '\\' => output = Some(self.seek_end_of_escape(false) - .and_then(|_| - self.cut_new_token(LexTokenType::Char))),*/ '|' => output = Some(self.seek_closing_pipe()), - '+' | '-' => output = Some(self.seek_end_of_number()), + '+' | '-' if self.peek_next_char() + .and_then(|x| Some(numeric!(x))) + .or(Some(false)) + .unwrap() => output = Some(self.seek_end_of_number()), _ if self.current_char().is_numeric() => output = Some(self.seek_end_of_number()), _ => (), @@ -583,7 +593,10 @@ impl Lexer { if output.is_none() { loop { let c = self.current_char(); - if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' { + if !c.is_alphanumeric() && + !LEX_SPECIAL.contains(&c) && + !TOK_DELIMITERS.contains(&c) { + output = Some(Err(LexError(E_INCOMPREHENSIBLE, self.current_index, self.document.clone()))); break; @@ -635,7 +648,7 @@ mod tests { /* Number Cases */ ( // HAPPY CASES vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF", - "#e1e1", "#i1/4", "+inf.0", "1e1"], + "#e1e1", "#i1/4", "+inf.0", "1e1", "-1"], // SAD CASES vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"] @@ -644,7 +657,7 @@ mod tests { /* Char Cases */ ( // HAPPY CASES vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space", - "#\\alarm", "#\\s", "#\\x20"], + "#\\alarm", "#\\s", "#\\x20", "#\\x", "#\\\\"], // SAD CASES vec!["\\c", "\\x20"] @@ -654,7 +667,8 @@ mod tests { // HAPPY CASES vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q", "list->vector", "|two words|", "|two\nwords|", - "the-word-recursion-has-many-meanings"], + "the-word-recursion-has-many-meanings", "+", "-", + "slatex.*slatex*"], // SAD CASES vec!["|\"\"|", "|(|", "|valid"] @@ -858,4 +872,29 @@ mod tests { assert_eq!(res[2].token_type, LexTokenType::CollectionEnd); assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")"); } + + #[test] + fn num_lex_plusnum_case() { + let mut res = vec![]; + Lexer::from(Rc::from("+1")) + .into_iter() + .collect_into(&mut res); + assert_eq!(res.len(), 1); + assert_eq!(res[0].token_type, LexTokenType::Number); + } + + #[test] + fn char_lex_xchar_case() { + let mut res = vec![]; + Lexer::from(Rc::from("#\\x)")) + .into_iter() + .collect_into(&mut res); + assert_eq!(res.len(), 2); + + assert_eq!(res[0].token_type, LexTokenType::Char); + assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "#\\x"); + + assert_eq!(res[1].token_type, LexTokenType::CollectionEnd); + assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], ")"); + } } diff --git a/mycelium/src/number.rs b/mycelium/src/number.rs index 5ee65be..845bce0 100644 --- a/mycelium/src/number.rs +++ b/mycelium/src/number.rs @@ -19,7 +19,7 @@ use alloc::string::String; use alloc::format; use alloc::fmt::Debug; use core::{cmp::Ordering, f64, ops::{Add, Div, Mul, Sub}, str::FromStr}; -use num::{integer::{gcd}, pow::{self, Pow}}; +use num::{integer::{gcd}, pow::Pow}; pub const E_INCOMPREHENSIBLE: &str = "could not comprehend number literal"; pub const E_BASE_PARSE_FAIL: &str = "failed to parse explicit base literal"; diff --git a/mycelium/src/parser.rs b/mycelium/src/parser.rs index 4b383a0..3d7959e 100644 --- a/mycelium/src/parser.rs +++ b/mycelium/src/parser.rs @@ -62,7 +62,7 @@ pub struct ParseError(pub &'static str, pub Option>); impl Display for ParseError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let err_snippet_start = |t: &LexToken| -> usize { - /* backtrack from current index until we either hit + /* backtrack from current index until we either hit * - beginning of line * - 25 characters ago * - the doc Start @@ -187,7 +187,7 @@ fn read_number(token: LexToken) -> Result { } fn read_char(token: LexToken) -> Result { - if token.end_idx - token.start_idx < 2 { + if token.end_idx - token.start_idx < 3 { return Err(ParseError(E_CHAR_TRUNCATED, Some(Ok(token)))) } @@ -202,7 +202,7 @@ fn read_char(token: LexToken) -> Result { "space" => Ok(32), "tab" => Ok(11), _ if token.source_doc[token.start_idx + 2..].starts_with('x') && - token.end_idx - token.start_idx > 2 => { + token.end_idx - token.start_idx > 3 => { if token.end_idx - token.start_idx > 5 { return Err(ParseError(E_CHAR_TOO_LONG, Some(Ok(token)))) } @@ -327,7 +327,10 @@ impl Parser { loop { let next_tok = self.lexer.next(); if let None = next_tok { - return Err(ParseError(E_COLLECTION_TRUNC, None)) + if let Some(e) = &self.lexer.has_error_state { + return Err(ParseError(E_LEX_ERROR, Some(Err(e.clone())))) + } + return Err(ParseError(E_COLLECTION_TRUNC, Some(Ok(token)))) } let tok = next_tok.unwrap(); @@ -476,7 +479,7 @@ impl Parser { } // Lexer error - } else if self.lexer.has_error_state.is_some() { + } else if let Some(e) = &self.lexer.has_error_state && e.0 != E_END_OF_DOCUMENT { Err(ParseError(E_LEX_ERROR, Some(Err(self.lexer.has_error_state.clone().unwrap())))) @@ -511,7 +514,10 @@ mod tests { ("(hello \"world\")", "(hello \"world\")"), ("; big doc string\n(one two)", "(one two)"), ("(list #(vect 2 3 #u8(0 0)))", "(list #(vect 2 3 #u8(0 0)))"), - ("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)") + ("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)"), + ("(- q 1)", "(- q 1)"), + ("(+ q 1)", "(+ q 1)"), + ("(#\\x)", "(#\\x)"), ]; let sad_cases = vec![