diff --git a/Cargo.lock b/Cargo.lock
index e251ead..75350e7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6,16 +6,132 @@ version = 4
 name = "amanita"
 version = "0.1.0"
 
+[[package]]
+name = "anstream"
+version = "0.6.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
+dependencies = [
+ "windows-sys",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e"
+dependencies = [
+ "anstyle",
+ "once_cell",
+ "windows-sys",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
 
+[[package]]
+name = "clap"
+version = "4.5.38"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.38"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.5.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
+
+[[package]]
+name = "decomposer"
+version = "0.1.0"
+dependencies = [
+ "clap",
+ "mycelium",
+]
+
 [[package]]
 name = "enoki"
 version = "0.1.0"
 
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
 [[package]]
 name = "mycelium"
 version = "0.1.0"
@@ -95,3 +211,129 @@ checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [ "autocfg", ] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" 
+version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml index c90a4bb..3d4d48c 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,3 @@ [workspace] resolver = "2" -members = [ "amanita", "enoki","mycelium"] +members = ["mycelium", "decomposer"] diff --git a/decomposer/Cargo.toml b/decomposer/Cargo.toml new file mode 100644 index 0000000..60b0f8e --- /dev/null +++ b/decomposer/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "decomposer" +version = "0.1.0" +edition = "2021" +authors = ["Ava Affine "] + +[dependencies] +clap = { version = "4.5.38", features = [ "derive" ] } +mycelium = { path = "../mycelium" } diff --git a/decomposer/src/main.rs b/decomposer/src/main.rs new file mode 100644 index 0000000..345abb1 --- /dev/null +++ b/decomposer/src/main.rs @@ -0,0 +1,81 @@ +/* Mycelium Scheme + * Copyright (C) 2025 Ava Affine + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */
+
+#![feature(iter_collect_into)]
+
+use mycelium::{lexer as l, parser as p};
+
+use std::rc::Rc;
+use std::path::PathBuf;
+use std::fs;
+use std::error::Error;
+use std::time::Instant;
+
+use clap::Parser;
+
+#[derive(Parser, Debug)]
+#[command(version, about, long_about = None)]
+struct Args {
+    /// display time to parse scripts
+    #[arg(short, long)]
+    time: bool,
+
+    /// output script AST once parsed
+    #[arg(short, long)]
+    debug: bool,
+
+    scripts: Vec<PathBuf>,
+}
+
+fn main() -> Result<(), Box<dyn Error>> {
+    let args = Args::parse();
+    for script in args.scripts {
+        println!("+ processing {:#?}", script);
+        let message = fs::read_to_string(script)?;
+
+        let start: Option<Instant>;
+        if args.time {
+            start = Some(Instant::now());
+        } else {
+            start = None;
+        }
+
+        let mut p = p::Parser::from(l::Lexer::from(Rc::from(message.as_str())));
+        let mut i = p.next();
+        while let Some(ref r) = i {
+            if args.debug {
+                println!(" > res: {}", r);
+            }
+
+            i = p.next();
+        }
+
+        if let Some(i) = start {
+            println!(" > time: {:#?}", i.elapsed());
+        }
+
+        if p.has_error_state.is_some() {
+            let e = p.has_error_state.unwrap();
+            if e.0 != l::E_END_OF_DOCUMENT {
+                println!(" > error!\n{}", e);
+            }
+        }
+    }
+
+
+    Ok(())
+}
diff --git a/mycelium/src/lexer.rs b/mycelium/src/lexer.rs
index e497f94..431054c 100644
--- a/mycelium/src/lexer.rs
+++ b/mycelium/src/lexer.rs
@@ -234,7 +234,7 @@ impl Lexer {
     }
 
     #[inline(always)]
-    fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
+    fn match_chunk_next(&mut self, chunk: &str, peek: bool) -> Option<bool> {
         let saved = self.current_index;
         for i in chunk.chars() {
             if let None = self.advance_char() {
@@ -248,6 +248,7 @@
             }
         }
 
+        if peek { self.current_index = saved; }
         Some(true)
     }
 
@@ -290,7 +291,15 @@
         let mut base = 10;
         let a = self.current_char();
 
-        if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
+        if let Some(true) = self.match_chunk_next("inf.0", false) {
+            return self.cut_new_token(LexTokenType::Number)
+        }
+
+        if let Some(true) = self.match_chunk_next("nan.0", false) {
+            return self.cut_new_token(LexTokenType::Number)
+        }
+
+        if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' || a == '+' || a == '-' {
             if let None = self.advance_char() {
                 return Err(LexError(E_NUMBER_TRUNCATED,
                                     self.current_token_start, self.document.clone()))
@@ -311,14 +320,6 @@
             }
         }
 
-        if let Some(true) = self.match_chunk_next("inf.0") {
-            return self.cut_new_token(LexTokenType::Number)
-        }
-
-        if let Some(true) = self.match_chunk_next("nan.0") {
-            return self.cut_new_token(LexTokenType::Number)
-        }
-
         let mut hasdot = false;
         let mut hasslash = false;
         let mut hase = false;
@@ -374,7 +375,7 @@
                 '|' if self.advance_char().and_then(|_|
                     if self.current_char() == '#' {
                         return Some(())
-                    } else { return None }).is_some() => 
+                    } else { return None }).is_some() =>
                     return self.cut_new_token(LexTokenType::Comment),
                 _ => continue,
             };
@@ -427,7 +428,7 @@
             't' | 'f' => return self.cut_new_token(LexTokenType::Boolean),
             '|' => return self.seek_end_of_block_comment(),
             '!' => return self.seek_end_of_line_comment(true),
-            'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
+            'u' if self.match_chunk_next("8(", false).is_some_and(|x| x) =>
                 return self.cut_new_token(LexTokenType::ByteVectorStart),
             '(' => return self.cut_new_token(LexTokenType::VectorStart),
             '\\' => self.seek_end_of_escape(false, )
@@ -470,16 +471,19 @@
          */
         match self.current_char() {
             // char escapes
-            'a' if !in_string => self.match_chunk_next("larm"),
-            'b' if !in_string => self.match_chunk_next("ackspace"),
-            'd' if !in_string => self.match_chunk_next("elete"),
-            'e' if !in_string => self.match_chunk_next("scape"),
-            'n' if !in_string => self.match_chunk_next("ewline").or(
-                self.match_chunk_next("ull")
-            ),
-            'r' if !in_string => self.match_chunk_next("eturn"),
-            's' if !in_string => self.match_chunk_next("pace"),
-            't' if !in_string => self.match_chunk_next("ab"),
+            'a' if !in_string => self.match_chunk_next("larm", false),
+            'b' if !in_string => self.match_chunk_next("ackspace", false),
+            'd' if !in_string => self.match_chunk_next("elete", false),
+            'e' if !in_string => self.match_chunk_next("scape", false),
+            'n' if !in_string => self.match_chunk_next("ewline", false)
+                .or(self.match_chunk_next("ull", false)),
+            'r' if !in_string => self.match_chunk_next("eturn", false),
+            's' if !in_string => self.match_chunk_next("pace", false),
+            't' if !in_string => self.match_chunk_next("ab", false),
+            // specifically catch a non hex 'x' character escape
+            'x' if self.peek_next_char()
+                .is_none_or(|c| TOK_DELIMITERS.contains(&c)) && !in_string
+                => None,
 
             // string escapes
             'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
@@ -538,7 +542,14 @@
 
         self.current_token_start = self.current_index;
 
-        // handle syntactic sugar cases
+        macro_rules! numeric {
+            ( $x:expr ) => {
+                $x.is_numeric() || self.match_chunk_next("inf.0", true)
+                    .or(self.match_chunk_next("nan.0", true))
+                    .or(Some(false))
+                    .unwrap()
+            };
+        }
         match self.current_char() {
             ';' => output = Some(self.seek_end_of_line_comment(false)),
             '\'' => output = Some(self.cut_new_token(LexTokenType::Quote)),
@@ -547,12 +558,11 @@
             ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
             '#' => output = Some(self.seek_end_from_hash()),
             '"' => output = Some(self.seek_end_of_string()),
-            /* This code commented out. I dont think you can open a char without '#'
-             * '\\' => output = Some(self.seek_end_of_escape(false)
-                 .and_then(|_|
-                     self.cut_new_token(LexTokenType::Char))),*/
             '|' => output = Some(self.seek_closing_pipe()),
-            '+' | '-' => output = Some(self.seek_end_of_number()),
+            '+' | '-' if self.peek_next_char()
+                .and_then(|x| Some(numeric!(x)))
+                .or(Some(false))
+                .unwrap() => output = Some(self.seek_end_of_number()),
             _ if self.current_char().is_numeric() =>
                 output = Some(self.seek_end_of_number()),
             _ => (),
@@ -583,7 +593,10 @@
         if output.is_none() {
             loop {
                 let c = self.current_char();
-                if !c.is_alphanumeric() && !LEX_SPECIAL.contains(&c) && c != ' ' {
+                if !c.is_alphanumeric() &&
+                   !LEX_SPECIAL.contains(&c) &&
+                   !TOK_DELIMITERS.contains(&c) {
+
                     output = Some(Err(LexError(E_INCOMPREHENSIBLE,
                                                self.current_index, self.document.clone())));
                     break;
@@ -635,7 +648,7 @@ mod tests {
        /* Number Cases */ (
            // HAPPY CASES
            vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
-                "#e1e1", "#i1/4", "+inf.0", "1e1"],
+                "#e1e1", "#i1/4", "+inf.0", "1e1", "-1"],
 
            // SAD CASES
            vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
@@ -644,7 +657,7 @@
        /* Char Cases */ (
            // HAPPY CASES
            vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space",
-                "#\\alarm", "#\\s", "#\\x20"],
+                "#\\alarm", "#\\s", "#\\x20", "#\\x", "#\\\\"],
 
            // SAD CASES
            vec!["\\c", "\\x20"]
@@ -654,7 +667,8 @@
        ( // HAPPY CASES
            vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
                 "list->vector", "|two words|", "|two\nwords|",
-                "the-word-recursion-has-many-meanings"],
+                "the-word-recursion-has-many-meanings", "+", "-",
+                "slatex.*slatex*"],
 
            // SAD CASES
            vec!["|\"\"|", "|(|", "|valid"]
@@ -858,4 +872,29 @@
         assert_eq!(res[2].token_type, LexTokenType::CollectionEnd);
         assert_eq!(&res[2].source_doc[res[2].start_idx..res[2].end_idx], ")");
     }
+
+    #[test]
+    fn num_lex_plusnum_case() {
+        let mut res = vec![];
+        Lexer::from(Rc::from("+1"))
+            .into_iter()
+            .collect_into(&mut res);
+        assert_eq!(res.len(), 1);
+        assert_eq!(res[0].token_type, LexTokenType::Number);
+    }
+
+    #[test]
+    fn char_lex_xchar_case() {
+        let mut res = vec![];
+        Lexer::from(Rc::from("#\\x)"))
+            .into_iter()
+            .collect_into(&mut res);
+        assert_eq!(res.len(), 2);
+
+        assert_eq!(res[0].token_type, LexTokenType::Char);
+        assert_eq!(&res[0].source_doc[res[0].start_idx..res[0].end_idx], "#\\x");
+
+        assert_eq!(res[1].token_type, LexTokenType::CollectionEnd);
+        assert_eq!(&res[1].source_doc[res[1].start_idx..res[1].end_idx], ")");
+    }
 }
diff --git a/mycelium/src/number.rs b/mycelium/src/number.rs
index 5ee65be..845bce0 100644
--- a/mycelium/src/number.rs
+++ b/mycelium/src/number.rs
@@ -19,7 +19,7 @@ use alloc::string::String;
 use alloc::format;
 use alloc::fmt::Debug;
 use core::{cmp::Ordering, f64, ops::{Add, Div, Mul, Sub}, str::FromStr};
-use num::{integer::{gcd}, pow::{self, Pow}};
+use num::{integer::{gcd}, pow::Pow};
 
 pub const E_INCOMPREHENSIBLE: &str = "could not comprehend number literal";
 pub const E_BASE_PARSE_FAIL: &str = "failed to parse explicit base literal";
diff --git a/mycelium/src/parser.rs b/mycelium/src/parser.rs
index 4b383a0..3d7959e 100644
--- a/mycelium/src/parser.rs
+++ b/mycelium/src/parser.rs
@@ -62,7 +62,7 @@ pub struct ParseError(pub &'static str, pub Option<Result<LexToken, LexError>>);
 impl Display for ParseError {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         let err_snippet_start = |t: &LexToken| -> usize {
-            /* backtrack from current index until we either hit 
+            /* backtrack from current index until we either hit
              * - beginning of line
              * - 25 characters ago
              * - the doc Start
@@ -187,7 +187,7 @@ fn read_number(token: LexToken) -> Result {
 }
 
 fn read_char(token: LexToken) -> Result {
-    if token.end_idx - token.start_idx < 2 {
+    if token.end_idx - token.start_idx < 3 {
         return Err(ParseError(E_CHAR_TRUNCATED, Some(Ok(token))))
     }
 
@@ -202,7 +202,7 @@
         "space" => Ok(32),
         "tab" => Ok(11),
         _ if token.source_doc[token.start_idx + 2..].starts_with('x') &&
-            token.end_idx - token.start_idx > 2 => {
+            token.end_idx - token.start_idx > 3 => {
             if token.end_idx - token.start_idx > 5 {
                 return Err(ParseError(E_CHAR_TOO_LONG, Some(Ok(token))))
             }
@@ -327,7 +327,10 @@
         loop {
             let next_tok = self.lexer.next();
             if let None = next_tok {
-                return Err(ParseError(E_COLLECTION_TRUNC, None))
+                if let Some(e) = &self.lexer.has_error_state {
+                    return Err(ParseError(E_LEX_ERROR, Some(Err(e.clone()))))
+                }
+                return Err(ParseError(E_COLLECTION_TRUNC, Some(Ok(token))))
             }
 
             let tok = next_tok.unwrap();
@@ -476,7 +479,7 @@
             }
 
         // Lexer error
-        } else if self.lexer.has_error_state.is_some() {
+        } else if let Some(e) = &self.lexer.has_error_state && e.0 != E_END_OF_DOCUMENT {
             Err(ParseError(E_LEX_ERROR,
                 Some(Err(self.lexer.has_error_state.clone().unwrap()))))
 
@@ -511,7 +514,10 @@ mod tests {
         ("(hello \"world\")", "(hello \"world\")"),
         ("; big doc string\n(one two)", "(one two)"),
         ("(list #(vect 2 3 #u8(0 0)))", "(list #(vect 2 3 #u8(0 0)))"),
-        ("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)")
+        ("(#\\xf0 #\\alarm #\\a #\\z)", "(#\\xf0 #\\x7 #\\a #\\z)"),
+        ("(- q 1)", "(- q 1)"),
+        ("(+ q 1)", "(+ q 1)"),
+        ("(#\\x)", "(#\\x)"),
     ];
 
     let sad_cases = vec![