diff --git a/mycelium/src/lexer.rs b/mycelium/src/lexer.rs
index 3bf4016..6ccdbe3 100644
--- a/mycelium/src/lexer.rs
+++ b/mycelium/src/lexer.rs
@@ -21,11 +21,14 @@ use alloc::rc::Rc;

 pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/', ':', '<', '=', '>', '?', '@', '^', '_', '~', '.'];
 pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r'];
-pub const NUMERICAL_EXTRA: [char; 3] = ['.', 'i', 'e'];
-pub const NUMERICAL_BASE: [char; 3] = ['d', 'o', 'b'];
+pub const NUMERICAL_EXTRA: [char; 4] = ['.', 'i', 'e', '/'];
+pub const NUMERICAL_BASE: [char; 4] = ['d', 'o', 'b', 'x'];
+pub const TOK_DELIMITERS: [char; 5] = [')', ' ', '\t', '\n', '\r'];

 pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote";
-pub const E_TOO_MANY_DECIMALS: &str = "number can only have one of {i e .}";
+pub const E_TOO_MANY_DECIMALS: &str = "number can only have one dot";
+pub const E_TOO_MANY_SLASH: &str = "number can only have one slash";
+pub const E_TOO_MANY_E: &str = "number can only have one e";
 pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren";
 pub const E_UNCLOSED_COMMENT: &str = "block comment has no end";
 pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe";
@@ -33,11 +36,13 @@ pub const E_NO_END_TO_HASH: &str = "expected more input after hash";
 pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated";
 pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
 pub const E_STRING_TRUNCATED: &str = "string literal is truncated";
+pub const E_UNDELIMITED_ESC: &str = "char escape is not delimited";
 pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis";
-pub const E_UNIMPLEMENTED_HEX: &str = "hexadecimal literals not supported";
+pub const E_CHAR_TOO_LONG: &str = "character literal is too long";
 pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base";
 pub const E_UNSUPPORTED_ESC: &str = "unsupported escape";
 pub const E_BAD_DOT: &str = "expected space after dot in dotted notation";
+pub const E_BAD_HEX: &str = "character is not valid hexadecimal notation";
 pub const E_INCOMPREHENSIBLE: &str = "token does not lex";
 pub const E_END_OF_DOCUMENT: &str = "no additional input left in document";

@@ -208,6 +213,10 @@ impl Lexer {
     #[inline(always)]
     fn advance_char(&mut self) -> Option<()> {
         self.current_index += 1;
+        if self.current_index >= self.document.len() {
+            return None
+        }
+
         if let Some((idx, _)) = self.document[self.current_index..]
             .char_indices()
             .next() {
@@ -223,9 +232,15 @@ impl Lexer {

     #[inline(always)]
     fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
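+        // remember where the match started so a failed or truncated match can rewind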
+        let saved = self.current_index;
         for i in chunk.chars() {
-            self.advance_char()?;
+            if let None = self.advance_char() {
+                self.current_index = saved;
+                return None
+            }
+
             if i != self.current_char() {
+                self.current_index = saved;
                 return Some(false)
             }
         }
@@ -233,26 +248,6 @@

         Some(true)
     }

-    /* TODO
-     * I figured this function would be useful for supporting hexadec encoding
-     * later down the line. We can use this instead of the base check in the
-     * number function.
-    #[inline(always)]
-    fn next_chars_allowed(&mut self, len: usize, allowed: &str) -> Option<bool> {
-        let mut i = len;
-        while i < 0 {
-            if !allowed.contains(self.current_char()) {
-                return Some(false)
-            }
-
-            i -= 1;
-            self.advance_char()?;
-        }
-
-        Some(true)
-    }
-    */
-
     #[inline(always)]
     fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> {
         let next_idx = self.advance_char()
@@ -273,11 +268,14 @@ impl Lexer {

     #[inline(always)]
     fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> {
-        // TODO: support escaped quotes
         loop {
             if let None = self.advance_char() {
                 return Err(LexError(E_NO_MATCHING_QUOTE,
                     self.current_token_start, self.document.clone()))
+
+            } else if self.current_char() == '\\' {
+                self.seek_end_of_escape(true)?;
+
             } else if self.current_char() == '"' {
                 return self.cut_new_token(LexTokenType::String)
             }
@@ -288,39 +286,68 @@
     fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> {
         let mut base = 10;
         let a = self.current_char();
-        if NUMERICAL_BASE.contains(&a) {
+
+        if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
             if let None = self.advance_char() {
                 return Err(LexError(E_NUMBER_TRUNCATED,
                     self.current_token_start, self.document.clone()))
+
+            // with if-let chaining this check could be folded into the condition above
+            } else if TOK_DELIMITERS.contains(&self.current_char()) {
+                return Err(LexError(E_NUMBER_TRUNCATED,
+                    self.current_token_start, self.document.clone()))
             }

             match a {
+                'x' => base = 16,
                 'd' => base = 10,
                 'o' => base = 8,
                 'b' => base = 2,
                 // ignore i and e, the number parser will handle those
                 _ => (),
             }
         }

-        let mut hasdot = false;
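+        // a sign or base/exactness prefix may be followed by the named literals
+        // inf.0 / nan.0 (e.g. +inf.0), which complete the number immediately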
+        if let Some(true) = self.match_chunk_next("inf.0") {
+            return self.cut_new_token(LexTokenType::Number)
+        }
+
+        if let Some(true) = self.match_chunk_next("nan.0") {
+            return self.cut_new_token(LexTokenType::Number)
+        }
+
+        let mut hasdot = false;
+        let mut hasslash = false;
+        let mut hase = false;
         loop {
             let a = self.current_char();
-            if NUMERICAL_EXTRA.contains(&a) {
+            if a == '.' {
                 if hasdot || base < 10 {
-                    return Err(LexError(E_TOO_MANY_DECIMALS,
+                    return Err(LexError(E_TOO_MANY_DECIMALS,
                         self.current_token_start, self.document.clone()))
                 }
                 hasdot = true;

-            } else if a == ' ' || a == ')' {
+            } else if a == '/' {
+                if hasslash || base < 10 {
+                    return Err(LexError(E_TOO_MANY_SLASH,
+                        self.current_token_start, self.document.clone()))
+                }
+                hasslash = true;
+
+            } else if a == 'e' {
+                if hase || base < 10 {
+                    return Err(LexError(E_TOO_MANY_E,
+                        self.current_token_start, self.document.clone()))
+                }
+                hase = true;
+
+            } else if TOK_DELIMITERS.contains(&a) {
                 // back up one
                 self.current_index -= 1;
                 return self.cut_new_token(LexTokenType::Number)

-            } else if !a.is_numeric() {
-                return Err(LexError(E_INCOMPREHENSIBLE,
-                    self.current_token_start, self.document.clone()))
-
-            } else if a.to_digit(10).unwrap() >= base {
+            } else if let None = a.to_digit(base) {
                 return Err(LexError(E_NUMER_BASE_ERR,
                     self.current_token_start, self.document.clone()))
             }
@@ -400,11 +427,10 @@
             'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
                 return self.cut_new_token(LexTokenType::ByteVectorStart),
             '(' => return self.cut_new_token(LexTokenType::VectorStart),
-            '\\' => self.seek_end_of_escape(false)
+            '\\' => self.seek_end_of_escape(false)
                 .and_then(|_| self.cut_new_token(LexTokenType::Char)),
-            'x' => return Err(LexError(E_UNIMPLEMENTED_HEX,
-                self.current_index, self.document.clone())),
             _ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(),
+            'i' | 'e' => return self.seek_end_of_number(),
             _ => return Err(LexError(E_INCOMPREHENSIBLE,
                 self.current_token_start, self.document.clone())),
         }
@@ -417,25 +443,73 @@
     // only the caller knows what actually needs to be returned
     #[inline(always)]
     fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> {
-        //let delim = if in_string { ';' } else { ' ' };
-        // Delim and the arg to this function will be useful once we support hexadecimal encoding
-        if let None = self.advance_char() {
-            let mut error_msg = E_CHAR_TRUNCATED;
-            if in_string { error_msg = E_STRING_TRUNCATED; }
-            return Err(LexError(error_msg, self.current_token_start, self.document.clone()))
+        // small helper to deduplicate the advance-or-report-truncation logic
+        macro_rules! adv {
+            () => {
+                if let None = self.advance_char() {
+                    let mut error_msg = E_CHAR_TRUNCATED;
+                    if in_string { error_msg = E_STRING_TRUNCATED; }
+                    Err(LexError(error_msg, self.current_token_start,
+                        self.document.clone()))
+                } else { Ok(()) }
+            };
         }
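+        // a standalone char escape must end at a delimiter; inside a string any
+        // character may follow the escape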
+        let delim = |x| -> bool {
+            in_string || TOK_DELIMITERS.contains(&x)
+        };
+
+        // step past the backslash onto the escape character
+        adv!()?;
+
+        /* if match_chunk_next fails, the index is left unmoved,
+         * allowing us to treat this as a single-char escape
+         */
         match self.current_char() {
-            // eat an escaped whitespace or delim
-            ' ' | 'n' | 'r' | 't' | '|' | '\\' | '"' => { () },
-            'x' => return Err(LexError(E_UNIMPLEMENTED_HEX,
-                self.current_token_start, self.document.clone())),
-            _ if self.current_char().is_alphabetic() => { () },
-            _ => return Err(LexError(E_UNSUPPORTED_ESC,
-                self.current_index, self.document.clone())),
+            // char escapes
+            'a' if !in_string => self.match_chunk_next("larm"),
+            'b' if !in_string => self.match_chunk_next("ackspace"),
+            'd' if !in_string => self.match_chunk_next("elete"),
+            'e' if !in_string => self.match_chunk_next("scape"),
+            'n' if !in_string => self.match_chunk_next("ewline").or(
+                self.match_chunk_next("ull")
+            ),
+            'r' if !in_string => self.match_chunk_next("eturn"),
+            's' if !in_string => self.match_chunk_next("pace"),
+            't' if !in_string => self.match_chunk_next("ab"),
+
+            // string escapes
+            'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
+
+            // both
+            'x' => {
+                // we look for exactly TWO hex digits
+                adv!()?;
+                self.current_char().to_digit(16)
+                    .ok_or(LexError(E_BAD_HEX, self.current_index,
+                        self.document.clone()))?;
+                adv!()?;
+                self.current_char().to_digit(16)
+                    .ok_or(LexError(E_BAD_HEX, self.current_index,
+                        self.document.clone()))?;
+                None
+            },

+            // catchalls
+            _ if !in_string => None,
+            _ => return Err(LexError(E_UNSUPPORTED_ESC, self.current_index,
+                self.document.clone())),
+        };
+
+        let saved_idx = self.current_index;
+        if saved_idx == self.document.len() - 1 {
+            return Ok(())
         }
-        return Ok(())
+        adv!().and_then(|_| if !delim(self.current_char()) {
+            return Err(LexError(E_UNDELIMITED_ESC, self.current_index,
+                self.document.clone()))
+        } else { if in_string { self.current_index = saved_idx }; Ok(()) })
     }

     /* Called to output a token by the iterator implementation
@@ -469,10 +543,12 @@
             ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
             '#' => output = Some(self.seek_end_from_hash()),
             '"' => output = Some(self.seek_end_of_string()),
-            '\\' => output = Some(self.seek_end_of_escape(false)
+            /* This code is commented out; I don't think you can open a char without '#'
+             * '\\' => output = Some(self.seek_end_of_escape(false)
                 .and_then(|_|
-                    self.cut_new_token(LexTokenType::Char))),
+                    self.cut_new_token(LexTokenType::Char))),*/
             '|' => output = Some(self.seek_closing_pipe()),
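+            // a leading sign can also start a number (e.g. +inf.0)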
+            '+' | '-' => output = Some(self.seek_end_of_number()),
             _ if self.current_char().is_numeric() => output =
                 Some(self.seek_end_of_number()),
             _ => (),
@@ -545,7 +621,7 @@ mod tests {
             // HAPPY CASES
             vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"",
                 "\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"",
-                "\"\""],
+                "\"\"", "\"\\\" \\\"\""],
             // SAD CASES
             vec!["\"sdf"]
         ),
@@ -553,24 +629,25 @@
         /* Number Cases */
         (
             // HAPPY CASES
-            vec!["1", "1.0", "#d1.1", "#o1423", "#b11"],
+            vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
+                "#e1e1", "#i1/4", "+inf.0", "1e1"],
             // SAD CASES
-            vec!["1.1.1", "#o9", "#b1.01", "#xADADAD"]
+            vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
         ),
         /* Char Cases */
         (
             // HAPPY CASES
-            vec!["\\a", "\\t", "\\\"", "#\\t"],
+            vec!["#\\a", "#\\t", "#\\\"", "#\\space", "#\\x20"],
             // SAD CASES
-            vec!["\\x20"]
+            vec!["\\c", "\\x20"]
         ),
         /* Identifier Cases */
        (
             // HAPPY CASES
-            vec!["...", "+", "+soup+", "<=?", "V17a", "->string", "a34kTMNs",
-                "lambda", "q", "list->vector", "|two words|", "|two\nwords|",
+            vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
+                "list->vector", "|two words|", "|two\nwords|",
                 "the-word-recursion-has-many-meanings"],
             // SAD CASES