Lexer supports hexadecimal escape sequences and hexadecimal number literals

This commit updates the Lexer to fully support character and string
escaping, as well as hexadecimal notation in number literals. Updates to
lexing were performed according to the R7RS small specification. Test
cases were extended to cover the new support, and to mirror additional
number cases from the parsing logic of the number package.

Signed-off-by: Ava Affine <ava@sunnypup.io>
This commit is contained in:
Ava Apples Affine 2025-05-15 15:55:05 -07:00
parent 41216d3526
commit 3174494001

View file

@ -21,11 +21,14 @@ use alloc::rc::Rc;
pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/', pub const LEX_SPECIAL: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '/',
':', '<', '=', '>', '?', '@', '^', '_', '~', '.']; ':', '<', '=', '>', '?', '@', '^', '_', '~', '.'];
pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r']; pub const LEX_WHITESPACE: [char; 4] = [' ', '\n', '\t', '\r'];
pub const NUMERICAL_EXTRA: [char; 3] = ['.', 'i', 'e']; pub const NUMERICAL_EXTRA: [char; 4] = ['.', 'i', 'e', '/'];
pub const NUMERICAL_BASE: [char; 3] = ['d', 'o', 'b']; pub const NUMERICAL_BASE: [char; 4] = ['d', 'o', 'b', 'x'];
pub const TOK_DELIMITERS: [char; 5] = [')', ' ', '\t', '\n', '\r'];
pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote"; pub const E_NO_MATCHING_QUOTE: &str = "couldn't find matching quote";
pub const E_TOO_MANY_DECIMALS: &str = "number can only have one of {i e .}"; pub const E_TOO_MANY_DECIMALS: &str = "number can only have one dot";
pub const E_TOO_MANY_SLASH: &str = "number can only have one slash";
pub const E_TOO_MANY_E: &str = "number can only have one e";
pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren"; pub const E_NO_MATCHING_PAREN: &str = "couldn't find matching paren";
pub const E_UNCLOSED_COMMENT: &str = "block comment has no end"; pub const E_UNCLOSED_COMMENT: &str = "block comment has no end";
pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe"; pub const E_NO_CLOSING_PIPE: &str = "expected a closing pipe";
@ -33,11 +36,13 @@ pub const E_NO_END_TO_HASH: &str = "expected more input after hash";
pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated"; pub const E_NUMBER_TRUNCATED: &str = "number literal is truncated";
pub const E_CHAR_TRUNCATED: &str = "character literal is truncated"; pub const E_CHAR_TRUNCATED: &str = "character literal is truncated";
pub const E_STRING_TRUNCATED: &str = "string literal is truncated"; pub const E_STRING_TRUNCATED: &str = "string literal is truncated";
pub const E_UNDELIMITED_ESC: &str = "char escape is not delimited";
pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis"; pub const E_EXTRA_CLOSE: &str = "extra closing parenthesis";
pub const E_UNIMPLEMENTED_HEX: &str = "hexadecimal literals not supported"; pub const E_CHAR_TOO_LONG: &str = "character literal is too long";
pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base"; pub const E_NUMER_BASE_ERR: &str = "digit in number exceeds specified base";
pub const E_UNSUPPORTED_ESC: &str = "unsupported escape"; pub const E_UNSUPPORTED_ESC: &str = "unsupported escape";
pub const E_BAD_DOT: &str = "expected space after dot in dotted notation"; pub const E_BAD_DOT: &str = "expected space after dot in dotted notation";
pub const E_BAD_HEX: &str = "character is not valid hexadecimal notation";
pub const E_INCOMPREHENSIBLE: &str = "token does not lex"; pub const E_INCOMPREHENSIBLE: &str = "token does not lex";
pub const E_END_OF_DOCUMENT: &str = "no additional input left in document"; pub const E_END_OF_DOCUMENT: &str = "no additional input left in document";
@ -208,6 +213,10 @@ impl Lexer {
#[inline(always)] #[inline(always)]
fn advance_char(&mut self) -> Option<()> { fn advance_char(&mut self) -> Option<()> {
self.current_index += 1; self.current_index += 1;
if self.current_index >= self.document.len() {
return None
}
if let Some((idx, _)) = self.document[self.current_index..] if let Some((idx, _)) = self.document[self.current_index..]
.char_indices() .char_indices()
.next() { .next() {
@ -223,9 +232,15 @@ impl Lexer {
#[inline(always)] #[inline(always)]
fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> { fn match_chunk_next(&mut self, chunk: &str) -> Option<bool> {
let saved = self.current_index;
for i in chunk.chars() { for i in chunk.chars() {
self.advance_char()?; if let None = self.advance_char() {
self.current_index = saved;
return None
}
if i != self.current_char() { if i != self.current_char() {
self.current_index = saved;
return Some(false) return Some(false)
} }
} }
@ -233,26 +248,6 @@ impl Lexer {
Some(true) Some(true)
} }
/* TODO
* I figured this function would be useful for supporting hexadec encoding
* later down the line. We can use this instead of the base check in the
* number function.
#[inline(always)]
fn next_chars_allowed(&mut self, len: usize, allowed: &str) -> Option<bool> {
let mut i = len;
while i < 0 {
if !allowed.contains(self.current_char()) {
return Some(false)
}
i -= 1;
self.advance_char()?;
}
Some(true)
}
*/
#[inline(always)] #[inline(always)]
fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> { fn cut_new_token(&mut self, t: LexTokenType) -> Result<LexToken, LexError> {
let next_idx = self.advance_char() let next_idx = self.advance_char()
@ -273,11 +268,14 @@ impl Lexer {
#[inline(always)] #[inline(always)]
fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> { fn seek_end_of_string(&mut self) -> Result<LexToken, LexError> {
// TODO: support escaped quotes
loop { loop {
if let None = self.advance_char() { if let None = self.advance_char() {
return Err(LexError(E_NO_MATCHING_QUOTE, return Err(LexError(E_NO_MATCHING_QUOTE,
self.current_token_start, self.document.clone())) self.current_token_start, self.document.clone()))
} else if self.current_char() == '\\' {
self.seek_end_of_escape(true)?;
} else if self.current_char() == '"' { } else if self.current_char() == '"' {
return self.cut_new_token(LexTokenType::String) return self.cut_new_token(LexTokenType::String)
} }
@ -288,39 +286,68 @@ impl Lexer {
fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> { fn seek_end_of_number(&mut self) -> Result<LexToken, LexError> {
let mut base = 10; let mut base = 10;
let a = self.current_char(); let a = self.current_char();
if NUMERICAL_BASE.contains(&a) {
if NUMERICAL_BASE.contains(&a) || a == 'i' || a == 'e' {
if let None = self.advance_char() { if let None = self.advance_char() {
return Err(LexError(E_NUMBER_TRUNCATED, return Err(LexError(E_NUMBER_TRUNCATED,
self.current_token_start, self.document.clone())) self.current_token_start, self.document.clone()))
// someday rust will get its shit together and if let chaining will be adequate
} else if TOK_DELIMITERS.contains(&a) {
return Err(LexError(E_NUMBER_TRUNCATED,
self.current_token_start, self.document.clone()))
} }
match a { match a {
'x' => base = 16,
'd' => base = 10, 'd' => base = 10,
'o' => base = 8, 'o' => base = 8,
'b' => base = 2, 'b' => base = 2,
// ignore i or e, number parsers will handle that
_ => (), _ => (),
} }
} }
let mut hasdot = false; if let Some(true) = self.match_chunk_next("inf.0") {
return self.cut_new_token(LexTokenType::Number)
}
if let Some(true) = self.match_chunk_next("nan.0") {
return self.cut_new_token(LexTokenType::Number)
}
let mut hasdot = false;
let mut hasslash = false;
let mut hase = false;
loop { loop {
let a = self.current_char(); let a = self.current_char();
if NUMERICAL_EXTRA.contains(&a) { if a == '.' {
if hasdot || base < 10 { if hasdot || base < 10 {
return Err(LexError(E_TOO_MANY_DECIMALS, return Err(LexError(E_TOO_MANY_DECIMALS,
self.current_token_start, self.document.clone())) self.current_token_start, self.document.clone()))
} }
hasdot = true; hasdot = true;
} else if a == ' ' || a == ')' { } else if a == '/' {
if hasslash || base < 10 {
return Err(LexError(E_TOO_MANY_SLASH,
self.current_token_start, self.document.clone()))
}
hasslash = true;
} else if a == 'e' {
if hase || base < 10 {
return Err(LexError(E_TOO_MANY_E,
self.current_token_start, self.document.clone()))
}
hase = true
} else if TOK_DELIMITERS.contains(&a) {
// back up one // back up one
self.current_index -= 1; self.current_index -= 1;
return self.cut_new_token(LexTokenType::Number) return self.cut_new_token(LexTokenType::Number)
} else if !a.is_numeric() { } else if let None = a.to_digit(base) {
return Err(LexError(E_INCOMPREHENSIBLE,
self.current_token_start, self.document.clone()))
} else if a.to_digit(10).unwrap() >= base {
return Err(LexError(E_NUMER_BASE_ERR, return Err(LexError(E_NUMER_BASE_ERR,
self.current_token_start, self.document.clone())) self.current_token_start, self.document.clone()))
} }
@ -400,11 +427,10 @@ impl Lexer {
'u' if self.match_chunk_next("8(").is_some_and(|x| x) => 'u' if self.match_chunk_next("8(").is_some_and(|x| x) =>
return self.cut_new_token(LexTokenType::ByteVectorStart), return self.cut_new_token(LexTokenType::ByteVectorStart),
'(' => return self.cut_new_token(LexTokenType::VectorStart), '(' => return self.cut_new_token(LexTokenType::VectorStart),
'\\' => self.seek_end_of_escape(false) '\\' => self.seek_end_of_escape(false, )
.and_then(|_| self.cut_new_token(LexTokenType::Char)), .and_then(|_| self.cut_new_token(LexTokenType::Char)),
'x' => return Err(LexError(E_UNIMPLEMENTED_HEX,
self.current_index, self.document.clone())),
_ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(), _ if NUMERICAL_BASE.contains(&ch) => return self.seek_end_of_number(),
'i' | 'e' => return self.seek_end_of_number(),
_ => return Err(LexError(E_INCOMPREHENSIBLE, _ => return Err(LexError(E_INCOMPREHENSIBLE,
self.current_token_start, self.document.clone())), self.current_token_start, self.document.clone())),
} }
@ -417,25 +443,73 @@ impl Lexer {
// only the caller knows what actually needs to be returned // only the caller knows what actually needs to be returned
#[inline(always)] #[inline(always)]
fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> { fn seek_end_of_escape(&mut self, in_string: bool) -> Result<(), LexError> {
//let delim = if in_string { ';' } else { ' ' }; // little helper to deduplicate logic for advancing characters
// Delim and the arg to this function will be useful once we support hexadecimal encoding macro_rules! adv {
if let None = self.advance_char() { () => {
let mut error_msg = E_CHAR_TRUNCATED; if let None = self.advance_char() {
if in_string { error_msg = E_STRING_TRUNCATED; } let mut error_msg = E_CHAR_TRUNCATED;
return Err(LexError(error_msg, self.current_token_start, self.document.clone())) if in_string { error_msg = E_STRING_TRUNCATED; }
Err(LexError(error_msg, self.current_token_start,
self.document.clone()))
} else { Ok(()) }
};
} }
let delim = |x| -> bool {
in_string || TOK_DELIMITERS.contains(&x)
};
// advance char once
adv!()?;
/* if match_chunk_next fails then the index is unmoved
* allowing us to treat this like a single char escape
*/
match self.current_char() { match self.current_char() {
// eat an escaped whitespace or delim // char escapes
' ' | 'n' | 'r' | 't' | '|' | '\\' | '"' => { () }, 'a' if !in_string => self.match_chunk_next("larm"),
'x' => return Err(LexError(E_UNIMPLEMENTED_HEX, 'b' if !in_string => self.match_chunk_next("ackspace"),
self.current_token_start, self.document.clone())), 'd' if !in_string => self.match_chunk_next("elete"),
_ if self.current_char().is_alphabetic() => { () }, 'e' if !in_string => self.match_chunk_next("scape"),
_ => return Err(LexError(E_UNSUPPORTED_ESC, 'n' if !in_string => self.match_chunk_next("ewline").or(
self.current_index, self.document.clone())), self.match_chunk_next("ull")
),
'r' if !in_string => self.match_chunk_next("eturn"),
's' if !in_string => self.match_chunk_next("pace"),
't' if !in_string => self.match_chunk_next("ab"),
// string escapes
'a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' if in_string => None,
// both
'x' => {
// we look for TWO hex digits
adv!()?;
self.current_char().to_digit(16)
.ok_or(LexError(E_BAD_HEX, self.current_index,
self.document.clone()))?;
adv!()?;
self.current_char().to_digit(16)
.ok_or(LexError(E_BAD_HEX, self.current_index,
self.document.clone()))?;
None
},
// catchalls
_ if !in_string => None,
_ => return Err(LexError(E_UNSUPPORTED_ESC, self.current_index,
self.document.clone())),
};
let saved_idx = self.current_index;
if saved_idx == self.document.len() - 1 {
return Ok(())
} }
return Ok(()) adv!().and_then(|_| if !delim(self.current_char()) {
return Err(LexError(E_UNDELIMITED_ESC, self.current_index,
self.document.clone()))
} else { if in_string {self.current_index = saved_idx }; Ok(()) })
} }
/* Called to output a token by the iterator implementation /* Called to output a token by the iterator implementation
@ -469,10 +543,12 @@ impl Lexer {
')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)), ')' => output = Some(self.cut_new_token(LexTokenType::CollectionEnd)),
'#' => output = Some(self.seek_end_from_hash()), '#' => output = Some(self.seek_end_from_hash()),
'"' => output = Some(self.seek_end_of_string()), '"' => output = Some(self.seek_end_of_string()),
'\\' => output = Some(self.seek_end_of_escape(false) /* This code commented out. I dont think you can open a char without '#'
* '\\' => output = Some(self.seek_end_of_escape(false)
.and_then(|_| .and_then(|_|
self.cut_new_token(LexTokenType::Char))), self.cut_new_token(LexTokenType::Char))),*/
'|' => output = Some(self.seek_closing_pipe()), '|' => output = Some(self.seek_closing_pipe()),
'+' | '-' => output = Some(self.seek_end_of_number()),
_ if self.current_char().is_numeric() => output = _ if self.current_char().is_numeric() => output =
Some(self.seek_end_of_number()), Some(self.seek_end_of_number()),
_ => (), _ => (),
@ -545,7 +621,7 @@ mod tests {
// HAPPY CASES // HAPPY CASES
vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"", vec!["\"asdf\"", "\"as sdf\"", "\"asdflkj\\n\"",
"\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"", "\"LKsldkf;l\"", "\" sdlkfj \"", "\"#;sdf\"",
"\"\""], "\"\"", "\"\\\" \\\"\""],
// SAD CASES // SAD CASES
vec!["\"sdf"] vec!["\"sdf"]
@ -553,24 +629,25 @@ mod tests {
/* Number Cases */ ( /* Number Cases */ (
// HAPPY CASES // HAPPY CASES
vec!["1", "1.0", "#d1.1", "#o1423", "#b11"], vec!["1", "1.0", "#d1.1", "#o1423", "#b11", "#xDF",
"#e1e1", "#i1/4", "+inf.0", "1e1"],
// SAD CASES // SAD CASES
vec!["1.1.1", "#o9", "#b1.01", "#xADADAD"] vec!["1.1.1", "#o9", "#b1.01", "#i1/3/3"]
), ),
/* Char Cases */ ( /* Char Cases */ (
// HAPPY CASES // HAPPY CASES
vec!["\\a", "\\t", "\\\"", "#\\t"], vec!["#\\a", "#\\t", "#\\\"", "#\\t", "#\\space", "#\\x20"],
// SAD CASES // SAD CASES
vec!["\\x20"] vec!["\\c", "\\x20"]
), ),
/* Identifier Cases */ ( /* Identifier Cases */ (
// HAPPY CASES // HAPPY CASES
vec!["...", "+", "+soup+", "<=?", "V17a", "->string", "a34kTMNs", vec!["...", "<=?", "V17a", "a34kTMNs", "lambda", "q",
"lambda", "q", "list->vector", "|two words|", "|two\nwords|", "list->vector", "|two words|", "|two\nwords|",
"the-word-recursion-has-many-meanings"], "the-word-recursion-has-many-meanings"],
// SAD CASES // SAD CASES