1
0
Fork 0
forked from wry/wry
wry/build/tokens.rs
2024-09-06 11:08:22 +02:00

247 lines
6.5 KiB
Rust

use {
anyhow::{bail, Context, Result},
bstr::{BString, ByteSlice},
};
/// The kinds of brackets that can enclose a nested token tree.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum TreeDelim {
    /// Round brackets: `(` ... `)`.
    Paren,
    /// Curly brackets: `{` ... `}`.
    Brace,
}

impl TreeDelim {
    /// Returns the `[opening, closing]` ASCII bytes for this delimiter kind.
    fn pair(self) -> [u8; 2] {
        match self {
            Self::Paren => [b'(', b')'],
            Self::Brace => [b'{', b'}'],
        }
    }

    /// Returns the ASCII byte that opens a tree of this kind.
    pub fn opening(self) -> u8 {
        let [open, _] = self.pair();
        open
    }

    /// Returns the ASCII byte that closes a tree of this kind.
    pub fn closing(self) -> u8 {
        let [_, close] = self.pair();
        close
    }
}
/// Single-byte punctuation tokens recognized by the tokenizer.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum Symbol {
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `=`
    Equals,
    /// `@`
    At,
}

impl Symbol {
    /// Returns the symbol's spelling wrapped in single quotes, e.g. `','`.
    pub fn name(self) -> &'static str {
        // Arms are listed in declaration order; the match stays exhaustive so a new
        // variant cannot be added without giving it a name.
        match self {
            Self::Comma => "','",
            Self::Colon => "':'",
            Self::Semicolon => "';'",
            Self::Equals => "'='",
            Self::At => "'@'",
        }
    }
}
/// A single token together with the line it starts on.
#[derive(Debug, Eq, PartialEq)]
pub struct Token<'a> {
/// Line number on which the token begins; the tokenizer starts counting at 1.
pub line: u32,
/// What kind of token this is, including any payload.
pub kind: TokenKind<'a>,
}
/// The payload of a [`Token`].
#[derive(Debug, Eq, PartialEq)]
pub enum TokenKind<'a> {
    /// An identifier, borrowed from the input.
    Ident(&'a str),
    /// An unsigned decimal number.
    Num(u32),
    /// A bracketed group of tokens.
    Tree {
        /// The kind of bracket enclosing the group.
        delim: TreeDelim,
        /// The tokens between the opening and closing bracket.
        body: Vec<Token<'a>>,
    },
    /// A punctuation symbol.
    Symbol(Symbol),
    /// A string literal with its escape sequences already resolved.
    String(String),
}

impl TokenKind<'_> {
    /// Returns a short human-readable description of the token kind.
    ///
    /// Symbols report their quoted spelling; trees report which bracket opens them.
    pub fn name(&self) -> &str {
        match self {
            Self::Ident(_) => "identifier",
            Self::Num(_) => "number",
            Self::Tree {
                delim: TreeDelim::Paren,
                ..
            } => "'('-tree",
            Self::Tree {
                delim: TreeDelim::Brace,
                ..
            } => "'{'-tree",
            Self::Symbol(sym) => sym.name(),
            Self::String(_) => "string",
        }
    }
}
/// A read position within the input bytes.
///
/// The type is `Copy`, so a snapshot of the position can be taken by plain assignment.
#[derive(Copy, Clone)]
struct Cursor<'a> {
    /// Index of the next byte to read.
    pos: usize,
    /// The complete input.
    s: &'a [u8],
}

impl Cursor<'_> {
    /// Returns `true` once there is no byte left at the current position.
    fn eof(&self) -> bool {
        self.s.get(self.pos).is_none()
    }
}
/// Tokenizes the whole input `s` and returns its top-level tokens.
///
/// Bracketed regions come back as nested `TokenKind::Tree` tokens. Line numbers
/// attached to tokens start at 1.
///
/// # Errors
///
/// Returns an error if the tokenizer rejects the input.
pub fn tokenize<'a>(s: &'a [u8]) -> Result<Vec<Token<'a>>> {
    let cursor = Cursor { pos: 0, s };
    // The top level has no enclosing delimiter.
    let mut root = Tokenizer {
        line: 1,
        cursor,
        delim: None,
        res: Vec::new(),
    };
    root.tokenize().map(|()| root.res)
}
/// Tokenizer state for one nesting level of the input.
///
/// The top level runs with `delim: None`. Every `(` or `{` is handled by a child
/// tokenizer (see [`Tokenizer::tokenize_tree`]) that starts from a copy of the parent's
/// cursor and hands its final position and line number back when it is done.
struct Tokenizer<'a> {
    /// Current line number, used for token positions and diagnostics.
    line: u32,
    /// Read position within the input.
    cursor: Cursor<'a>,
    /// Delimiter of the tree being tokenized, or `None` at the top level.
    delim: Option<TreeDelim>,
    /// Tokens produced so far at this nesting level.
    res: Vec<Token<'a>>,
}

impl<'a> Tokenizer<'a> {
    /// Lexes the next token at the current nesting level and appends it to `self.res`.
    ///
    /// Returns `Ok(true)` if a token was produced and `Ok(false)` once this level is
    /// finished: at end of input for the top level, or after consuming the closing
    /// delimiter matching `self.delim` inside a tree.
    ///
    /// # Errors
    ///
    /// Fails on end of input inside a tree, on a closing delimiter that does not match
    /// the current tree, on a number that does not fit in `u32`, on an unknown escape
    /// sequence or unterminated string literal, and on any byte that cannot start a
    /// token.
    fn tokenize_one(&mut self) -> Result<bool> {
        let c = &mut self.cursor;
        // Skip spaces, newlines and `#` comments (which run to the end of the line).
        while !c.eof() {
            let b = c.s[c.pos];
            if matches!(b, b' ' | b'\n' | b'#') {
                c.pos += 1;
                if b == b'\n' {
                    self.line += 1;
                } else if b == b'#' {
                    while !c.eof() {
                        c.pos += 1;
                        if c.s[c.pos - 1] == b'\n' {
                            self.line += 1;
                            break;
                        }
                    }
                }
            } else {
                break;
            }
        }
        if c.eof() {
            // Running out of input is only fine when no tree is open.
            if self.delim.is_some() {
                bail!("Unexpected eof");
            }
            return Ok(false);
        }
        // Remember where the token starts before consuming its first byte.
        let line = self.line;
        let b = c.s[c.pos];
        let b_pos = c.pos;
        c.pos += 1;
        let kind = match b {
            // Identifier: an ASCII letter followed by letters, digits or underscores.
            b'a'..=b'z' | b'A'..=b'Z' => {
                while !c.eof()
                    && matches!(c.s[c.pos], b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'0'..=b'9')
                {
                    c.pos += 1;
                }
                TokenKind::Ident(c.s[b_pos..c.pos].as_bstr().to_str()?)
            }
            // Decimal number.
            b'0'..=b'9' => {
                // Step back so the loop below also handles the first digit.
                c.pos -= 1;
                let mut num: u32 = 0;
                while !c.eof() && matches!(c.s[c.pos], b'0'..=b'9') {
                    let digit = u32::from(c.s[c.pos] - b'0');
                    // Checked arithmetic: an oversized literal is reported as an error
                    // instead of panicking (debug) or silently wrapping (release).
                    num = match num.checked_mul(10).and_then(|n| n.checked_add(digit)) {
                        Some(n) => n,
                        None => bail!("Number out of range in line {}", self.line),
                    };
                    c.pos += 1;
                }
                TokenKind::Num(num)
            }
            b',' => TokenKind::Symbol(Symbol::Comma),
            b'=' => TokenKind::Symbol(Symbol::Equals),
            b'@' => TokenKind::Symbol(Symbol::At),
            b':' => TokenKind::Symbol(Symbol::Colon),
            b';' => TokenKind::Symbol(Symbol::Semicolon),
            b'(' => self.tokenize_tree(TreeDelim::Paren)?,
            b'{' => self.tokenize_tree(TreeDelim::Brace)?,
            close @ (b')' | b'}') => {
                // A closing delimiter ends this level, but only if it matches the
                // delimiter that opened it.
                if self.delim.map(|d| d.closing()) != Some(close) {
                    bail!("Unexpected {:?} in line {}", close as char, self.line);
                }
                return Ok(false);
            }
            // String literal; supports the escapes `\"` and `\\`.
            b'"' => {
                let mut res = vec![];
                let mut escaped = false;
                while !c.eof() {
                    let byte = c.s[c.pos];
                    // The pending-escape check must come first; otherwise the second
                    // backslash of `\\` would start a new escape instead of being
                    // emitted as a literal backslash.
                    if escaped {
                        escaped = false;
                        if matches!(byte, b'"' | b'\\') {
                            res.push(byte);
                        } else {
                            bail!(
                                "Unexpected escape sequence '\\{}' in line {}",
                                byte as char,
                                self.line
                            );
                        }
                    } else if byte == b'\\' {
                        escaped = true;
                    } else if byte == b'"' {
                        break;
                    } else {
                        // Keep line numbers accurate for tokens following a string
                        // that spans several lines.
                        if byte == b'\n' {
                            self.line += 1;
                        }
                        res.push(byte);
                    }
                    c.pos += 1;
                }
                if c.eof() {
                    // Report the line where the string started.
                    bail!("Unterminated string in line {}", line);
                }
                // Consume the closing quote.
                c.pos += 1;
                TokenKind::String(BString::from(res).to_string())
            }
            _ => bail!("Unexpected byte {:?} in line {}", b as char, self.line),
        };
        self.res.push(Token { line, kind });
        Ok(true)
    }

    /// Lexes tokens until this nesting level is finished.
    ///
    /// # Errors
    ///
    /// Propagates the first error reported by [`Tokenizer::tokenize_one`].
    fn tokenize(&mut self) -> Result<()> {
        while self.tokenize_one()? {
            // `tokenize_one` does all the work; keep going until it reports the end.
        }
        Ok(())
    }

    /// Lexes the body of a tree whose opening delimiter has just been consumed.
    ///
    /// A child tokenizer runs from the current position until it has consumed the
    /// matching closing delimiter; this tokenizer then continues from where the child
    /// stopped.
    ///
    /// # Errors
    ///
    /// Any error from the child is wrapped with a note naming the opening delimiter
    /// and the line on which the tree started.
    fn tokenize_tree(&mut self, delim: TreeDelim) -> Result<TokenKind<'a>> {
        let mut child = Tokenizer {
            line: self.line,
            cursor: self.cursor,
            delim: Some(delim),
            res: vec![],
        };
        child.tokenize().with_context(|| {
            format!(
                "While tokenizing {:?} block starting in line {}",
                delim.opening() as char,
                self.line
            )
        })?;
        // Adopt the child's progress.
        self.cursor.pos = child.cursor.pos;
        self.line = child.line;
        Ok(TokenKind::Tree {
            delim,
            body: child.res,
        })
    }
}
/// A value of type `T` paired with a line number.
#[derive(Debug)]
pub struct Lined<T> {
// NOTE(review): presumably the source line `val` came from; confirm at the use sites.
// The `dead_code` lint is expected to fire here, i.e. nothing reads this field yet.
#[expect(dead_code)]
pub line: u32,
/// The wrapped value.
pub val: T,
}