From 3ebef158dfbe69ae33504af149b308bc04334743 Mon Sep 17 00:00:00 2001
From: Adrien Burgun
Date: Tue, 12 Sep 2023 17:39:21 +0200
Subject: [PATCH] :tada: Implement tokenizer

---
 .gitignore       |   1 +
 Cargo.lock       |  54 +++++++++
 Cargo.toml       |   9 ++
 README.md        |   6 +
 src/lib.rs       |   1 +
 src/main.rs      |   5 +
 src/parse/mod.rs | 301 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 377 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 README.md
 create mode 100644 src/lib.rs
 create mode 100644 src/main.rs
 create mode 100644 src/parse/mod.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..aaf75f5
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,54 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "aho-corasick"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "basic-to-mindustry"
+version = "0.1.0"
+dependencies = [
+ "regex",
+]
+
+[[package]]
+name = "memchr"
+version = "2.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
+
+[[package]]
+name = "regex"
+version = "1.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..9a6cbeb
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "basic-to-mindustry"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+regex = "1.9.5"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e44ed44
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+# BASIC to Mindustry logic
+
+This is a small transpiler from the [BASIC](https://en.wikipedia.org/wiki/BASIC) language to [Mindustry](https://github.com/Anuken/Mindustry/)'s [logic system](https://www.reddit.com/r/Mindustry/comments/kfea1e/an_overly_indepth_logic_guide/) (also known as `mlog`).
+BASIC was chosen as the source language because it already has jumps (which Mindustry logic relies on heavily), while still offering some higher-level constructs such as conditions, loops and functions.
+
+This project is still very much a work in progress.
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..ea86848
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1 @@
+pub mod parse;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..ed1ffda
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,5 @@
+
+
+fn main() {
+    println!("Hello, world!");
+}
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
new file mode 100644
index 0000000..9b6d39c
--- /dev/null
+++ b/src/parse/mod.rs
@@ -0,0 +1,301 @@
+use regex::Regex;
+
+/// Operators recognised by the tokenizer.
+#[derive(PartialEq, Eq, Clone, Copy, Debug)]
+pub enum Operator {
+    Add,
+    Sub,
+    Mul,
+    Div,
+    Mod,
+    RShift,
+    LShift,
+    Gt,
+    Lt,
+    Gte,
+    Lte,
+    Eq,
+    Neq,
+    // etc.
+}
+
+/// A single lexical token produced by [`tokenize`].
+#[derive(PartialEq, Clone, Debug)]
+pub enum BasicToken {
+    /// Pushed before the tokens of every non-empty source line.
+    NewLine,
+    Assign,
+    If,
+    Then,
+    Else,
+    EndIf,
+    Goto,
+    OpenParen,
+    CloseParen,
+    Integer(u64),
+    Float(f64),
+    Name(String),
+    /// A string literal, stored without its surrounding quotes.
+    String(String),
+    Operator(Operator),
+}
+
+/// Errors reported by [`tokenize`].
+#[derive(PartialEq, Clone, Debug)]
+pub enum ParseError {
+    /// Text that could not be turned into a token.
+    InvalidToken(String),
+}
+
+/// Splits the BASIC source `raw` into a flat list of tokens.
+///
+/// Every non-empty line contributes a leading [`BasicToken::NewLine`].
+/// Whitespace, the `LET` keyword and `REM` comments produce no token.
+///
+/// # Errors
+///
+/// Returns [`ParseError::InvalidToken`] holding the unread rest of the line
+/// when no matcher applies, or holding the offending text itself when a
+/// keyword or numeric literal cannot be converted into a token.
+pub fn tokenize(raw: &str) -> Result<Vec<BasicToken>, ParseError> {
+    // Tries each `matcher => (value)` pair in order against the start of
+    // `$line`. The first regex that matches has its text removed from `$line`,
+    // pushes its value onto `$res` (`()` pushes nothing) and `continue`s the
+    // enclosing loop. When nothing matches, execution falls out of the macro.
+    macro_rules! match_token {
+        ( $line:expr, $res:expr $(;)? ) => {};
+        (
+            $line:expr, $res:expr;
+            $matcher:ident => (),
+            $(
+                $rest_matcher:ident $(($rest_match_name:ident))? => $rest_value:tt,
+            )*
+        ) => {
+            if let Some(matched) = $matcher.find($line) {
+                $line = &$line[matched.end()..];
+                continue
+            }
+            match_token!(
+                $line, $res;
+                $(
+                    $rest_matcher $(($rest_match_name))? => $rest_value,
+                )*
+            );
+        };
+        (
+            $line:expr, $res:expr;
+            $matcher:ident $(($match_name:ident))? => $value:expr,
+            $(
+                $rest_matcher:ident $(($rest_match_name:ident))? => $rest_value:tt,
+            )*
+        ) => {
+            if let Some(matched) = $matcher.find($line) {
+                $line = &$line[matched.end()..];
+                $(let $match_name = matched.as_str();)?
+                $res.push($value);
+                continue
+            }
+            match_token!(
+                $line, $res;
+                $(
+                    $rest_matcher $(($rest_match_name))? => $rest_value,
+                )*
+            );
+        }
+    }
+
+    let mut res = Vec::new();
+    // `\b` stops a keyword from eating the start of a longer name (`letter`).
+    let match_let = Regex::new(r"(?i)^let\b").unwrap();
+    // No `\b` here, so the compact spelling `GOTO10` still reads as a jump.
+    let match_jump = Regex::new(r"(?i)^go\s*to").unwrap();
+    let match_word = Regex::new(r"(?i)^(?:if|then|else|end\s?if)\b").unwrap();
+    let match_space = Regex::new(r"^\s+").unwrap();
+    let match_variable = Regex::new(r"^@?[a-zA-Z_][a-zA-Z_0-9]*").unwrap();
+    let match_float = Regex::new(r"^[0-9]*\.[0-9]+").unwrap();
+    let match_integer = Regex::new(r"^[0-9]+").unwrap();
+    let match_assign = Regex::new(r"^=").unwrap();
+    // Alternation is leftmost-first: two-character operators must come first.
+    let match_operator = Regex::new(r"^(?:<<|>>|[<>]=?|[!=]=|[+\-*/%])").unwrap();
+    let match_paren = Regex::new(r"^(?:\(|\))").unwrap();
+    // TODO: handle escapes
+    let match_string = Regex::new(r#"^"[^"]*""#).unwrap();
+    // `REM` alone on a line is a comment too.
+    let match_comment = Regex::new(r"(?i)^rem(?:\s.*)?$").unwrap();
+    // TODO: handle labels
+
+    for mut line in raw.lines() {
+        if !line.is_empty() {
+            res.push(BasicToken::NewLine);
+        }
+        while !line.is_empty() {
+            match_token!(line, res;
+                match_space => (),
+                match_let => (),
+                match_comment => (),
+                match_jump => (BasicToken::Goto),
+                match_word(word) => (match word.to_lowercase().as_str() {
+                    "if" => BasicToken::If,
+                    "then" => BasicToken::Then,
+                    "else" => BasicToken::Else,
+                    // `end` + optional whitespace of any kind (space, tab...) + `if`
+                    end_if if end_if.starts_with("end") => BasicToken::EndIf,
+                    // Any other spelling the regex let through is reported, not panicked on
+                    _ => return Err(ParseError::InvalidToken(word.to_string())),
+                }),
+                match_variable(name) => (BasicToken::Name(name.to_string())),
+                match_float(float) => (BasicToken::Float(
+                    float.parse::<f64>().map_err(|_| ParseError::InvalidToken(float.to_string()))?
+                )),
+                match_integer(int) => (BasicToken::Integer(
+                    // Literals above `u64::MAX` are an input error, not a panic
+                    int.parse::<u64>().map_err(|_| ParseError::InvalidToken(int.to_string()))?
+                )),
+                // Must run before `match_assign`, or `==` is read as two `=`
+                match_operator(op) => (BasicToken::Operator(match op {
+                    "+" => Operator::Add,
+                    "-" => Operator::Sub,
+                    "*" => Operator::Mul,
+                    "/" => Operator::Div,
+                    "%" => Operator::Mod,
+                    "<" => Operator::Lt,
+                    "<=" => Operator::Lte,
+                    ">" => Operator::Gt,
+                    ">=" => Operator::Gte,
+                    "<<" => Operator::LShift,
+                    ">>" => Operator::RShift,
+                    "==" => Operator::Eq,
+                    "!=" => Operator::Neq,
+                    _ => unreachable!(),
+                })),
+                match_assign => (BasicToken::Assign),
+                match_paren(paren) => (if paren == "(" {
+                    BasicToken::OpenParen
+                } else {
+                    BasicToken::CloseParen
+                }),
+                match_string(with_quotes) => (BasicToken::String(with_quotes[1..with_quotes.len() - 1].to_string())),
+            );
+            // If this line is reached, then none of the matches above matched
+            return Err(ParseError::InvalidToken(line.to_string()));
+        }
+    }
+
+    Ok(res)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_tokenize_basic() {
+        assert_eq!(
+            tokenize("hello + world").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("hello")),
+                BasicToken::Operator(Operator::Add),
+                BasicToken::Name(String::from("world")),
+            ],
+        );
+
+        assert_eq!(
+            tokenize("let thing = thing / 2").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("thing")),
+                BasicToken::Assign,
+                BasicToken::Name(String::from("thing")),
+                BasicToken::Operator(Operator::Div),
+                BasicToken::Integer(2)
+            ],
+        );
+
+        assert_eq!(
+            tokenize("10 thing = thing + 0.5\ngoto 10").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Integer(10),
+                BasicToken::Name(String::from("thing")),
+                BasicToken::Assign,
+                BasicToken::Name(String::from("thing")),
+                BasicToken::Operator(Operator::Add),
+                BasicToken::Float(0.5),
+                BasicToken::NewLine,
+                BasicToken::Goto,
+                BasicToken::Integer(10),
+            ],
+        );
+
+        assert_eq!(
+            tokenize("x = 0\n\nif x > 0 then\nprint(\"Positive\")\nend if").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("x")),
+                BasicToken::Assign,
+                BasicToken::Integer(0),
+                BasicToken::NewLine,
+                BasicToken::If,
+                BasicToken::Name(String::from("x")),
+                BasicToken::Operator(Operator::Gt),
+                BasicToken::Integer(0),
+                BasicToken::Then,
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("print")),
+                BasicToken::OpenParen,
+                BasicToken::String(String::from("Positive")),
+                BasicToken::CloseParen,
+                BasicToken::NewLine,
+                BasicToken::EndIf,
+            ],
+        );
+    }
+
+    #[test]
+    fn test_tokenize_operators() {
+        assert_eq!(
+            tokenize("a == b != c << 2 >> 1").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("a")),
+                BasicToken::Operator(Operator::Eq),
+                BasicToken::Name(String::from("b")),
+                BasicToken::Operator(Operator::Neq),
+                BasicToken::Name(String::from("c")),
+                BasicToken::Operator(Operator::LShift),
+                BasicToken::Integer(2),
+                BasicToken::Operator(Operator::RShift),
+                BasicToken::Integer(1),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_tokenize_keywords() {
+        assert_eq!(
+            tokenize("letter = 1\nEND\tIF\nREM").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("letter")),
+                BasicToken::Assign,
+                BasicToken::Integer(1),
+                BasicToken::NewLine,
+                BasicToken::EndIf,
+                BasicToken::NewLine,
+            ],
+        );
+    }
+
+    #[test]
+    fn test_tokenize_errors() {
+        assert_eq!(
+            tokenize("? \"text\""),
+            Err(ParseError::InvalidToken(String::from("? \"text\""))),
+        );
+        assert_eq!(
+            tokenize("x = 99999999999999999999"),
+            Err(ParseError::InvalidToken(String::from("99999999999999999999"))),
+        );
+    }
+}