From 3ebef158dfbe69ae33504af149b308bc04334743 Mon Sep 17 00:00:00 2001
From: Adrien Burgun <adrien.burgun@orange.fr>
Date: Tue, 12 Sep 2023 17:39:21 +0200
Subject: [PATCH] :tada: Implement tokenizer

---
 .gitignore       |   1 +
 Cargo.lock       |  54 ++++++++++++
 Cargo.toml       |   9 ++
 README.md        |   6 ++
 src/lib.rs       |   1 +
 src/main.rs      |   5 ++
 src/parse/mod.rs | 220 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 296 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 README.md
 create mode 100644 src/lib.rs
 create mode 100644 src/main.rs
 create mode 100644 src/parse/mod.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..aaf75f5
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,54 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "aho-corasick"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "basic-to-mindustry"
+version = "0.1.0"
+dependencies = [
+ "regex",
+]
+
+[[package]]
+name = "memchr"
+version = "2.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
+
+[[package]]
+name = "regex"
+version = "1.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..9a6cbeb
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "basic-to-mindustry"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+regex = "1.9.5"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e44ed44
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+# BASIC to Mindustry logic
+
+This is a small transpiler from the [BASIC](https://en.wikipedia.org/wiki/BASIC) language to [Mindustry](https://github.com/Anuken/Mindustry/)'s [logic system](https://www.reddit.com/r/Mindustry/comments/kfea1e/an_overly_indepth_logic_guide/) (also known as `mlog`).
+BASIC was chosen as the source language because it already has jumps (which Mindustry's logic relies on heavily), while still allowing for some higher-level constructs such as conditions, loops and functions.
+
+This project is still very much a work in progress.
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..ea86848
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1 @@
+pub mod parse;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..ed1ffda
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,5 @@
+
+/// Placeholder entry point: prints `Hello, world!` and does not call into the `parse` module.
+fn main() {
+    println!("Hello, world!");
+}
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
new file mode 100644
index 0000000..9b6d39c
--- /dev/null
+++ b/src/parse/mod.rs
@@ -0,0 +1,351 @@
+//! Tokenizer for the BASIC dialect understood by this crate.
+//!
+//! [`tokenize`] converts raw source text into a flat list of [`BasicToken`]s;
+//! no syntax tree is built at this stage.
+
+use regex::Regex;
+
+/// Binary operators produced by the tokenizer.
+#[derive(PartialEq, Eq, Clone, Copy, Debug)]
+pub enum Operator {
+    /// `+`
+    Add,
+    /// `-`
+    Sub,
+    /// `*`
+    Mul,
+    /// `/`
+    Div,
+    /// `%`
+    Mod,
+    /// `>>`
+    RShift,
+    /// `<<`
+    LShift,
+    /// `>`
+    Gt,
+    /// `<`
+    Lt,
+    /// `>=`
+    Gte,
+    /// `<=`
+    Lte,
+    /// `==`
+    Eq,
+    /// `!=`
+    Neq,
+    // etc.
+}
+
+/// A single lexical unit of a BASIC program.
+#[derive(PartialEq, Clone, Debug)]
+pub enum BasicToken {
+    /// Emitted at the start of every non-empty source line.
+    NewLine,
+    /// A lone `=`.
+    Assign,
+    /// The `if` keyword.
+    If,
+    /// The `then` keyword.
+    Then,
+    /// The `else` keyword.
+    Else,
+    /// The `end if` (or `endif`) keyword.
+    EndIf,
+    /// The `goto` (or `go to`) keyword.
+    Goto,
+    /// `(`
+    OpenParen,
+    /// `)`
+    CloseParen,
+    /// An unsigned integer literal.
+    Integer(u64),
+    /// A decimal literal containing a `.`, such as `0.5` or `.5`.
+    Float(f64),
+    /// An identifier, optionally prefixed with `@`.
+    Name(String),
+    /// A double-quoted string literal, stored without its quotes.
+    String(String),
+    /// One of the [`Operator`]s.
+    Operator(Operator),
+}
+
+/// Errors reported by [`tokenize`].
+#[derive(PartialEq, Clone, Debug)]
+pub enum ParseError {
+    /// No token could be produced. Holds either the unread remainder of the
+    /// offending line or the text of the token that was rejected.
+    InvalidToken(String),
+}
+
+/// Splits `raw` into [`BasicToken`]s, one source line at a time.
+///
+/// A [`BasicToken::NewLine`] is emitted before the tokens of every non-empty
+/// line. Whitespace, the `let` keyword and `rem` comments produce no token.
+/// Keywords are case-insensitive and must end at a word boundary, so
+/// identifiers such as `letter` or `iffy` are tokenized as names.
+///
+/// # Errors
+///
+/// Returns [`ParseError::InvalidToken`] when no token matches at the current
+/// position, or when an integer literal does not fit in a `u64`.
+pub fn tokenize(raw: &str) -> Result<Vec<BasicToken>, ParseError> {
+    // Tries each `matcher => value` pair in order on `$line`. On the first
+    // match, `$line` is advanced past the end of the match, the value (if any)
+    // is pushed onto `$res` and the enclosing loop is `continue`d. Falling out
+    // of the macro means that none of the matchers matched.
+    macro_rules! match_token {
+        ( $line:expr, $res:expr $(;)? ) => {};
+        // `matcher => ()`: consume the match without producing a token.
+        (
+            $line:expr, $res:expr;
+            $matcher:ident => (),
+            $(
+                $rest_matcher:ident $(($rest_match_name:ident))? => $rest_value:tt,
+            )*
+        ) => {
+            if let Some(matched) = $matcher.find($line) {
+                $line = &$line[matched.end()..];
+                continue
+            }
+            match_token!(
+                $line, $res;
+                $(
+                    $rest_matcher $(($rest_match_name))? => $rest_value,
+                )*
+            );
+        };
+        // `matcher(name) => (value)`: bind the matched text to `name`, push `value`.
+        (
+            $line:expr, $res:expr;
+            $matcher:ident $(($match_name:ident))? => $value:expr,
+            $(
+                $rest_matcher:ident $(($rest_match_name:ident))? => $rest_value:tt,
+            )*
+        ) => {
+            if let Some(matched) = $matcher.find($line) {
+                $line = &$line[matched.end()..];
+                $(let $match_name = matched.as_str();)?
+                $res.push($value);
+                continue
+            }
+            match_token!(
+                $line, $res;
+                $(
+                    $rest_matcher $(($rest_match_name))? => $rest_value,
+                )*
+            );
+        }
+    }
+
+    let mut res = Vec::new();
+    // Keywords end in `\b` so that they never match a prefix of an identifier.
+    let match_let = Regex::new(r"(?i)^let\b").unwrap();
+    let match_jump = Regex::new(r"(?i)^go\s*to\b").unwrap();
+    let match_word = Regex::new(r"(?i)^(?:if|then|else|end\s?if)\b").unwrap();
+    let match_space = Regex::new(r"^\s+").unwrap();
+    let match_variable = Regex::new(r"^@?[a-zA-Z_][a-zA-Z_0-9]*").unwrap();
+    let match_float = Regex::new(r"^[0-9]*\.[0-9]+").unwrap();
+    let match_integer = Regex::new(r"^[0-9]+").unwrap();
+    let match_assign = Regex::new(r"^=").unwrap();
+    // Alternatives are tried left to right, so the two-character shifts must
+    // come before the one-character comparisons they start with.
+    let match_operator = Regex::new(r"^(?:<<|>>|[<>]=?|[!=]=|[+\-*/%])").unwrap();
+    let match_paren = Regex::new(r"^(?:\(|\))").unwrap();
+    // TODO: handle escapes
+    let match_string = Regex::new(r#"^"[^"]*""#).unwrap();
+    // `rem` on its own, or followed by whitespace and arbitrary text.
+    let match_comment = Regex::new(r"(?i)^rem(?:\s.*)?$").unwrap();
+    // TODO: handle labels
+
+    for mut line in raw.lines() {
+        if !line.is_empty() {
+            res.push(BasicToken::NewLine);
+        }
+        while !line.is_empty() {
+            match_token!(line, res;
+                match_space => (),
+                match_let => (),
+                match_comment => (),
+                match_jump => (BasicToken::Goto),
+                match_word(word) => (match word
+                    .to_lowercase()
+                    .split_whitespace()
+                    .collect::<String>()
+                    .as_str()
+                {
+                    "if" => BasicToken::If,
+                    "then" => BasicToken::Then,
+                    "else" => BasicToken::Else,
+                    "endif" => BasicToken::EndIf,
+                    // Case-insensitive matching also accepts a few non-ASCII
+                    // look-alikes (such as U+017F for `s`): reject those.
+                    _ => return Err(ParseError::InvalidToken(word.to_string())),
+                }),
+                match_variable(name) => (BasicToken::Name(name.to_string())),
+                match_float(float) => (BasicToken::Float(float.parse().unwrap())),
+                match_integer(int) => (BasicToken::Integer(
+                    int.parse::<u64>()
+                        .map_err(|_| ParseError::InvalidToken(int.to_string()))?
+                )),
+                // Operators are tried before `=` so that `==` is a single token.
+                match_operator(op) => (BasicToken::Operator(match op {
+                    "+" => Operator::Add,
+                    "-" => Operator::Sub,
+                    "*" => Operator::Mul,
+                    "/" => Operator::Div,
+                    "%" => Operator::Mod,
+                    "<" => Operator::Lt,
+                    "<=" => Operator::Lte,
+                    ">" => Operator::Gt,
+                    ">=" => Operator::Gte,
+                    "<<" => Operator::LShift,
+                    ">>" => Operator::RShift,
+                    "==" => Operator::Eq,
+                    "!=" => Operator::Neq,
+                    _ => unreachable!(),
+                })),
+                match_assign => (BasicToken::Assign),
+                match_paren(paren) => (if paren == "(" {
+                    BasicToken::OpenParen
+                } else {
+                    BasicToken::CloseParen
+                }),
+                match_string(with_quotes) => (BasicToken::String(with_quotes[1..with_quotes.len() - 1].to_string())),
+            );
+            // If this line is reached, then none of the matches above matched
+            return Err(ParseError::InvalidToken(line.to_string()));
+        }
+    }
+
+    Ok(res)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_tokenize_basic() {
+        assert_eq!(
+            tokenize("hello + world").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("hello")),
+                BasicToken::Operator(Operator::Add),
+                BasicToken::Name(String::from("world")),
+            ],
+        );
+
+        assert_eq!(
+            tokenize("let thing = thing / 2").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("thing")),
+                BasicToken::Assign,
+                BasicToken::Name(String::from("thing")),
+                BasicToken::Operator(Operator::Div),
+                BasicToken::Integer(2)
+            ],
+        );
+
+        assert_eq!(
+            tokenize("10 thing = thing + 0.5\ngoto 10").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Integer(10),
+                BasicToken::Name(String::from("thing")),
+                BasicToken::Assign,
+                BasicToken::Name(String::from("thing")),
+                BasicToken::Operator(Operator::Add),
+                BasicToken::Float(0.5),
+                BasicToken::NewLine,
+                BasicToken::Goto,
+                BasicToken::Integer(10),
+            ],
+        );
+
+        assert_eq!(
+            tokenize("x = 0\n\nif x > 0 then\nprint(\"Positive\")\nend if").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("x")),
+                BasicToken::Assign,
+                BasicToken::Integer(0),
+                BasicToken::NewLine,
+                BasicToken::If,
+                BasicToken::Name(String::from("x")),
+                BasicToken::Operator(Operator::Gt),
+                BasicToken::Integer(0),
+                BasicToken::Then,
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("print")),
+                BasicToken::OpenParen,
+                BasicToken::String(String::from("Positive")),
+                BasicToken::CloseParen,
+                BasicToken::NewLine,
+                BasicToken::EndIf,
+            ],
+        );
+    }
+
+    #[test]
+    fn test_tokenize_operators() {
+        assert_eq!(
+            tokenize("a << 2 >> 1").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("a")),
+                BasicToken::Operator(Operator::LShift),
+                BasicToken::Integer(2),
+                BasicToken::Operator(Operator::RShift),
+                BasicToken::Integer(1),
+            ],
+        );
+
+        assert_eq!(
+            tokenize("a == b != c <= d").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("a")),
+                BasicToken::Operator(Operator::Eq),
+                BasicToken::Name(String::from("b")),
+                BasicToken::Operator(Operator::Neq),
+                BasicToken::Name(String::from("c")),
+                BasicToken::Operator(Operator::Lte),
+                BasicToken::Name(String::from("d")),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_tokenize_keyword_boundaries() {
+        assert_eq!(
+            tokenize("letter = iffy\nEND\tIF\nrem").unwrap(),
+            vec![
+                BasicToken::NewLine,
+                BasicToken::Name(String::from("letter")),
+                BasicToken::Assign,
+                BasicToken::Name(String::from("iffy")),
+                BasicToken::NewLine,
+                BasicToken::EndIf,
+                BasicToken::NewLine,
+            ],
+        );
+    }
+
+    #[test]
+    fn test_tokenize_errors() {
+        // Text in front of a string literal must not be skipped silently.
+        assert_eq!(
+            tokenize("$ \"abc\""),
+            Err(ParseError::InvalidToken(String::from("$ \"abc\""))),
+        );
+
+        // An integer literal that overflows `u64` is an error, not a panic.
+        assert_eq!(
+            tokenize("99999999999999999999999"),
+            Err(ParseError::InvalidToken(String::from("99999999999999999999999"))),
+        );
+    }
+}