This commit is contained in:
Edgar 2023-12-14 10:25:56 +01:00
commit d9281843f2
No known key found for this signature in database
GPG Key ID: 70ADAE8F35904387
11 changed files with 2183 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

1870
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

21
Cargo.toml Normal file
View File

@ -0,0 +1,21 @@
[package]
name = "pascal-mlir"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
melior = { version = "0.14.0", features = ["ods-dialects"] }
clap = { version = "4.3.3", features = ["derive"] }
color-eyre = "0.6.2"
itertools = "0.12"
lalrpop-util = { version = "0.20.0", features = ["lexer"] }
regex = "1.9"
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
annotate-snippets = { version = "0.9.1", features = ["color"] }
logos = "0.13.0"
[build-dependencies]
lalrpop = "0.20.0"

4
README.md Normal file
View File

@ -0,0 +1,4 @@
https://www.cs.utexas.edu/users/novak/iso7185.pdf
https://lalrpop.github.io/lalrpop/lexer_tutorial/004_token_references.html
I'm currently at section 6.1.7 of the ISO 7185 standard (linked above).

3
build.rs Normal file
View File

@ -0,0 +1,3 @@
/// Build script: compiles every `.lalrpop` grammar under `src/` into a
/// Rust parser module before the crate itself is built.
fn main() {
    // A broken grammar should abort the build with a readable message,
    // not a bare `unwrap()` panic.
    lalrpop::process_root().expect("failed to generate parsers from .lalrpop grammars");
}

6
programs/first.pas Normal file
View File

@ -0,0 +1,6 @@
program learn_pascal;
const
PI = 3.141592654;
GNU = 'GNU''s Not Unix';

20
src/ast.rs Normal file
View File

@ -0,0 +1,20 @@
/// A numeric literal, kept as the raw source slice so no precision is
/// lost before later stages decide how to materialize the value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Number<'a> {
    /// Integer literal text, e.g. `42`.
    Integer(&'a str),
    /// Real literal text, e.g. `3.14` or `1e9`.
    Real(&'a str),
}

/// A Pascal constant value as it appears in a `const` section.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Constant<'a> {
    /// A reference to a previously declared constant identifier,
    /// optionally prefixed with a sign.
    Identifier {
        /// `true` when the reference is written with a leading `-`.
        is_negative: bool,
        ident: &'a str,
    },
    Number(Number<'a>),
    String(&'a str),
}

/// One `NAME = value` binding from a `const` section.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ConstantDef<'a> {
    pub ident: &'a str,
    pub value: Constant<'a>,
}

49
src/grammar.lalrpop Normal file
View File

@ -0,0 +1,49 @@
use crate::{
    ast,
    tokens::Token,
    lexer::LexicalError,
};

// The grammar borrows token text straight from the input, hence 'input.
grammar<'input>(input: &'input str);

// Hook up the external logos-based lexer (src/lexer.rs): locations are
// byte offsets, errors are LexicalError, and only the Token variants the
// grammar currently references are mapped.
extern {
    type Location = usize;
    type Error = LexicalError;

    enum Token<'input> {
        "program" => Token::WordProgram,
        "identifier" => Token::Identifier(<&'input str>),
        "integer" => Token::Integer(<&'input str>),
        "real" => Token::Real(<&'input str>),
        "string" => Token::String(<&'input str>),
        "-" => Token::SpecialMinus,
        "+" => Token::SpecialPlus,
    }
}

// Generic comma-separated list allowing an optional trailing element
// (the standard LALRPOP list-macro idiom).
Comma<T>: Vec<T> = {
    <mut v:(<T> ",")*> <e:T?> => match e {
        None => v,
        Some(e) => {
            v.push(e);
            v
        }
    }
};

// Smoke-test rule mapping the "program" keyword to a fixed string.
// NOTE(review): placeholder — presumably to be replaced by a real
// `program` production; confirm before relying on it.
pub Hello: String = {
    "program" => "let".to_string()
}

// Numeric literals; the raw text is preserved in the AST.
Number: ast::Number<'input> = {
    <"integer"> => ast::Number::Integer(<>),
    <"real"> => ast::Number::Real(<>),
}

// Constant values in a `const` section: numbers, strings, or an
// (optionally signed) reference to another constant identifier.
Constant: ast::Constant<'input> = {
    <Number> => ast::Constant::Number(<>),
    <"string"> => ast::Constant::String(<>),
    "+"? <ident:"identifier"> => ast::Constant::Identifier { is_negative: false, ident },
    "-" <ident:"identifier"> => ast::Constant::Identifier { is_negative: true, ident },
}

47
src/lexer.rs Normal file
View File

@ -0,0 +1,47 @@
use std::{fmt::Display, ops::Range};
use logos::{Logos, SpannedIter};
use crate::tokens::{LexingError, Token};
/// The triple LALRPOP expects from an external lexer:
/// `Ok((start_offset, token, end_offset))` or a lexer error.
pub type Spanned<Tok, Loc, Error> = Result<(Loc, Tok, Loc), Error>;

/// Error surfaced to the parser when the lexer cannot produce a token.
#[derive(Debug, Clone)]
pub enum LexicalError {
    /// Wraps the logos error together with the byte range where it occurred.
    InvalidToken(LexingError, Range<usize>),
}
impl Display for LexicalError {
    /// Renders the error with the byte span first, so the message reads
    /// "lexical error at <span>: <error>".
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            LexicalError::InvalidToken(err, span) => {
                // Bug fix: the original printed the error where the span
                // belonged ("at ({err:?}): {span:?}"); swap the arguments
                // so "at" refers to the source location.
                write!(f, "lexical error at {:?}: {:?}", span, err)
            }
        }
    }
}
/// Adapter that turns the logos token stream into the `Spanned`
/// iterator LALRPOP consumes.
pub struct Lexer<'input> {
    // instead of an iterator over characters, we have a token iterator
    token_stream: SpannedIter<'input, Token<'input>>,
}
impl<'input> Lexer<'input> {
pub fn new(input: &'input str) -> Self {
// the Token::lexer() method is provided by the Logos trait
Self {
token_stream: Token::lexer(input).spanned(),
}
}
}
impl<'input> Iterator for Lexer<'input> {
    type Item = Spanned<Token<'input>, usize, LexicalError>;

    /// Pulls the next token from the underlying logos stream, converting
    /// its `(Result<Token, _>, Span)` pairs into the LALRPOP-style
    /// `Result<(start, token, end), LexicalError>` triples.
    fn next(&mut self) -> Option<Self::Item> {
        let (result, span) = self.token_stream.next()?;
        Some(
            result
                .map(|tok| (span.start, tok, span.end))
                .map_err(|err| LexicalError::InvalidToken(err, span)),
        )
    }
}

11
src/main.rs Normal file
View File

@ -0,0 +1,11 @@
use lalrpop_util::lalrpop_mod;

// Pull in the parser module generated from src/grammar.lalrpop by build.rs.
lalrpop_mod!(pub grammar);

pub mod lexer;
pub mod tokens;
pub mod ast;

// Placeholder entry point; the parser is not wired into the CLI yet.
fn main() {
    println!("Hello, world!");
}

151
src/tokens.rs Normal file
View File

@ -0,0 +1,151 @@
use logos::Logos;
use std::convert::Infallible;
// https://github.com/maciejhirsz/logos/issues/133
/// Errors produced while lexing. Logos requires the error type to
/// implement `Default` so it has a variant to fall back on when a
/// callback's error conversion is not more specific.
#[derive(Debug, PartialEq, Eq, Clone, Default)]
pub enum LexingError {
    /// A token looked numeric but failed to parse as a number.
    NumberParseError,
    /// Catch-all used when logos cannot match any token.
    #[default]
    Other,
}

impl From<std::num::ParseIntError> for LexingError {
    fn from(_: std::num::ParseIntError) -> Self {
        LexingError::NumberParseError
    }
}

impl From<Infallible> for LexingError {
    /// `Infallible` has no values, so this conversion can never actually
    /// run; the empty match makes that unreachability explicit instead of
    /// fabricating an `Other` value.
    fn from(never: Infallible) -> Self {
        match never {}
    }
}
#[derive(Logos, Debug, PartialEq, Clone)]
#[logos(error = LexingError, skip r"[ \t\n\f]+", skip r"//.*\n?", skip r"\{[^}]*\}" skip r"\(\*(.|[\r\n])*?\*\)")]
pub enum Token<'input> {
#[regex(r"[a-zA-Z][a-zA-Z\d]*")]
Identifier(&'input str), // also directive
#[regex(r"[+-]?[0-9][0-9]*")]
Integer(&'input str),
#[regex(r"[+-]?[0-9][0-9]*\.[0-9][0-9]*([eE][+-]?[0-9][0-9]*)?")]
#[regex(r"[+-]?[0-9][0-9]*[eE][+-]?[0-9][0-9]*")]
Real(&'input str),
#[regex(r#""(?:[^"]|\\")*""#)]
String(&'input str),
// special symbols
#[token("+")]
SpecialPlus,
#[token("-")]
SpecialMinus,
#[token("*")]
SpecialMul,
#[token("/")]
SpecialDiv,
#[token("=")]
SpecialEqual,
#[token("<")]
SpecialLower,
#[token(">")]
SpecialGreater,
#[token("[")]
SpecialOpenBracket,
#[token("]")]
SpecialCloseBracket,
#[token(".")]
SpecialDot,
#[token(",")]
SpecialComma,
#[token(";")]
SpecialDotComma,
#[token("\"")]
SpecialQuotation,
#[token("(")]
SpecialOpenParen,
#[token(")")]
SpecialCloseParen,
#[token("<>")]
SpecialSpaceship,
#[token("<=")]
SpecialLessEqual,
#[token(">=")]
SpecialGreaterEqual,
#[token(":=")]
SpecialAssign,
#[token("..")]
SpecialRange,
// special symbols - word symbols
#[token("and")]
WordAnd,
#[token("array")]
WordArray,
#[token("begin")]
WordBegin,
#[token("case")]
WordCase,
#[token("const")]
WordConst,
#[token("div")]
WordDiv,
#[token("do")]
WordDo,
#[token("downto")]
WordDownto,
#[token("else")]
WordElse,
#[token("end")]
WordEnd,
#[token("file")]
WordFile,
#[token("for")]
WordFor,
#[token("function")]
WordFunction,
#[token("goto")]
WordGoto,
#[token("if")]
WordIf,
#[token("in")]
WordIn,
#[token("label")]
WordLabel,
#[token("mod")]
WordMod,
#[token("nil")]
WordNil,
#[token("not")]
WordNot,
#[token("of")]
WordOf,
#[token("or")]
WordOr,
#[token("packed")]
WordPacked,
#[token("procedure")]
WordProcedure,
#[token("program")]
WordProgram,
#[token("record")]
WordRecord,
#[token("repeat")]
WordRepeat,
#[token("set")]
WordSet,
#[token("then")]
WordThen,
#[token("to")]
WordTo,
#[token("type")]
WordType,
#[token("until")]
WordUntil,
#[token("var")]
WordVar,
#[token("while")]
WordWhile,
#[token("with")]
WordWith,
}