diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c58fc8a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,3 @@
+make:
+
+
diff --git a/joosc b/joosc
new file mode 100755
index 0000000..bf86038
--- /dev/null
+++ b/joosc
@@ -0,0 +1,2 @@
+lein uberjar
+java -jar target/uberjar/watcompiler-0.1.0-SNAPSHOT-standalone.jar "$1"
diff --git a/src/watcompiler/Language.txt b/src/watcompiler/Language.txt
new file mode 100644
index 0000000..7f7badd
--- /dev/null
+++ b/src/watcompiler/Language.txt
@@ -0,0 +1,14 @@
+BRACKET { } ( ) [ ]
+KEYWORD abstract default if private this boolean do implements protected break double import public throws throw byte else instanceof return transient case extends int short try catch interface static void char finally final long strictfp volatile class float native super while const for new switch continue goto package synchronized
+UNARY-OPERATOR ++ - -- ! ~
+BINARY-OPERATOR == * / % < << > >> >>> & ^ | != >= <= +
+ASSIGNMENT-OPERATOR = *= /= %= += -= <<= >>= >>>= &= ^= |=
+TERMINAL ; ,
+BOOLEAN-LITERAL true false
+NULL-LITERAL null
+INT-LITERAL
+STRING-LITERAL
+CHARACTER-LITERAL
+IDENTIFIER
+WHITESPACE
+COMMENT
diff --git a/src/watcompiler/lang.clj b/src/watcompiler/lang.clj
index d40a90f..51f112e 100644
--- a/src/watcompiler/lang.clj
+++ b/src/watcompiler/lang.clj
@@ -21,6 +21,12 @@
 (def DIGITS-NONZERO
   (char-range \1 \9))
 
+(def ALL-ASCII
+  (char-range 32 126))
+
+(def ESCAPABLE
+  [\b \t \n \f \r \" \' \\])
+
 (def S-PLUS \+)
 (def S-MINUS \-)
 (def S-STAR \*)
diff --git a/src/watcompiler/nfa.clj b/src/watcompiler/nfa.clj
index 715811d..bdface1 100644
--- a/src/watcompiler/nfa.clj
+++ b/src/watcompiler/nfa.clj
@@ -79,7 +79,7 @@
       transition-map
       (let [[s-from s-to alphabets] (first remaining)]
         (recur (rest remaining)
-               (if (seq? alphabets)
+               (if (or (seq? alphabets) (vector? alphabets))
                  (reduce #(add-to-map %1 (list s-from %2) s-to)
                          transition-map
                          alphabets)
diff --git a/src/watcompiler/re.clj b/src/watcompiler/re.clj
new file mode 100644
index 0000000..ed4d705
--- /dev/null
+++ b/src/watcompiler/re.clj
@@ -0,0 +1,255 @@
+(ns watcompiler.re
+  (:require [clojure.set :refer :all]
+            [watcompiler.nfa :refer :all]
+            [watcompiler.lang :refer :all]
+            [clojure.string :as str])
+  (:import [watcompiler.nfa NFA]))
+
+;; NFAs for types
+
+;; Integer literal
+;; 0 and [1-9][0-9]*
+(defn build-integer-literal-nfa
+  []
+  (let [stateS (gensym :S)
+        state1 (gensym :1)
+        state2 (gensym :2)]
+    (make-NFA (into #{} (concat [\0] DIGITS DIGITS-NONZERO))
+              #{stateS state1 state2}
+              stateS
+              {state1 (list "INTEGER-LITERAL" 0)
+               state2 (list "INTEGER-LITERAL" 0)}
+              (make-transition-NFA [[stateS state1 \0]
+                                    [stateS state2 DIGITS-NONZERO]
+                                    [state2 state2 DIGITS]]))))
+
+;; String literal
+;; \"(\\[btnfr\"\'\\] | ALL-ASCII)*\" (\ shown for escaping ")
+;; aka \"(.*)\" with escapes inside
+(defn build-string-literal-nfa
+  []
+  (let [stateS (gensym :S)
+        state1 (gensym :1)
+        state2 (gensym :2)
+        state3 (gensym :3)]
+    (make-NFA (into #{} (concat [\'] ALL-ASCII [\\] ESCAPABLE))
+              #{stateS state1 state2 state3}
+              stateS
+              {state3 (list "STRING-LITERAL" 0)}
+              (make-transition-NFA [[stateS state1 \"]
+                                    [state1 state1 ALL-ASCII]
+                                    [state1 state2 \\]
+                                    [state2 state1 ESCAPABLE]
+                                    [state1 state3 \"]]))))
+
+;; Character literal
+;; \'(\\ESCAPABLE | ALL-ASCII)\' (\ shown for escaping ')
+;; aka exactly one character, or one escape sequence, between single quotes
+(defn build-character-literal-nfa
+  []
+  (let [stateS (gensym :S)
+        state1 (gensym :1)
+        state2 (gensym :2)
+        state3 (gensym :3)
+        state4 (gensym :4)]
+    (make-NFA (into #{} (concat [\'] ALL-ASCII [\\] ESCAPABLE))
+              #{stateS state1 state2 state3 state4}
+              stateS
+              {state4 (list "CHARACTER-LITERAL" 0)}
+              (make-transition-NFA [[stateS state1 \']
+                                    [state1 state3 ALL-ASCII]
+                                    [state1 state2 \\]
+                                    [state2 state3 ESCAPABLE]
+                                    [state3 state4 \']]))))
+
+;; Identifiers
+;; [a-zA-Z][a-zA-Z0-9]*
+(defn build-identifier-nfa
+  []
+  (let [stateS (gensym :S)
+        state1 (gensym :s1)
+        state2 (gensym :s2)]
+    (make-NFA (into #{} (concat UPPER-ALPHABET LOWER-ALPHABET DIGITS))
+              #{stateS state1 state2}
+              stateS
+              {state1 (list "IDENTIFIER" 1)
+               state2 (list "IDENTIFIER" 1)}
+              (make-transition-NFA [[stateS state1 UPPER-ALPHABET]
+                                    [stateS state1 LOWER-ALPHABET]
+                                    [state1 state2 UPPER-ALPHABET]
+                                    [state1 state2 LOWER-ALPHABET]
+                                    ;; a digit may directly follow the first letter (e.g. "a1")
+                                    [state1 state2 DIGITS]
+                                    [state2 state2 UPPER-ALPHABET]
+                                    [state2 state2 LOWER-ALPHABET]
+                                    [state2 state2 DIGITS]]))))
+
+;; Whitespace
+;; [space tab newline]+
+(defn build-whitespace-nfa
+  []
+  (let [stateS (gensym :S)
+        state1 (gensym :s1)]
+    (make-NFA (into #{} WHITESPACE)
+              #{stateS state1}
+              stateS
+              {state1 (list "WHITESPACE" 0)}
+              (make-transition-NFA [[stateS state1 WHITESPACE]
+                                    [state1 state1 WHITESPACE]]))))
+
+;; Comment
+;; //.*
+;; /*.**/
+(defn build-comment-nfa
+  []
+  (let [stateS (gensym :S)
+        state1 (gensym :s1)
+        state2 (gensym :s2)
+        state3 (gensym :s3)
+        state4 (gensym :s4)
+        state5 (gensym :s5)]
+    (make-NFA (into #{} (concat ALL-ASCII [\*] [\/]))
+              #{stateS state1 state2 state3 state4 state5}
+              stateS
+              {state2 (list "COMMENT" 0)
+               state5 (list "COMMENT" 0)}
+              (make-transition-NFA [[stateS state1 \/]
+                                    [state1 state2 \/]
+                                    [state2 state2 ALL-ASCII]
+                                    [state1 state3 \*]
+                                    [state3 state3 ALL-ASCII]
+                                    [state3 state4 \*]
+                                    [state4 state5 \/]]))))
+
+;; Merging multiple nfas
+;; Builds one NFA with a fresh start state that has an epsilon transition to
+;; the start state of every given nfa.
+(defn merge-nfas
+  [& nfas]
+  (let
+    [stateS (gensym :s)
+     ;; the fresh start state is itself a state of the merged nfa
+     all-states (conj (apply union (map :states nfas)) stateS)
+     all-accept-states (apply union (map :accept-states nfas))
+     merged-transitions (apply merge (map :transitions nfas))
+     all-transitions (merge
+                       ;; Merged transitions from the nfas
+                       merged-transitions
+                       ;; Epsilon transition to each nfa
+                       (make-transition-NFA (into []
+                                                  (for [nfa-start (map :start nfas)]
+                                                    [stateS nfa-start e]))))
+     all-accept-priorities (apply union (map :accept-priorities nfas))
+     ;; NOTE(review): assumes the NFA record's first field is named :alphabet
+     ;; -- confirm against nfa.clj (a wrong name just yields an empty set)
+     all-alphabet (into #{} (mapcat :alphabet nfas))]
+    (->NFA all-alphabet
+           all-states
+           stateS
+           all-accept-states
+           all-transitions
+           all-accept-priorities)))
+
+;; Parses a string to form the nfa
+(defn string-to-nfa
+  [class-type word]
+  ;; Builds a linear chain of states, one per prefix of word; only the state for the whole word accepts
+  (let
+    [stateS (gensym :s)
+     ;; List of prefixes of word, stored as strings
+     states-map (set (rest (reductions str (str) word)))
+
+     ;; Key: substring of word, Value: gensym associated with this state
+     gensym-map (into (sorted-map) (for [c states-map]
+                                     [c (gensym c)]))
+     ;; Key: gensym value, Value: char to get to this state
+     states-char-map (into (sorted-map) (for [pair (map list (vals gensym-map) (seq word))]
+                                          [(first pair) (second pair)]))
+     ;; Every state of this nfa: the start state plus the gensym for each prefix
+     all-states (into #{stateS} (vals gensym-map))
+
+     ;; Accept states
+     accept-states-map {(get gensym-map word) (list class-type 0)}
+
+     ;; transitions from previous substring gensym to next substring gensym
+     transitions-map (into #{ [stateS (get gensym-map (str (first (seq word))) \a) (first (seq word))] }
+                           (for [v (partition 2 1 (vals gensym-map))]
+                             [(first v) (second v) (get states-char-map (second v))]))]
+    (make-NFA (into #{} (concat (seq word)))
+              all-states
+              stateS
+              accept-states-map
+              (make-transition-NFA transitions-map))))
+
+;; Acts as a wrapper to either get the made nfa or form it with string-to-nfa
+(defn get-nfa
+  [class-type first-token arguments]
+  ;; NOTE(review): the six test constants below are all the same empty string,
+  ;; and `case` requires distinct test constants. They look like per-class
+  ;; placeholder tokens whose text was lost -- restore the intended tokens.
+  (case first-token
+    "" (build-integer-literal-nfa)
+    "" (build-string-literal-nfa)
+    "" (build-character-literal-nfa)
+    "" (build-identifier-nfa)
+    "" (build-whitespace-nfa)
+    "" (build-comment-nfa)
+    (string-to-nfa class-type first-token)))
+
+;; Takes strings and forms nfas from them and links them into one nfa
+(defn form-multiple-nfas
+  [& arguments]
+  (let
+    [stateS (gensym :s)
+     class-type (first arguments)
+     args (rest arguments)
+
+     ;; Letters of all of the words in arguments
+     alphabet (apply concat (for [x (map seq (map char-array arguments))] x))
+
+     ;; Key: string for keyword, Value: NFA for that keyword
+     strings-nfas (into (sorted-map) (for [nfa-name args]
+                                       [nfa-name (get-nfa class-type nfa-name args)]))
+
+     ;; All of the states in the nfas, plus the new start state
+     all-states (conj (apply union (map :states (vals strings-nfas))) stateS)
+
+     ;; All of the accept states in the nfas
+     all-accept-states (apply union (map :accept-states (vals strings-nfas)))
+
+     ;; All of the transitions in the nfas
+     merged-transitions (apply merge (map :transitions (vals strings-nfas)))
+
+     ;; Setting epsilon transitions to all of the nfas start states
+     all-transitions (merge
+                       ;; Merged transitions from the nfas
+                       merged-transitions
+                       ;; Epsilon transition to each nfa
+                       (make-transition-NFA (into []
+                                                  (for [nfa-start (map :start (vals strings-nfas))]
+                                                    [stateS nfa-start e]))))
+     all-accept-priorities (apply union (map :accept-priorities (vals strings-nfas)))]
+    ;; NOTE(review): also keeps the component nfas' own alphabets; assumes the
+    ;; NFA record's first field is named :alphabet -- confirm against nfa.clj
+    (->NFA (into (set alphabet) (mapcat :alphabet (vals strings-nfas)))
+           all-states
+           stateS
+           all-accept-states
+           all-transitions
+           all-accept-priorities)))
+
+;; Reading the file
+(def read-file ;; change that notation read-file
+  (into []
+        (for [line (str/split-lines (slurp "src/watcompiler/Language.txt"))]
+          (str/split line #" "))))
+
+(def file-formed-nfa
+  (let [nfas (into []
+                   (for [x read-file]
+                     (apply form-multiple-nfas x)))]
+    (apply merge-nfas (remove nil? nfas))))
+
+;; complete nfa from all of the individual RE nfas
+(def complete-nfa
+  (merge-nfas file-formed-nfa))
diff --git a/test/watcompiler/re_test.clj b/test/watcompiler/re_test.clj
new file mode 100644
index 0000000..b1379d3
--- /dev/null
+++ b/test/watcompiler/re_test.clj
@@ -0,0 +1,159 @@
+(ns watcompiler.re-test
+  (:require [clojure.test :refer :all]
+            [watcompiler.nfa :refer :all]
+            [watcompiler.re :refer :all])
+  (:import [watcompiler.nfa NFA]))
+
+;; Regex NFA tests
+(deftest integer-literal-tests
+  (let [integer-literal-nfa (build-integer-literal-nfa)]
+
+    (is (= "INTEGER-LITERAL" (run-NFA integer-literal-nfa "1010")))
+    (is (= "INTEGER-LITERAL" (run-NFA integer-literal-nfa "0")))
+    (is (= "INTEGER-LITERAL" (run-NFA integer-literal-nfa "1")))))
+
+(deftest string-literal-tests
+  (let [string-literal-nfa (build-string-literal-nfa)]
+
+    (is (= "STRING-LITERAL" (run-NFA string-literal-nfa "\"s\"")))
+    (is (= "STRING-LITERAL" (run-NFA string-literal-nfa "\"thisis a string literal\"")))
+    (is (= "STRING-LITERAL" (run-NFA string-literal-nfa "\"[]~`!%^&*$(&^%#.][` @$g literal\"")))
+    (is (= "STRING-LITERAL" (run-NFA string-literal-nfa "\"\"")))
+
+    (is (= "STRING-LITERAL" (run-NFA string-literal-nfa "\" \\b \\t \\n \\f \\r \\' \\\\ \"")))
+    (is (= "STRING-LITERAL" (run-NFA string-literal-nfa "\"abc\\n\"")))
+    (is (= false (run-NFA string-literal-nfa "needquotes")))))
+
+(deftest character-literal-tests
+  (let [character-literal-nfa (build-character-literal-nfa)]
+
+    (is (= "CHARACTER-LITERAL" (run-NFA character-literal-nfa "'s'")))
+    (is (= "CHARACTER-LITERAL" (run-NFA character-literal-nfa "'\\b'")))
+    (is (= "CHARACTER-LITERAL" (run-NFA character-literal-nfa "'0'")))
+    (is (= false (run-NFA character-literal-nfa "'sa'")))))
+
+
+(deftest identifier-tests
+  (let [identifier-nfa (build-identifier-nfa)]
+
+    (is (= "IDENTIFIER" (run-NFA identifier-nfa "thisidentifier")))
+    (is (= "IDENTIFIER" (run-NFA identifier-nfa "a")))
+    (is (= "IDENTIFIER" (run-NFA identifier-nfa "a1")))))
+
+(deftest whitespace-test
+  (let [whitespace-nfa (build-whitespace-nfa)]
+
+    (is (= "WHITESPACE" (run-NFA whitespace-nfa " \n\n")))))
+
+(deftest comment-test
+  (let [comment-nfa (build-comment-nfa)]
+
+    (is (= "COMMENT" (run-NFA comment-nfa "///this is a comment ")))
+    (is (= "COMMENT" (run-NFA comment-nfa "//")))
+    (is (= "COMMENT" (run-NFA comment-nfa "/*multilinecomment\\n\\ncomment*/")))
+    (is (= "COMMENT" (run-NFA comment-nfa "/*multilinecomment\\n\\ncomment****/")))
+    (is (= false (run-NFA comment-nfa "//dawda\na")))
+    (is (= false (run-NFA comment-nfa "/*notmultiline")))
+    (is (= false (run-NFA comment-nfa "/notacomment")))))
+
+;; Form the NFAs from a file
+(deftest reading-file
+  (let [lines read-file
+        formed file-formed-nfa]
+
+    (is (= "BRACKET" (run-NFA formed "]")))
+    (is (= "BOOLEAN-LITERAL" (run-NFA formed "true")))
+    (is (= "BRACKET" (run-NFA formed "{")))))
+
+;; Test forming multiple nfas from multiple strings
+(deftest multiple-nfas-function-test
+  (let [full-nfa (form-multiple-nfas "KEYWORD" "int" "if")]
+    (is (= "KEYWORD" (run-NFA full-nfa "int")))
+    (is (= "KEYWORD" (run-NFA full-nfa "if")))
+    (is (= false (run-NFA full-nfa "in")))
+    (is (= false (run-NFA full-nfa "nt")))))
+
+;; Test function forming individual nfa
+(deftest function-test
+  (let [int-nfa-test (string-to-nfa "INT" "int")
+        synchronized-nfa-test (string-to-nfa "KEYWORD" "synchronized")]
+
+    (is :MAP int-nfa-test)
+    (is (= "INT" (run-NFA int-nfa-test "int")))
+    (is :MAP synchronized-nfa-test)
+    (is (= "KEYWORD" (run-NFA synchronized-nfa-test "synchronized")))
+    (is (= false (run-NFA synchronized-nfa-test "synchronize")))
+    (is (= false (run-NFA synchronized-nfa-test "ynchronize")))))
+
+;; Individual NFA tests
+(deftest int-test
+  (let [int-nfa (string-to-nfa "KEYWORD" "int")]
+    (is (= "KEYWORD" (run-NFA complete-nfa "int")))
+    (is (= "INTEGER-LITERAL" (run-NFA complete-nfa "109")))))
+
+(deftest operator-test
+  ;; Operators
+  (is (= "BINARY-OPERATOR" (run-NFA complete-nfa "+")))
+  (is (= "UNARY-OPERATOR" (run-NFA complete-nfa "++")))
+  (is (= "BINARY-OPERATOR" (run-NFA complete-nfa ">")))
+  (is (= "ASSIGNMENT-OPERATOR" (run-NFA complete-nfa ">>>=")))
+  (is (= "UNARY-OPERATOR" (run-NFA complete-nfa "!")))
+  (is (= "BINARY-OPERATOR" (run-NFA complete-nfa "!="))))
+
+;; Booleans test
+(deftest boolean-test
+  (is (= "BOOLEAN-LITERAL" (run-NFA complete-nfa "true")))
+  (is (= "BOOLEAN-LITERAL" (run-NFA complete-nfa "false")))
+  (is (= "IDENTIFIER" (run-NFA complete-nfa "tru")))
+  (is (= "IDENTIFIER" (run-NFA complete-nfa "fals"))))
+
+;; Keyword test
+(deftest keyword-test
+  ;; Individual Keywords on their nfas
+  (let [int-nfa (string-to-nfa "KEYWORD" "int")
+        abstract-nfa (string-to-nfa "KEYWORD" "abstract")
+        default-nfa (string-to-nfa "KEYWORD" "default")
+        synchronize-nfa (string-to-nfa "KEYWORD" "synchronize")]
+    (is (= "KEYWORD" (run-NFA int-nfa "int")))
+    (is (= "KEYWORD" (run-NFA abstract-nfa "abstract")))
+    (is (= "KEYWORD" (run-NFA default-nfa "default")))
+    (is (= "KEYWORD" (run-NFA synchronize-nfa "synchronize")))
+    (is (= false (run-NFA synchronize-nfa "ynchronize")))))
+
+;; Test on a complete merged nfa
+(deftest merged-function-nfa-test
+  (is :MAP complete-nfa)
+  (is (= "KEYWORD" (run-NFA complete-nfa "int")))
+  (is (= "KEYWORD" (run-NFA complete-nfa "synchronized")))
+  (is (= "INTEGER-LITERAL" (run-NFA complete-nfa "9")))
+  (is (= "UNARY-OPERATOR" (run-NFA complete-nfa "++")))
+  (is (= "BOOLEAN-LITERAL" (run-NFA complete-nfa "true")))
+  (is (= "BOOLEAN-LITERAL" (run-NFA complete-nfa "false"))))
+
+(deftest complete-nfa-test
+  (is (= "KEYWORD" (run-NFA complete-nfa "abstract")))
+  (is (= "KEYWORD" (run-NFA complete-nfa "default")))
+  (is (= "KEYWORD" (run-NFA complete-nfa "package")))
+  (is (= "KEYWORD" (run-NFA complete-nfa "synchronized")))
+  ;; Booleans
+  (is (= "BOOLEAN-LITERAL" (run-NFA complete-nfa "true")))
+  (is (= "BOOLEAN-LITERAL" (run-NFA complete-nfa "false")))
+  (is (= "IDENTIFIER" (run-NFA complete-nfa "tru")))
+  (is (= "IDENTIFIER" (run-NFA complete-nfa "fals")))
+  ;; Integer
+  (is (= "INTEGER-LITERAL" (run-NFA complete-nfa "109")))
+  ;; Operators
+  (is (= "BINARY-OPERATOR" (run-NFA complete-nfa "+")))
+  (is (= "UNARY-OPERATOR" (run-NFA complete-nfa "++")))
+  (is (= "BINARY-OPERATOR" (run-NFA complete-nfa ">")))
+  (is (= "BINARY-OPERATOR" (run-NFA complete-nfa ">>>")))
+  (is (= "ASSIGNMENT-OPERATOR" (run-NFA complete-nfa ">>>=")))
+  ;; Terminal
+  (is (= "TERMINAL" (run-NFA complete-nfa ";")))
+  ;; null
+  (is (= "NULL-LITERAL" (run-NFA complete-nfa "null"))))
+
+(deftest filter-regex-nfas
+  ;; INT-LITERAL in Tokens.txt
+  ;; shouldn't give a real matching to the text given
+  (is (= false (run-NFA complete-nfa ""))))
diff --git a/test/watcompiler/sample.java b/test/watcompiler/sample.java
new file mode 100644
index 0000000..92fed23
--- /dev/null
+++ b/test/watcompiler/sample.java
@@ -0,0 +1 @@
+int a 0