summaryrefslogtreecommitdiff
path: root/cpp.sml
diff options
context:
space:
mode:
authorVladimir Azarov <avm@intermediate-node.net>2025-03-22 17:58:30 +0100
committerVladimir Azarov <avm@intermediate-node.net>2025-03-22 17:58:30 +0100
commit40f6375daa736ca549a1d0cbd6df87dd83df4781 (patch)
tree17c0fd6ccdced187adc8b99e4b3609432ec36327 /cpp.sml
parent9ed6d5386bcc9ad8a8e73b481d41fcaea80adaad (diff)
Symbol parser
Diffstat (limited to 'cpp.sml')
-rw-r--r--cpp.sml280
1 files changed, 227 insertions, 53 deletions
diff --git a/cpp.sml b/cpp.sml
index ea3b35c..99f8b36 100644
--- a/cpp.sml
+++ b/cpp.sml
@@ -4,19 +4,53 @@ infixr 0 $
fun printLn s = (print s; print "\n")
datatype token =
- Number of substring |
- Id of substring |
+ Invalid |
+ Number of string |
+ Id of string |
+
Plus |
+ DoublePlus|
+ Minus |
+ DoubleMinus |
+
Semicolon |
+ EqualSign |
+ DoubleEqualSign |
+
Hash |
- CppInclude
+ DoubleHash |
+
+ CppInclude |
+ CppDefine
+
+val tokenRepr = [
+ (Plus, "+"),
+ (DoublePlus, "++"),
+ (Minus, "-"),
+ (DoubleMinus, "--"),
+
+ (Semicolon, ";"),
+ (EqualSign, "="),
+ (DoubleEqualSign, "=="),
+
+ (Hash, "#"),
+ (DoubleHash, "##"),
-fun printToken (Number s) = printLn $ "Number: " ^ Substring.string s
- | printToken (Id s) = printLn $ "Id: " ^ Substring.string s
- | printToken Plus = printLn "+"
- | printToken Semicolon = printLn ";"
- | printToken Hash = printLn "#"
- | printToken CppInclude = printLn "$include"
+ (CppInclude, "$include"),
+ (CppDefine, "$define")
+]
+
+fun isCppDir CppInclude = true
+ | isCppDir _ = false
+
+exception TokenWithoutRepr
+
+fun printToken (Number s) = printLn $ "Num: " ^ s
+ | printToken (Id s) = printLn $ "Id: " ^ s
+ | printToken v =
+ case List.find (fn (x, _) => x = v) tokenRepr of
+ SOME (_, repr) => printLn repr
+ | NONE => raise TokenWithoutRepr
type fileId = int
type fileOffset = int
@@ -41,10 +75,9 @@ let
if cur = offset then
(line, col)
else
- if String.sub (s, cur) = #"\n" then
- calc s (cur + 1) offset (line + 1, 1)
- else
- calc s (cur + 1) offset (line, col + 1)
+ calc s (cur + 1) offset
+ (if String.sub (s, cur) = #"\n" then (line + 1, 1)
+ else (line, col + 1))
in
calc s 0 offset (1, 1)
end
@@ -97,8 +130,7 @@ fun getOffset ({ stack = (id, off, contents) :: _, ... }: stream) = off
| getOffset _ = raise InvalidStream
fun getPosAfterCharRead
- ({ stack = (id, off, contents) :: _, ... }: stream) =
- (id, off - 1)
+ ({ stack = (id, off, contents) :: _, ... }: stream) = (id, off - 1)
| getPosAfterCharRead _ = raise InvalidStream
fun getPos ({ stack = (id, off, contents) :: _, ... }: stream) = (id, off)
@@ -106,7 +138,7 @@ fun getPos ({ stack = (id, off, contents) :: _, ... }: stream) = (id, off)
fun getSubstr startOff endOff
({ stack = (_, _, contents) :: _, ... }: stream) =
- Substring.substring (contents, startOff, endOff - startOff)
+ String.substring (contents, startOff, endOff - startOff)
| getSubstr _ _ _ = raise InvalidStream
fun advanceToNewFile
@@ -124,6 +156,101 @@ in
{ allFiles = [(0, fname, contents)], stack = [(0, 0, contents)] }
end
+(* FSM for parsing symbols *)
+
+val maxStates = 40
+
+fun fsmInsert (nextState, buf) curState tk (c :: cs) =
+let
+ open Array
+
+ val (_, row) = sub (buf, curState)
+ val potNextState = sub (row, ord c)
+in
+ if potNextState <> ~1 then
+ fsmInsert (nextState, buf) potNextState tk cs
+ else (
+ update (row, ord c, !nextState);
+ nextState := !nextState + 1;
+ fsmInsert (nextState, buf) (!nextState - 1) tk cs
+ )
+end
+ | fsmInsert (nextState, buf) curState tk [] =
+let
+ open Array
+ val (_, row) = sub (buf, curState)
+in
+ update (buf, curState, (tk, row))
+end
+
+fun fsmTableCreate () =
+let
+ open Array
+
+ val T as (_, buf) =
+ (ref 1, array (maxStates, (Invalid, array (128, ~1))))
+ val r = ref 1
+
+ fun filterNeeded [] acc = acc
+ | filterNeeded ((T as (tk, _)) :: tks) acc =
+ filterNeeded tks (if isCppDir tk then acc else T :: acc)
+
+ val tokenRepr = filterNeeded tokenRepr []
+in
+ while !r < length buf do (
+ update (buf, !r, (Invalid, array(128, ~1)));
+ r := !r + 1
+ );
+
+ List.app (fn (v, p) => fsmInsert T 0 v (explode p)) tokenRepr;
+ T
+end
+
+fun isStartForFsmGen () =
+let
+ open Array
+
+ val lookupTable = array (128, false)
+
+ fun firstChr s = ord $ String.sub (s, 0)
+ val () = List.app
+ (fn (_, repr) => update (lookupTable, firstChr repr, true)) tokenRepr
+in
+ fn c => sub (lookupTable, ord c)
+end
+
+val isStartForFsm = isStartForFsmGen ()
+
+val fsmTable = fsmTableCreate ()
+
+fun fsmEat stream =
+let
+ open Array
+ val stream = ungetc stream
+ val pos = getPos stream
+
+ fun get curState stream =
+ let
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle
+ _ => (NONE, stream)
+ in
+ case c of
+ NONE => (#1 $ sub (#2 fsmTable, curState), stream)
+ | SOME c =>
+ let
+ val (tk, row) = sub (#2 fsmTable, curState)
+ val nextState = sub (row, ord c)
+ in
+ if nextState = ~1 then
+ (tk, ungetc stream)
+ else
+ get nextState stream
+ end
+ end
+in
+ (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream
+end
+
exception UnexpectedCharacter of char
fun isSpace c = c = #" " orelse c = #"\t" orelse c = #"\n"
@@ -159,7 +286,74 @@ in
end
end
-fun tokenize stream tkl =
+fun parseGeneric stream parser acc =
+let
+ val stream = ungetc stream
+ val P as (fid, startOff) = getPos stream
+
+ fun parse' stream acc = let
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle
+ _ => (NONE, stream)
+
+ val (acc, tk, stream) = parser acc (stream, startOff) c
+ in
+ case tk of
+ NONE => parse' stream acc
+ | _ => (valOf tk, stream)
+ end
+
+ val (tk, stream) = parse' stream acc
+in
+ ((P, tk): fullToken, stream)
+end
+
+fun finishSeqRead startOff stream =
+let
+ val (_, endOff) = getPos stream
+ val s = getSubstr startOff endOff stream
+in
+ s
+end
+
+fun idParser () (stream, startOff) c =
+let
+ fun finalize stream =
+ ((), SOME $ Id (finishSeqRead startOff stream), stream)
+in
+ case c of
+ NONE => finalize stream
+ | SOME c =>
+ if isIdBody c then
+ ((), NONE, stream)
+ else
+ finalize (ungetc stream)
+end
+
+fun formCppDir tk tkl =
+let
+ fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl)
+in
+ case tk of
+ Id "include" => conv CppInclude
+ | Id "define" => conv CppDefine
+ | _ => raise Fail $ "Expected cpp direcive\n"
+end
+
+fun numParser () (stream, startOff) c =
+let
+ fun finalize stream =
+ ((), SOME $ Number (finishSeqRead startOff stream), stream)
+in
+ case c of
+ NONE => finalize stream
+ | SOME c =>
+ if isDigit c then
+ ((), NONE, stream)
+ else
+ finalize (ungetc stream)
+end
+
+fun tokenize stream (tkl: fullToken list) =
let
fun getcharSkipEof stream = getchar stream handle
EndOfFile => getcharSkipEof (advanceToNewFile stream)
@@ -169,6 +363,8 @@ let
EndOfStream => (NONE, stream)
fun % tk = (getPosAfterCharRead stream, tk)
+
+ fun @-> (tk, stream) = tokenize stream (tk :: tkl)
in
case c of
NONE => (rev tkl, #allFiles stream)
@@ -176,42 +372,20 @@ in
if isSpace c then
tokenize stream tkl
else if isIdStart c then
- parseId c stream tkl
- else case c of
- #"+" => tokenize stream (% Plus :: tkl)
- | #";" => tokenize stream (% Semicolon :: tkl)
- | #"#" => tokenize stream (% Hash :: tkl)
- | _ => raise UnexpectedCharacter c
-end and
-parseId c stream (tkl: fullToken list) =
-let
- val P as (_, pos) = getPosAfterCharRead stream
-
- fun eatId curId stream =
- let
- val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle
- _ => (NONE, stream)
- in
- case c of
- NONE => (curId, stream)
- | SOME c =>
- if isIdBody c then
- eatId (curId + 1) stream
- else
- (curId, ungetc stream)
- end
-
- val (endOffset, stream) = eatId (pos + 1) stream
- val id = getSubstr pos endOffset stream
-
-in
- if ((fn Hash => true | _ => false) (#2 $ hd tkl) handle Empty => false)
- andalso isFirstOnLine (hd tkl) stream
- andalso Substring.string id = "include"
- then
- tokenize stream (((#1 $ hd tkl), CppInclude) :: tl tkl)
- else
- tokenize stream ((P, Id id) :: tkl)
+ let
+ val isCppDir = (fn Hash => true | _ => false) (#2 $ hd tkl)
+ handle Empty => false
+ val (tk, stream) = parseGeneric stream idParser ()
+ in
+ tokenize stream
+ (if isCppDir then (formCppDir (#2 tk) tkl) else (tk :: tkl))
+ end
+ else if isDigit c then
+ @-> $ parseGeneric stream numParser ()
+ else if isStartForFsm c then
+ @-> $ fsmEat stream
+ else
+ raise UnexpectedCharacter c
end
fun main [fname] =