summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVladimir Azarov <avm@intermediate-node.net>2025-03-24 21:51:16 +0100
committerVladimir Azarov <avm@intermediate-node.net>2025-03-24 21:51:16 +0100
commit87217fe5ba58f5199d30586b5d9bec104dece445 (patch)
tree00a43d11ebdfbb65750e80758ce1925e4c6a1a3e
parente40727b58e357f123256557af50666aa42c2caa4 (diff)
Partition into structures
-rw-r--r--.gitignore1
-rw-r--r--cpp.mlb8
-rw-r--r--cpp.sml1023
-rw-r--r--exn_handler.sml46
-rw-r--r--general.sml26
-rw-r--r--stream.sig36
-rw-r--r--stream.sml185
-rw-r--r--tokenizer.sig12
-rw-r--r--tokenizer.sml787
9 files changed, 1105 insertions, 1019 deletions
diff --git a/.gitignore b/.gitignore
index 678840a..58c478a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ test*
cpp
todo*
mlmon.out*
+*dot
diff --git a/cpp.mlb b/cpp.mlb
index 8039660..dd7fb36 100644
--- a/cpp.mlb
+++ b/cpp.mlb
@@ -4,6 +4,14 @@ ann
in
$(SML_LIB)/basis/basis.mlb
$(SML_LIB)/basis/mlton.mlb
+ general.sml
+ stream.sig
+ stream.sml
+
+ tokenizer.sig
+ tokenizer.sml
+
+ exn_handler.sml
cpp.sml
end
diff --git a/cpp.sml b/cpp.sml
index cb04406..32fcd3c 100644
--- a/cpp.sml
+++ b/cpp.sml
@@ -1,1025 +1,10 @@
-fun $ (x, y) = x y
-infixr 0 $
-
-fun printLn s = (print s; print "\n")
-
-datatype includeArg = IARel of string | IAFromRef of string
-
-datatype token =
- Invalid |
- Number of string |
- Id of string |
- CharConst of string * int |
- StringConst of string |
-
- kwBreak |
- kwCase |
- kwChar |
- kwConst |
- kwContinue |
- kwDefault |
- kwDouble |
- kwElse |
- kwEnum |
- kwExtern |
- kwFloat |
- kwFor |
- kwGoto |
- kwInt |
- kwLong |
- kwRegister |
- kwReturn |
- kwShort |
- kwSigned |
- kwSizeof |
- kwStruct |
- kwSwitch |
- kwTypedef |
- kwUnion |
- kwUnsigned |
- kwVoid |
- kwVolatile |
-
- LParen |
- RParen |
- LBracket |
- RBracket |
- LBrace |
- RBrace |
-
- QuestionMark |
- Colon |
- Coma |
- Semicolon |
-
- Arrow |
- Plus |
- DoublePlus|
- Minus |
- DoubleMinus |
- Ampersand |
- Asterisk |
- Slash |
- Tilde |
- ExclMark |
- Percent |
- DoubleGreater |
- DoubleLess |
- Greater |
- Less |
- EqualSign |
- LessEqualSign |
- GreaterEqualSign |
- DoubleEqualSign |
- ExclMarkEqualSign |
- Cap |
- VerticalBar |
- DoubleAmpersand |
- DoubleVerticalBar |
-
- AsteriskEqualSign |
- SlashEqualSign |
- PercentEqualSign |
- PlusEqualSign |
- MinusEqualSign |
- DoubleLessEqualSign |
- DoubleGreaterEqualSign |
- AmpersandEqualSign |
- CapEqualSign |
- VerticalBarEqualSign |
-
- Hash |
- DoubleHash |
-
- Dot |
- DoubleDot |
- TripleDot |
-
- CommentStart |
-
- CppInclude of includeArg |
- CppDefine |
- CppUndef |
- CppIf |
- CppIfdef |
- CppIfndef |
- CppElse |
- CppElif |
- CppEndif |
- CppPragma
-
-val kwPrefix = #"@"
-val cppPrefix = #"$"
-
-type fileId = int
-type fileOffset = int
-type pos = fileId * fileOffset
-type convPos = string * int * int option
-type fullToken = pos * token
-
-datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart
-
-exception TkError of tkErrorAuxInfo * string
-exception TkErrorAug of convPos * string (* main exception *)
-
-exception ExpectedCppDir (* handled in postprocess *)
-
-(* handled in tokenizer *)
-exception EndOfStream
-exception EndOfFile
-
-(* Unreachable (should be) *)
-exception Unreachable
-exception FsmTableIsTooSmall
-exception InvalidStream
-exception InvalidStreamAdvance
-exception TokenWithoutRepr
-
-(* handled in readIncludeArg *)
-exception LineWithoutNl
-
-fun pos2str (pos, line, col) =
-let
- val % = Int.toString
-in
- case col of
- SOME col => pos ^ ":" ^ %line ^ ":" ^ %col
- | NONE => pos ^ ":" ^ %line
-end
-fun eprint s = printLn $ "error: " ^ s
-
-fun otherExn e =
-let
- val hist = MLton.Exn.history e
-in
- eprint $ "exception " ^ exnMessage e ^ " was raised";
- if hist = [] then
- (printLn "No stack trace is avaliable";
- printLn "Recompile with -const \"Exn.keepHistory true\"")
- else
- List.app (fn x => printLn $ "\t" ^ x) hist
-end
-
-fun exit code = Posix.Process.exit $ Word8.fromInt code
-
-fun ioExn (IO.Io { name, function = _, cause }) =
-let
- open OS
- val prefix = name ^ ": "
- val reason =
- case cause of
- SysErr (str, _) => str
- | _ => exnMessage cause
-in
- printLn $ prefix ^ reason
-end
- | ioExn _ = (printLn "ioExn: unreachable"; exit 254)
-
-fun globalExnHandler e =
- (case e of
- FsmTableIsTooSmall =>
- eprint "fsm table is too small. Increate 'maxState' value"
- | IO.Io _ => ioExn e
- | TkErrorAug (pos, msg) => eprint $ pos2str pos ^ ": " ^ msg
- | _ => otherExn e;
- exit 255)
-
-val () = MLton.Exn.setTopLevelHandler globalExnHandler
-
-val tokenRepr =
-let
- fun & repr = str kwPrefix ^ repr
- fun % repr = str cppPrefix ^ repr
-in
- [
- (kwBreak, &"break"),
- (kwCase, &"case"),
- (kwChar, &"char"),
- (kwConst, &"const"),
- (kwContinue, &"continue"),
- (kwDefault, &"default"),
- (kwDouble, &"double"),
- (kwElse, &"else"),
- (kwEnum, &"enum"),
- (kwExtern, &"extern"),
- (kwFloat, &"float"),
- (kwFor, &"for"),
- (kwGoto, &"goto"),
- (kwInt, &"int"),
- (kwLong, &"long"),
- (kwRegister, &"register"),
- (kwReturn, &"return"),
- (kwShort, &"short"),
- (kwSigned, &"signed"),
- (kwSizeof, &"sizeof"),
- (kwStruct, &"struct"),
- (kwSwitch, &"switch"),
- (kwTypedef, &"typedef"),
- (kwUnion, &"union"),
- (kwUnsigned, &"unsigned"),
- (kwVoid, &"void"),
- (kwVolatile, &"volatile"),
-
- (LParen, "("),
- (RParen, ")"),
- (LBracket, "["),
- (RBracket, "]"),
- (LBrace, "{"),
- (RBrace, "}"),
-
- (QuestionMark, "?"),
- (Colon, ":"),
- (Coma, ","),
- (Semicolon, ";"),
-
- (Arrow, "->"),
- (Plus, "+"),
- (DoublePlus, "++"),
- (Minus, "-"),
- (DoubleMinus, "--"),
- (Ampersand, "&"),
- (Asterisk, "*"),
- (Slash, "/"),
- (Tilde, "~"),
- (ExclMark, "!"),
- (Percent, "%"),
- (DoubleLess, "<<"),
- (DoubleGreater, ">>"),
- (Less, "<"),
- (Greater, ">"),
- (EqualSign, "="),
- (LessEqualSign, "<="),
- (GreaterEqualSign, ">="),
- (DoubleEqualSign, "=="),
- (ExclMarkEqualSign, "!="),
- (Cap, "^"),
- (VerticalBar, "|"),
- (DoubleAmpersand, "&&"),
- (DoubleVerticalBar, "||"),
-
- (AsteriskEqualSign, "*="),
- (SlashEqualSign, "/="),
- (PercentEqualSign, "%="),
- (PlusEqualSign, "+="),
- (MinusEqualSign, "-="),
- (DoubleLessEqualSign, "<<="),
- (DoubleGreaterEqualSign, ">>="),
- (AmpersandEqualSign, "&="),
- (CapEqualSign, "^="),
- (VerticalBarEqualSign, "|="),
-
- (Hash, "#"),
- (DoubleHash, "##"),
-
- (Dot, "."),
- (DoubleDot, ".."),
- (TripleDot, "..."),
-
- (CommentStart, "/*"),
-
- (CppDefine, %"define"),
- (CppUndef, %"undef"),
- (CppIf, %"if"),
- (CppIfdef, %"ifdef"),
- (CppIfndef, %"ifndef"),
- (CppElse, %"else"),
- (CppElif, %"elif"),
- (CppEndif, %"endif"),
- (CppPragma, %"pragma")
-]
-end
-
-val printToken = fn
- Number s => printLn $ "Num: " ^ s
- | Id s => printLn $ "Id: " ^ s
- | CharConst (repr, _) => printLn repr
- | CppInclude arg =>
- let
- val (start, end', arg) =
- case arg of
- IARel v => ("\"", "\"", v)
- | IAFromRef v => ("<", ">", v)
- in
- printLn $ (str cppPrefix) ^ "include " ^ start ^ arg ^ end'
- end
- | StringConst s =>
- printLn $ "\"" ^ s ^ "\""
- | v =>
- case List.find (fn (x, _) => x = v) tokenRepr of
- SOME (_, repr) => printLn repr
- | NONE => raise TokenWithoutRepr
-
-type stream = {
- (* list of file ids, file names and file contents *)
- allFiles: (fileId * string * string) list,
- (* stack of file ids, file offsets and file contents *)
- stack: (fileId * fileOffset * string) list
-}
-
-fun calcFilePos s offset =
-let
- fun calc s cur offset (line, col) =
- if cur = offset then
- (line, col)
- else
- calc s (cur + 1) offset
- (if String.sub (s, cur) = #"\n" then (line + 1, 1)
- else (line, col + 1))
-in
- calc s 0 offset (1, 1)
-end
-
-fun printPos fileList (id, pos) =
-let
- val triple = List.find (fn (fid, _, _) => fid = id) fileList
-in
- case triple of
- NONE => raise InvalidStream
- | SOME (_, fname, contents) =>
- let
- val (line, col) = calcFilePos contents pos
- val line = Int.toString line
- val col = Int.toString col
- in
- print $ fname ^ ":" ^ line ^ ":" ^ col ^ ": "
- end
-end
-
-fun readFile fname =
-let
- open TextIO
- val h = openIn fname
- val s = inputAll h
- val () = closeIn h
-in
- s
-end
-
-fun getchar
- ({ stack = (id, off, contents) :: rest, allFiles }: stream)
- : (char * stream) =
- if off < String.size contents then
- (String.sub (contents, off),
- { stack = (id, off + 1, contents) :: rest, allFiles })
- else
- raise EndOfFile
- | getchar _ = raise EndOfStream
-
-fun ungetc
- ({ stack = (id, off, contents) :: rest, allFiles }) =
- if off = 0 then
- raise InvalidStream
- else
- { stack = (id, off - 1, contents) :: rest, allFiles }
- | ungetc _ = raise InvalidStream
-
-fun readline { stack = (fid, off, contents) :: rest, allFiles } =
- let
- val prevIsSlash =
- off > 0 andalso String.sub (contents, off - 1) = #"\\"
-
- open String
- fun read prevIsSlash offset acc =
- let
- val c = sub (contents, offset)
- in
- if offset = size contents then
- raise LineWithoutNl
- else if c = #"\n" then
- if prevIsSlash then
- read (c = #"\\") (offset + 1) (#" " :: tl acc)
- else
- (implode $ rev acc, offset + 1)
- else
- read (c = #"\\") (offset + 1) (c :: acc)
- end
-
- val (arg, newOffset) = read prevIsSlash off []
- in
- (arg, { stack = (fid, newOffset, contents) :: rest, allFiles })
- end
- | readline _ = raise InvalidStream
-
-fun getOffset ({ stack = (_, off, _) :: _, ... }: stream) = off
- | getOffset _ = raise InvalidStream
-
-fun getPosAfterCharRead
- ({ stack = (id, off, _) :: _, ... }: stream) = (id, off - 1)
- | getPosAfterCharRead _ = raise InvalidStream
-
-fun getPposFromPos (id, pos)
- { stack = (_, _, _) :: _, allFiles} =
-let
- val (fname, contents) =
- case List.find (fn (fid, _, _) => fid = id) allFiles of
- NONE => raise InvalidStream
- | SOME (_, fname, contents) => (fname, contents)
-
- val (line, col) = calcFilePos contents pos
-in
- (fname, line, SOME col)
-end
- | getPposFromPos _ _ = raise InvalidStream
-
-fun getPos ({ stack = (id, off, _) :: _, ... }: stream) = (id, off)
- | getPos _ = raise InvalidStream
-
-fun getSubstr startOff endOff
- ({ stack = (_, _, contents) :: _, ... }: stream) =
- String.substring (contents, startOff, endOff - startOff)
- | getSubstr _ _ _ = raise InvalidStream
-
-fun advanceToNewFile
- ({ stack = (_, off, contents) :: rest, allFiles }: stream) =
- if off = String.size contents then
- { stack = rest, allFiles }
- else
- raise InvalidStreamAdvance
- | advanceToNewFile _ = raise InvalidStreamAdvance
-
-fun streamInit fname =
-let
- val contents = readFile fname
-in
- { allFiles = [(0, fname, contents)], stack = [(0, 0, contents)] }
-end
-
-fun isSpace c = c = #" " orelse c = #"\t" orelse c = #"\n"
-fun isLetter c =
- ord c >= ord #"a" andalso ord c <= ord #"z" orelse
- ord c >= ord #"A" andalso ord c <= ord #"Z"
-fun isIdStart c = c = #"_" orelse isLetter c
-fun isDigit c = ord c >= ord #"0" andalso ord c <= ord #"9"
-fun isIdBody c = isIdStart c orelse isDigit c
-
-fun isPrintable c = Char.isPrint c andalso c <> #" "
-
-(* FSM for parsing symbols *)
-
-val maxStates = 51
-
-fun fsmInsert (nextState, buf) curState tk (c :: cs) =
-let
- open Array
-
- val (_, row) = sub (buf, curState)
- val potNextState = sub (row, ord c)
-in
- if potNextState <> ~1 then
- fsmInsert (nextState, buf) potNextState tk cs
- else (
- update (row, ord c, !nextState);
- nextState := !nextState + 1;
- fsmInsert (nextState, buf) (!nextState - 1) tk cs
- )
-end
- | fsmInsert (_, buf) curState tk [] =
-let
- open Array
- val (_, row) = sub (buf, curState)
-in
- update (buf, curState, (tk, row))
-end
-
-fun isStartForFsmGen () =
-let
- open Array
-
- val lookupTable = array (128, false)
-
- fun firstChr s = ord $ String.sub (s, 0)
- val () = List.app
- (fn (_, repr) => update (lookupTable, firstChr repr, true))
- $ List.filter
- (fn (_, repr) =>
- let
- val c = String.sub (repr, 0)
- in
- c <> kwPrefix andalso c <> cppPrefix
- end)
- tokenRepr
-
-in
- fn c => sub (lookupTable, ord c)
-end
-
-val isStartForFsm = isStartForFsmGen ()
-
-fun fsmTableCreate () =
-let
- open Array
-
- val T as (nextState, buf) =
- (ref 1, array (maxStates, (Invalid, array (128, ~1))))
- val r = ref 1
-
- fun filterNeeded [] acc = acc
- | filterNeeded ((T as (_, repr)) :: tks) acc =
- filterNeeded tks
- (if isStartForFsm $ String.sub (repr, 0) then
- T :: acc
- else
- acc)
-
- val tokenRepr = filterNeeded tokenRepr []
-
- fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle
- Subscript => raise FsmTableIsTooSmall
-in
- while !r < length buf do (
- update (buf, !r, (Invalid, array(128, ~1)));
- r := !r + 1
- );
-
- List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr;
- if !nextState <> maxStates then
- printLn $ "note: Fsm table size can be smaller: "
- ^ Int.toString (!nextState) ^ " is enough"
- else ();
- T
-end
-
-(* Unused right now
-fun printTable (nextState, buf) =
-let
- fun printRow i row =
- if i = length row then
- print "\n"
- else
- let
- val state = sub (row, i)
- in
- if state = ~1 then
- ()
- else
- print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", ");
- printRow (i + 1) row
- end
-
- fun print' rowNum buf =
- if rowNum = !nextState then
- ()
- else
- let
- val (tk, row) = sub (buf, rowNum)
- in
- print ((token2string tk) ^ ": ");
- printRow 0 row;
- print' (rowNum + 1) buf
- end
-in
- print ("NextState: " ^ Int.toString (!nextState) ^ "\n");
- print' 0 buf;
- print "\n"
-end
-*)
-
-val fsmTable = fsmTableCreate ()
-
-fun fsmEat stream =
-let
- open Array
- val stream = ungetc stream
- val pos = getPos stream
-
- fun get curState stream =
- let
- val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle
- _ => (NONE, stream)
- in
- case c of
- NONE => (#1 $ sub (#2 fsmTable, curState), stream)
- | SOME c =>
- let
- val (tk, row) = sub (#2 fsmTable, curState)
- val nextState = sub (row, ord c)
- in
- if nextState = ~1 then
- (tk, ungetc stream)
- else
- get nextState stream
- end
- end
-in
- (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream
-end
-
-fun isFirstOnLine (tk: fullToken) (stream: stream) =
-let
- val (id, offset) = #1 tk
-in
- case List.find (fn (fid, _, _) => id = fid) (#allFiles stream) of
- NONE => raise InvalidStream
- | SOME (_, _, contents) =>
- let
- fun returnToNL ~1 = true
- | returnToNL offset =
- let
- val chr = String.sub (contents, offset)
- in
- if chr = #"\n" then
- true
- else if isSpace chr then
- returnToNL (offset - 1)
- else
- false
- end
- in
- returnToNL (offset - 1)
- end
-end
-
-fun tkError2aug stream (dx, msg) =
-let
- val (id, pos) = getPosAfterCharRead stream
- val pos = getPposFromPos (id, pos + dx) stream
-in
- TkErrorAug (pos, msg)
-end
-
-fun parseGeneric stream parser acc =
-let
- val stream = ungetc stream
- val P as (_, startOff) = getPos stream
-
- fun parse' stream acc = let
- val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle
- _ => (NONE, stream)
-
- val (acc, tk, stream) = parser acc (stream, startOff) c handle
- TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg)
- | TkError (TkiStart, msg) =>
- let
- val startPos = getPposFromPos P stream
- in
- raise TkErrorAug (startPos, msg)
- end
- | TkError (TkiEOF, msg) =>
- let
- val (file, line, _) = getPposFromPos P stream
- in
- raise TkErrorAug ((file, line, NONE), msg)
- end
- in
- case tk of
- NONE => parse' stream acc
- | _ => (valOf tk, stream)
- end
-
- val (tk, stream) = parse' stream acc
-in
- ((P, tk): fullToken, stream)
-end
-
-fun finishSeqRead startOff stream =
-let
- val (_, endOff) = getPos stream
- val s = getSubstr startOff endOff stream
-in
- s
-end
-
-fun findKeyword s =
- case List.find
- (fn (_, repr) =>
- String.sub (repr, 0) = kwPrefix andalso
- String.extract (repr, 1, NONE) = s)
- tokenRepr
- of
- SOME (tk, _) => tk
- | NONE => Id s
-
-fun idParser () (stream, startOff) c =
-let
- fun finalize stream =
- let
- val s = finishSeqRead startOff stream
- val tk = findKeyword s
- in
- ((), SOME tk, stream)
- end
-in
- case c of
- NONE => finalize stream
- | SOME c =>
- if isIdBody c then
- ((), NONE, stream)
- else
- finalize (ungetc stream)
-end
-
-fun formCppDir (Id s) =
-let
- open String
-in
- case List.find
- (fn (_, repr) =>
- sub (repr, 0) = cppPrefix andalso
- extract (repr, 1, NONE) = s)
- tokenRepr
- of
- SOME (tk, _) => tk
- | NONE => raise ExpectedCppDir
-end
- | formCppDir kwElse = CppElse
- | formCppDir _ = raise ExpectedCppDir
-
-fun numParser () (stream, startOff) c =
-let
- fun finalize stream =
- ((), SOME $ Number (finishSeqRead startOff stream), stream)
-in
- case c of
- NONE => finalize stream
- | SOME c =>
- if isDigit c then
- ((), NONE, stream)
- else
- finalize (ungetc stream)
-end
-
-fun eatEscSeq stream =
-let
- val (c, stream) = getchar stream handle
- _ => raise TkError (TkiDx 0, "unfinished escape sequence")
-in
- (case c of
- #"\\" => #"\\"
- | #"t" => #"\t"
- | #"n" => #"\n"
- | c => c,
- stream)
-end
-
-fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2)
-
-datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm
-
-datatype seqParseMode = SpmChr | SpmStr
-
-fun seqBound SpmChr = #"'"
- | seqBound SpmStr = #"\""
-
-fun seqExnConv mode (TkError (v, msg)) =
-let
- val bound = if mode = SpmChr then "'" else "\""
- val msg =
- String.translate (fn c => if c = #"%" then bound else str c) msg
-in
- TkError (v, msg)
-end
- | seqExnConv _ _ = raise Unreachable
-
-fun unfinishedSeq SpmChr = "unfinished character constant"
- | unfinishedSeq SpmStr = "unfinished string literal"
-
-fun seqParser mode SeqInit (stream, _) (SOME c) =
- if seqBound mode = c then
- (SeqStart, NONE, stream)
- else
- raise Unreachable
- | seqParser mode SeqStart (stream, _) (SOME c) =
- if c <> seqBound mode then
- let
- val (c, stream) =
- if c <> #"\\" then (c, stream) else eatEscSeq stream
- in
- (SeqValue (ord c), NONE, stream)
- end
- else if mode = SpmStr then
- (SeqTerm, SOME $ StringConst "", stream)
- else
- raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %")
- | seqParser mode (SeqValue v) (stream, startOff) (SOME c) =
- if seqBound mode = c then
- let
- fun term s v =
- if mode = SpmChr then
- CharConst (s, v)
- else
- StringConst $ stringCut s
- in
- (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream)
- end
- else if mode = SpmStr then
- let
- val (_, stream) =
- if c <> #"\\" then (c, stream) else eatEscSeq stream
- in
- (SeqValue v, NONE, stream)
- end
- else
- raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value")
- | seqParser _ SeqTerm _ (SOME _) =
- raise Unreachable
- | seqParser mode state (_, _) NONE =
- raise case state of
- SeqInit => Unreachable
- | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
- | SeqValue _ =>
- seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
- | SeqTerm => Unreachable
-
-val charParser = seqParser SpmChr
-val strParser = seqParser SpmStr
-
-fun readIncludeArg stream =
-let
- open String
-
- fun triml s idx =
- if idx = size s then
- ""
- else if isSpace $ sub (s, idx) then
- triml s (idx + 1)
- else
- extract (s, idx, NONE)
-
- fun trimr s idx =
- if idx = 0 then
- ""
- else if isSpace $ sub (s, idx) then
- trimr s (idx - 1)
- else
- extract (s, 0, SOME $ idx + 1)
-
- fun trim s = triml (trimr s (size s - 1)) 0
-
- fun getLinePos () =
- let
- val (fname, line, _) = getPposFromPos (getPos stream) stream
- in
- (fname, line, NONE)
- end
-
- fun determineType s =
- let
- fun --> msg = raise TkErrorAug (getLinePos (), msg)
- fun isLast c = sub (s, size s - 1) = c
- in
- if s = "" then
- --> "#include argument is empty"
- else
- case sub (s, 0) of
- #"<" =>
- if isLast #">" then
- IAFromRef $ stringCut s
- else
- --> "expected > at #include argument end"
- | #"\"" =>
- if isLast #"\"" then
- IARel $ stringCut s
- else
- --> "expected \" at #include argument end"
- | _ => --> "#include argument should start with \" or <"
- end
-
- val (arg, stream) = readline stream handle
- LineWithoutNl =>
- raise TkErrorAug (getLinePos (),
- "#include line does not end with \\n")
-in
- (determineType $ trim arg, stream)
-end
-
-fun postprocessCppDir tk tkl stream =
-let
- val isCppDir =
- (fn Hash => true | _ => false) (#2 $ hd tkl)
- andalso isFirstOnLine (hd tkl) stream
- handle Empty => false
- val (pos, tk') = tk
-
- fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl)
-in
- if isCppDir andalso tk' = Id "include" then
- let
- val (arg, stream) = readIncludeArg stream
- in
- (conv $ CppInclude arg, stream)
- end
- else if isCppDir then
- (conv $ formCppDir tk', stream) handle
- ExpectedCppDir =>
- raise TkErrorAug (getPposFromPos pos stream,
- "expected preprocessor directive")
-
- else
- (tk :: tkl, stream)
-end
-
-fun unexpectedCharRaise stream c =
-let
- val (id, pos) = getPosAfterCharRead stream
- val pos = getPposFromPos (id, pos) stream
- val repr =
- if isPrintable c then
- str c
- else
- "<" ^ Int.toString (ord c) ^ ">"
-in
- raise TkErrorAug (pos, "unexpected character " ^ repr)
-end
-
-fun skipComment stream pos =
-let
- fun skip prevIsAsterisk stream =
- let
- val (c, stream) = getchar stream
- in
- if prevIsAsterisk andalso c = #"/" then
- stream
- else
- skip (c = #"*") stream
- end
-in
- skip false stream handle
- EndOfFile =>
- let
- val pos = getPposFromPos pos stream
- in
- raise TkErrorAug (pos, "unfinished comment")
- end
-end
-
-fun handleBackslash stream =
-let
- val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle
- _ => (NONE, stream)
-
- val raiseErr = fn () =>
- let
- val pos = getPosAfterCharRead stream
- val pos = getPposFromPos pos stream
- in
- raise TkErrorAug (pos, "expected \\n after backslash")
- end
-in
- case c of
- SOME c =>
- if c = #"\n" then
- stream
- else
- raiseErr ()
- | NONE => raiseErr ()
-end
-
-fun processSymbol stream tkl =
-let
- val (T as (p as (fid, off), tk), stream) = fsmEat stream
-in
- case tk of
- CommentStart => tokenize (skipComment stream p) tkl
- | DoubleDot => tokenize stream (((fid, off + 1), Dot) :: (p, Dot) :: tkl)
- | _ => tokenize stream (T :: tkl)
-end
-
-and tokenize stream tkl =
-let
- fun getcharSkipEof stream = getchar stream handle
- EndOfFile => getcharSkipEof (advanceToNewFile stream)
-
- val (c, stream) = (fn (c, s) => (SOME c, s)) $ getcharSkipEof stream
- handle
- EndOfStream => (NONE, stream)
-
- fun cont (tk, stream) = tokenize stream (tk :: tkl)
- fun @-> parser acc = cont $ parseGeneric stream parser acc
-in
- case c of
- NONE => (rev tkl, #allFiles stream)
- | SOME c =>
- if isSpace c then
- tokenize stream tkl
- else if isIdStart c then
- let
- val (tk, stream) = parseGeneric stream idParser ()
- val (tkl, stream) = postprocessCppDir tk tkl stream
- in
- tokenize stream tkl
- end
- else if isDigit c then
- @-> numParser ()
- else if c = #"'" then
- @-> charParser SeqInit
- else if c = #"\"" then
- @-> strParser SeqInit
- else if isStartForFsm c then
- processSymbol stream tkl
- else if c = #"\\" then
- tokenize (handleBackslash stream) tkl
- else if c = #"\f" then
- tokenize stream tkl
- else
- unexpectedCharRaise stream c
-end
-
fun main [fname] =
let
- val stream = streamInit fname
- val (tkl, fileList) = tokenize stream []
+ val stream = Stream.streamInit fname
+ val (tkl, fileList) = Tokenizer.tokenize stream []
in
- List.app (fn (p, x) => (printPos fileList p; printToken x)) tkl
+ List.app
+ (fn (p, x) => (Stream.printPos fileList p; Tokenizer.printToken x)) tkl
end
| main _ = printLn "Expected a signle argument: file name"
diff --git a/exn_handler.sml b/exn_handler.sml
new file mode 100644
index 0000000..2e3a0b2
--- /dev/null
+++ b/exn_handler.sml
@@ -0,0 +1,46 @@
+structure GlobalExnHandler: sig val handler: exn -> unit end = struct
+
+ fun eprint s = printLn $ "error: " ^ s
+
+ fun otherExn e =
+ let
+ val hist = MLton.Exn.history e
+ in
+ eprint $ "exception " ^ exnMessage e ^ " was raised";
+ if hist = [] then
+ (printLn "No stack trace is avaliable";
+ printLn "Recompile with -const \"Exn.keepHistory true\"")
+ else
+ List.app (fn x => printLn $ "\t" ^ x) hist
+ end
+
+ fun exit code = Posix.Process.exit $ Word8.fromInt code
+
+ fun ioExn (IO.Io { name, function = _, cause }) =
+ let
+ open OS
+ val prefix = name ^ ": "
+ val reason =
+ case cause of
+ SysErr (str, _) => str
+ | _ => exnMessage cause
+ in
+ printLn $ prefix ^ reason
+ end
+ | ioExn _ = (printLn "ioExn: unreachable"; exit 254)
+
+ fun handler e =
+ let
+ open Tokenizer
+ in
+ (case e of
+ FsmTableIsTooSmall =>
+ eprint "fsm table is too small. Increate 'maxState' value"
+ | IO.Io _ => ioExn e
+ | TkErrorAug (pos, msg) => eprint $ Stream.pos2str pos ^ ": " ^ msg
+ | _ => otherExn e;
+ exit 255)
+ end
+end
+
+val () = MLton.Exn.setTopLevelHandler GlobalExnHandler.handler
diff --git a/general.sml b/general.sml
new file mode 100644
index 0000000..cde0514
--- /dev/null
+++ b/general.sml
@@ -0,0 +1,26 @@
+fun $ (x, y) = x y
+infixr 0 $
+
+fun printLn s = (print s; print "\n")
+
+fun lazy thunk =
+let
+ datatype 'a value =
+ Unevaluated of unit -> 'a |
+ Evaluated of 'a |
+ Exn of exn
+
+ val value = ref $ Unevaluated thunk
+in
+ fn () =>
+ case !value of
+ Unevaluated th =>
+ let
+ val x = th () handle e => (value := Exn e; raise e)
+ val () = value := Evaluated x
+ in
+ x
+ end
+ | Evaluated v => v
+ | Exn e => raise e
+end
diff --git a/stream.sig b/stream.sig
new file mode 100644
index 0000000..98cfb65
--- /dev/null
+++ b/stream.sig
@@ -0,0 +1,36 @@
+signature STREAM = sig
+ type fileId = int
+ type fileOffset = int
+ type pos = fileId * fileOffset
+ type convPos = string * int * int option
+
+ type filesInfo = (fileId * string * string) list
+
+ exception EndOfStream
+ exception EndOfFile
+
+ exception LineWithoutNl
+
+ type t
+
+ val extractFilesInfo: t -> filesInfo
+
+ val pos2str: convPos -> string
+ val printPos: (fileId * string * string) list -> pos -> unit
+
+ val getchar: t -> char * t
+ val ungetc: t -> t
+ val readline: t -> string * t
+
+ val getOffset: t -> fileOffset
+ val getPos: t -> pos
+ val getPosAfterCharRead: t -> pos
+ val getPposFromPos: pos -> t -> convPos
+
+ val getSubstr: fileOffset -> fileOffset -> t -> string
+ val isFirstOnLine: pos -> t -> bool
+
+ val advanceToNewFile: t -> t
+
+ val streamInit: string -> t
+end
diff --git a/stream.sml b/stream.sml
new file mode 100644
index 0000000..10b02cc
--- /dev/null
+++ b/stream.sml
@@ -0,0 +1,185 @@
+structure Stream :> STREAM = struct
+ type fileId = int
+ type fileOffset = int
+ type pos = fileId * fileOffset
+ type convPos = string * int * int option
+
+ type filesInfo = (fileId * string * string) list
+
+ exception EndOfStream
+ exception EndOfFile
+
+ (* unreachable *)
+ exception InvalidStream
+ exception InvalidStreamAdvance
+
+ exception LineWithoutNl
+
+ fun pos2str (pos, line, col) =
+ let
+ val % = Int.toString
+ in
+ case col of
+ SOME col => pos ^ ":" ^ %line ^ ":" ^ %col
+ | NONE => pos ^ ":" ^ %line
+ end
+
+ type t = {
+ (* stack of file ids, file offsets and file contents *)
+ stack: (fileId * fileOffset * string) list,
+ (* list of file ids, file names and file contents *)
+ allFiles: filesInfo
+ }
+
+ fun extractFilesInfo (s: t) = #allFiles s
+
+ fun calcFilePos s offset =
+ let
+ fun calc s cur offset (line, col) =
+ if cur = offset then
+ (line, col)
+ else
+ calc s (cur + 1) offset
+ (if String.sub (s, cur) = #"\n" then (line + 1, 1)
+ else (line, col + 1))
+ in
+ calc s 0 offset (1, 1)
+ end
+
+ fun printPos fileList (id, pos) =
+ let
+ val triple = List.find (fn (fid, _, _) => fid = id) fileList
+ in
+ case triple of
+ NONE => raise InvalidStream
+ | SOME (_, fname, contents) =>
+ let
+ val (line, col) = calcFilePos contents pos
+ val line = Int.toString line
+ val col = Int.toString col
+ in
+ print $ fname ^ ":" ^ line ^ ":" ^ col ^ ": "
+ end
+ end
+
+ fun readFile fname =
+ let
+ open TextIO
+ val h = openIn fname
+ val s = inputAll h
+ val () = closeIn h
+ in
+ s
+ end
+
+ fun getchar
+ ({ stack = (id, off, contents) :: rest, allFiles }: t)
+ : (char * t) =
+ if off < String.size contents then
+ (String.sub (contents, off),
+ { stack = (id, off + 1, contents) :: rest, allFiles })
+ else
+ raise EndOfFile
+ | getchar _ = raise EndOfStream
+
+ fun ungetc
+ ({ stack = (id, off, contents) :: rest, allFiles }) =
+ if off = 0 then
+ raise InvalidStream
+ else
+ { stack = (id, off - 1, contents) :: rest, allFiles }
+ | ungetc _ = raise InvalidStream
+
+ fun readline { stack = (fid, off, contents) :: rest, allFiles } =
+ let
+ val prevIsSlash =
+ off > 0 andalso String.sub (contents, off - 1) = #"\\"
+
+ open String
+ fun read prevIsSlash offset acc =
+ let
+ val c = sub (contents, offset)
+ in
+ if offset = size contents then
+ raise LineWithoutNl
+ else if c = #"\n" then
+ if prevIsSlash then
+ read (c = #"\\") (offset + 1) (#" " :: tl acc)
+ else
+ (implode $ rev acc, offset + 1)
+ else
+ read (c = #"\\") (offset + 1) (c :: acc)
+ end
+
+ val (arg, newOffset) = read prevIsSlash off []
+ in
+ (arg, { stack = (fid, newOffset, contents) :: rest, allFiles })
+ end
+ | readline _ = raise InvalidStream
+
+ fun getOffset ({ stack = (_, off, _) :: _, ... }: t) = off
+ | getOffset _ = raise InvalidStream
+
+ fun getPosAfterCharRead
+ ({ stack = (id, off, _) :: _, ... }: t) = (id, off - 1)
+ | getPosAfterCharRead _ = raise InvalidStream
+
+ fun getPposFromPos (id, pos)
+ { stack = (_, _, _) :: _, allFiles} =
+ let
+ val (fname, contents) =
+ case List.find (fn (fid, _, _) => fid = id) allFiles of
+ NONE => raise InvalidStream
+ | SOME (_, fname, contents) => (fname, contents)
+
+ val (line, col) = calcFilePos contents pos
+ in
+ (fname, line, SOME col)
+ end
+ | getPposFromPos _ _ = raise InvalidStream
+
+ fun getPos ({ stack = (id, off, _) :: _, ... }: t) = (id, off)
+ | getPos _ = raise InvalidStream
+
+ fun getSubstr startOff endOff
+ ({ stack = (_, _, contents) :: _, ... }: t) =
+ String.substring (contents, startOff, endOff - startOff)
+ | getSubstr _ _ _ = raise InvalidStream
+
+ fun advanceToNewFile
+ ({ stack = (_, off, contents) :: rest, allFiles }: t) =
+ if off = String.size contents then
+ { stack = rest, allFiles }
+ else
+ raise InvalidStreamAdvance
+ | advanceToNewFile _ = raise InvalidStreamAdvance
+
+ fun streamInit fname =
+ let
+ val contents = readFile fname
+ in
+ { allFiles = [(0, fname, contents)], stack = [(0, 0, contents)] }
+ end
+
+ fun isFirstOnLine (id, offset) (stream: t) =
+ case List.find (fn (fid, _, _) => id = fid) (#allFiles stream) of
+ NONE => raise InvalidStream
+ | SOME (_, _, contents) =>
+ let
+ fun returnToNL ~1 = true
+ | returnToNL offset =
+ let
+ val chr = String.sub (contents, offset)
+ in
+ if chr = #"\n" then
+ true
+ else if Char.isSpace chr then
+ returnToNL (offset - 1)
+ else
+ false
+ end
+ in
+ returnToNL (offset - 1)
+ end
+
+end
diff --git a/tokenizer.sig b/tokenizer.sig
new file mode 100644
index 0000000..c31a7e6
--- /dev/null
+++ b/tokenizer.sig
@@ -0,0 +1,12 @@
+signature TOKENIZER = sig
+ type token
+ type fullToken = Stream.pos * token
+
+ (* Fatal. both may be thrown by tokenize *)
+ exception FsmTableIsTooSmall
+ exception TkErrorAug of Stream.convPos * string
+
+ val tokenize: Stream.t -> fullToken list -> fullToken list * Stream.filesInfo
+
+ val printToken: token -> unit
+end
diff --git a/tokenizer.sml b/tokenizer.sml
new file mode 100644
index 0000000..53bb396
--- /dev/null
+++ b/tokenizer.sml
@@ -0,0 +1,787 @@
+structure Tokenizer:> TOKENIZER = struct
+ datatype includeArg = IARel of string | IAFromRef of string
+
+ datatype token =
+ Invalid |
+ Number of string |
+ Id of string |
+ CharConst of string * int |
+ StringConst of string |
+
+ kwBreak |
+ kwCase |
+ kwChar |
+ kwConst |
+ kwContinue |
+ kwDefault |
+ kwDouble |
+ kwElse |
+ kwEnum |
+ kwExtern |
+ kwFloat |
+ kwFor |
+ kwGoto |
+ kwInt |
+ kwLong |
+ kwRegister |
+ kwReturn |
+ kwShort |
+ kwSigned |
+ kwSizeof |
+ kwStruct |
+ kwSwitch |
+ kwTypedef |
+ kwUnion |
+ kwUnsigned |
+ kwVoid |
+ kwVolatile |
+
+ LParen |
+ RParen |
+ LBracket |
+ RBracket |
+ LBrace |
+ RBrace |
+
+ QuestionMark |
+ Colon |
+ Coma |
+ Semicolon |
+
+ Arrow |
+ Plus |
+ DoublePlus|
+ Minus |
+ DoubleMinus |
+ Ampersand |
+ Asterisk |
+ Slash |
+ Tilde |
+ ExclMark |
+ Percent |
+ DoubleGreater |
+ DoubleLess |
+ Greater |
+ Less |
+ EqualSign |
+ LessEqualSign |
+ GreaterEqualSign |
+ DoubleEqualSign |
+ ExclMarkEqualSign |
+ Cap |
+ VerticalBar |
+ DoubleAmpersand |
+ DoubleVerticalBar |
+
+ AsteriskEqualSign |
+ SlashEqualSign |
+ PercentEqualSign |
+ PlusEqualSign |
+ MinusEqualSign |
+ DoubleLessEqualSign |
+ DoubleGreaterEqualSign |
+ AmpersandEqualSign |
+ CapEqualSign |
+ VerticalBarEqualSign |
+
+ Hash |
+ DoubleHash |
+
+ Dot |
+ DoubleDot |
+ TripleDot |
+
+ CommentStart |
+
+ CppInclude of includeArg |
+ CppDefine |
+ CppUndef |
+ CppIf |
+ CppIfdef |
+ CppIfndef |
+ CppElse |
+ CppElif |
+ CppEndif |
+ CppPragma
+
+ val kwPrefix = #"@"
+ val cppPrefix = #"$"
+
+ type fullToken = Stream.pos * token
+
+ datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart
+
+ exception TkError of tkErrorAuxInfo * string
+ exception TkErrorAug of Stream.convPos * string
+
+ exception ExpectedCppDir (* handled in postprocess *)
+
+ exception FsmTableIsTooSmall
+
+ (* Unreachable (should be) *)
+ exception Unreachable
+ exception TokenWithoutRepr
+
+ val tokenRepr =
+ let
+ fun & repr = str kwPrefix ^ repr
+ fun % repr = str cppPrefix ^ repr
+ in
+ [
+ (kwBreak, &"break"),
+ (kwCase, &"case"),
+ (kwChar, &"char"),
+ (kwConst, &"const"),
+ (kwContinue, &"continue"),
+ (kwDefault, &"default"),
+ (kwDouble, &"double"),
+ (kwElse, &"else"),
+ (kwEnum, &"enum"),
+ (kwExtern, &"extern"),
+ (kwFloat, &"float"),
+ (kwFor, &"for"),
+ (kwGoto, &"goto"),
+ (kwInt, &"int"),
+ (kwLong, &"long"),
+ (kwRegister, &"register"),
+ (kwReturn, &"return"),
+ (kwShort, &"short"),
+ (kwSigned, &"signed"),
+ (kwSizeof, &"sizeof"),
+ (kwStruct, &"struct"),
+ (kwSwitch, &"switch"),
+ (kwTypedef, &"typedef"),
+ (kwUnion, &"union"),
+ (kwUnsigned, &"unsigned"),
+ (kwVoid, &"void"),
+ (kwVolatile, &"volatile"),
+
+ (LParen, "("),
+ (RParen, ")"),
+ (LBracket, "["),
+ (RBracket, "]"),
+ (LBrace, "{"),
+ (RBrace, "}"),
+
+ (QuestionMark, "?"),
+ (Colon, ":"),
+ (Coma, ","),
+ (Semicolon, ";"),
+
+ (Arrow, "->"),
+ (Plus, "+"),
+ (DoublePlus, "++"),
+ (Minus, "-"),
+ (DoubleMinus, "--"),
+ (Ampersand, "&"),
+ (Asterisk, "*"),
+ (Slash, "/"),
+ (Tilde, "~"),
+ (ExclMark, "!"),
+ (Percent, "%"),
+ (DoubleLess, "<<"),
+ (DoubleGreater, ">>"),
+ (Less, "<"),
+ (Greater, ">"),
+ (EqualSign, "="),
+ (LessEqualSign, "<="),
+ (GreaterEqualSign, ">="),
+ (DoubleEqualSign, "=="),
+ (ExclMarkEqualSign, "!="),
+ (Cap, "^"),
+ (VerticalBar, "|"),
+ (DoubleAmpersand, "&&"),
+ (DoubleVerticalBar, "||"),
+
+ (AsteriskEqualSign, "*="),
+ (SlashEqualSign, "/="),
+ (PercentEqualSign, "%="),
+ (PlusEqualSign, "+="),
+ (MinusEqualSign, "-="),
+ (DoubleLessEqualSign, "<<="),
+ (DoubleGreaterEqualSign, ">>="),
+ (AmpersandEqualSign, "&="),
+ (CapEqualSign, "^="),
+ (VerticalBarEqualSign, "|="),
+
+ (Hash, "#"),
+ (DoubleHash, "##"),
+
+ (Dot, "."),
+ (DoubleDot, ".."),
+ (TripleDot, "..."),
+
+ (CommentStart, "/*"),
+
+ (CppDefine, %"define"),
+ (CppUndef, %"undef"),
+ (CppIf, %"if"),
+ (CppIfdef, %"ifdef"),
+ (CppIfndef, %"ifndef"),
+ (CppElse, %"else"),
+ (CppElif, %"elif"),
+ (CppEndif, %"endif"),
+ (CppPragma, %"pragma")
+ ]
+ end
+
+ val printToken = fn
+ Number s => printLn $ "Num: " ^ s
+ | Id s => printLn $ "Id: " ^ s
+ | CharConst (repr, _) => printLn repr
+ | CppInclude arg =>
+ let
+ val (start, end', arg) =
+ case arg of
+ IARel v => ("\"", "\"", v)
+ | IAFromRef v => ("<", ">", v)
+ in
+ printLn $ (str cppPrefix) ^ "include " ^ start ^ arg ^ end'
+ end
+ | StringConst s =>
+ printLn $ "\"" ^ s ^ "\""
+ | v =>
+ case List.find (fn (x, _) => x = v) tokenRepr of
+ SOME (_, repr) => printLn repr
+ | NONE => raise TokenWithoutRepr
+
+ fun isIdStart c = Char.isAlpha c orelse c = #"_"
+ fun isIdBody c = Char.isAlphaNum c orelse c = #"_"
+
+ fun isPrintable c = Char.isPrint c andalso c <> #" "
+
+ (* FSM for parsing symbols *)
+
+ val maxStates = 51
+
+ fun fsmInsert (nextState, buf) curState tk (c :: cs) =
+ let
+ open Array
+
+ val (_, row) = sub (buf, curState)
+ val potNextState = sub (row, ord c)
+ in
+ if potNextState <> ~1 then
+ fsmInsert (nextState, buf) potNextState tk cs
+ else (
+ update (row, ord c, !nextState);
+ nextState := !nextState + 1;
+ fsmInsert (nextState, buf) (!nextState - 1) tk cs
+ )
+ end
+ | fsmInsert (_, buf) curState tk [] =
+ let
+ open Array
+ val (_, row) = sub (buf, curState)
+ in
+ update (buf, curState, (tk, row))
+ end
+
+ fun isStartForFsmGen () =
+ let
+ open Array
+
+ val lookupTable = array (128, false)
+
+ fun firstChr s = ord $ String.sub (s, 0)
+ val () = List.app
+ (fn (_, repr) => update (lookupTable, firstChr repr, true))
+ $ List.filter
+ (fn (_, repr) =>
+ let
+ val c = String.sub (repr, 0)
+ in
+ c <> kwPrefix andalso c <> cppPrefix
+ end)
+ tokenRepr
+
+ in
+ fn c => sub (lookupTable, ord c)
+ end
+
+ val isStartForFsm = isStartForFsmGen ()
+
+ fun fsmTableCreate () =
+ let
+ open Array
+
+ val T as (nextState, buf) =
+ (ref 1, array (maxStates, (Invalid, array (128, ~1))))
+ val r = ref 1
+
+ fun filterNeeded [] acc = acc
+ | filterNeeded ((T as (_, repr)) :: tks) acc =
+ filterNeeded tks
+ (if isStartForFsm $ String.sub (repr, 0) then
+ T :: acc
+ else
+ acc)
+
+ val tokenRepr = filterNeeded tokenRepr []
+
+ fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle
+ Subscript => raise FsmTableIsTooSmall
+ in
+ while !r < length buf do (
+ update (buf, !r, (Invalid, array(128, ~1)));
+ r := !r + 1
+ );
+
+ List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr;
+ if !nextState <> maxStates then
+ printLn $ "note: Fsm table size can be smaller: "
+ ^ Int.toString (!nextState) ^ " is enough"
+ else ();
+ T
+ end
+
+ (* Unused right now
+ fun printTable (nextState, buf) =
+ let
+ fun printRow i row =
+ if i = length row then
+ print "\n"
+ else
+ let
+ val state = sub (row, i)
+ in
+ if state = ~1 then
+ ()
+ else
+ print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", ");
+ printRow (i + 1) row
+ end
+
+ fun print' rowNum buf =
+ if rowNum = !nextState then
+ ()
+ else
+ let
+ val (tk, row) = sub (buf, rowNum)
+ in
+ print ((token2string tk) ^ ": ");
+ printRow 0 row;
+ print' (rowNum + 1) buf
+ end
+ in
+ print ("NextState: " ^ Int.toString (!nextState) ^ "\n");
+ print' 0 buf;
+ print "\n"
+ end
+ *)
+
+ val fsmTable = lazy fsmTableCreate
+
+ fun fsmEat stream =
+ let
+ open Array
+ val stream = Stream.ungetc stream
+ val pos = Stream.getPos stream
+
+ fun get curState stream =
+ let
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream
+ handle
+ _ => (NONE, stream)
+ in
+ case c of
+ NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream)
+ | SOME c =>
+ let
+ val (tk, row) = sub (#2 $ fsmTable (), curState)
+ val nextState = sub (row, ord c)
+ in
+ if nextState = ~1 then
+ (tk, Stream.ungetc stream)
+ else
+ get nextState stream
+ end
+ end
+ in
+ (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream
+ end
+
+ fun tkError2aug stream (dx, msg) =
+ let
+ val (id, pos) = Stream.getPosAfterCharRead stream
+ val pos = Stream.getPposFromPos (id, pos + dx) stream
+ in
+ TkErrorAug (pos, msg)
+ end
+
+ fun parseGeneric stream parser acc =
+ let
+ val stream = Stream.ungetc stream
+ val P as (_, startOff) = Stream.getPos stream
+
+ fun parse' stream acc = let
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream handle
+ _ => (NONE, stream)
+
+ val (acc, tk, stream) = parser acc (stream, startOff) c handle
+ TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg)
+ | TkError (TkiStart, msg) =>
+ let
+ val startPos = Stream.getPposFromPos P stream
+ in
+ raise TkErrorAug (startPos, msg)
+ end
+ | TkError (TkiEOF, msg) =>
+ let
+ val (file, line, _) = Stream.getPposFromPos P stream
+ in
+ raise TkErrorAug ((file, line, NONE), msg)
+ end
+ in
+ case tk of
+ NONE => parse' stream acc
+ | _ => (valOf tk, stream)
+ end
+
+ val (tk, stream) = parse' stream acc
+ in
+ ((P, tk): fullToken, stream)
+ end
+
+ fun finishSeqRead startOff stream =
+ let
+ val (_, endOff) = Stream.getPos stream
+ val s = Stream.getSubstr startOff endOff stream
+ in
+ s
+ end
+
+ fun findKeyword s =
+ case List.find
+ (fn (_, repr) =>
+ String.sub (repr, 0) = kwPrefix andalso
+ String.extract (repr, 1, NONE) = s)
+ tokenRepr
+ of
+ SOME (tk, _) => tk
+ | NONE => Id s
+
+ fun idParser () (stream, startOff) c =
+ let
+ fun finalize stream =
+ let
+ val s = finishSeqRead startOff stream
+ val tk = findKeyword s
+ in
+ ((), SOME tk, stream)
+ end
+ in
+ case c of
+ NONE => finalize stream
+ | SOME c =>
+ if isIdBody c then
+ ((), NONE, stream)
+ else
+ finalize (Stream.ungetc stream)
+ end
+
+ fun formCppDir (Id s) =
+ let
+ open String
+ in
+ case List.find
+ (fn (_, repr) =>
+ sub (repr, 0) = cppPrefix andalso
+ extract (repr, 1, NONE) = s)
+ tokenRepr
+ of
+ SOME (tk, _) => tk
+ | NONE => raise ExpectedCppDir
+ end
+ | formCppDir kwElse = CppElse
+ | formCppDir _ = raise ExpectedCppDir
+
+ fun numParser () (stream, startOff) c =
+ let
+ fun finalize stream =
+ ((), SOME $ Number (finishSeqRead startOff stream), stream)
+ in
+ case c of
+ NONE => finalize stream
+ | SOME c =>
+ if Char.isDigit c then
+ ((), NONE, stream)
+ else
+ finalize (Stream.ungetc stream)
+ end
+
+ fun eatEscSeq stream =
+ let
+ val (c, stream) = Stream.getchar stream handle
+ _ => raise TkError (TkiDx 0, "unfinished escape sequence")
+ in
+ (case c of
+ #"\\" => #"\\"
+ | #"t" => #"\t"
+ | #"n" => #"\n"
+ | c => c,
+ stream)
+ end
+
+ fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2)
+
+ datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm
+
+ datatype seqParseMode = SpmChr | SpmStr
+
+ fun seqBound SpmChr = #"'"
+ | seqBound SpmStr = #"\""
+
+ fun seqExnConv mode (TkError (v, msg)) =
+ let
+ val bound = if mode = SpmChr then "'" else "\""
+ val msg =
+ String.translate (fn c => if c = #"%" then bound else str c) msg
+ in
+ TkError (v, msg)
+ end
+ | seqExnConv _ _ = raise Unreachable
+
+ fun unfinishedSeq SpmChr = "unfinished character constant"
+ | unfinishedSeq SpmStr = "unfinished string literal"
+
+ fun seqParser mode SeqInit (stream, _) (SOME c) =
+ if seqBound mode = c then
+ (SeqStart, NONE, stream)
+ else
+ raise Unreachable
+ | seqParser mode SeqStart (stream, _) (SOME c) =
+ if c <> seqBound mode then
+ let
+ val (c, stream) =
+ if c <> #"\\" then (c, stream) else eatEscSeq stream
+ in
+ (SeqValue (ord c), NONE, stream)
+ end
+ else if mode = SpmStr then
+ (SeqTerm, SOME $ StringConst "", stream)
+ else
+ raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %")
+ | seqParser mode (SeqValue v) (stream, startOff) (SOME c) =
+ if seqBound mode = c then
+ let
+ fun term s v =
+ if mode = SpmChr then
+ CharConst (s, v)
+ else
+ StringConst $ stringCut s
+ in
+ (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream)
+ end
+ else if mode = SpmStr then
+ let
+ val (_, stream) =
+ if c <> #"\\" then (c, stream) else eatEscSeq stream
+ in
+ (SeqValue v, NONE, stream)
+ end
+ else
+ raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value")
+ | seqParser _ SeqTerm _ (SOME _) =
+ raise Unreachable
+ | seqParser mode state (_, _) NONE =
+ raise case state of
+ SeqInit => Unreachable
+ | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
+ | SeqValue _ =>
+ seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
+ | SeqTerm => Unreachable
+
+ val charParser = seqParser SpmChr
+ val strParser = seqParser SpmStr
+
+ fun readIncludeArg stream =
+ let
+ open String
+
+ fun triml s idx =
+ if idx = size s then
+ ""
+ else if Char.isSpace $ sub (s, idx) then
+ triml s (idx + 1)
+ else
+ extract (s, idx, NONE)
+
+ fun trimr s idx =
+ if idx = 0 then
+ ""
+ else if Char.isSpace $ sub (s, idx) then
+ trimr s (idx - 1)
+ else
+ extract (s, 0, SOME $ idx + 1)
+
+ fun trim s = triml (trimr s (size s - 1)) 0
+
+ fun getLinePos () =
+ let
+ val (fname, line, _) = Stream.getPposFromPos (Stream.getPos stream) stream
+ in
+ (fname, line, NONE)
+ end
+
+ fun determineType s =
+ let
+ fun --> msg = raise TkErrorAug (getLinePos (), msg)
+ fun isLast c = sub (s, size s - 1) = c
+ in
+ if s = "" then
+ --> "#include argument is empty"
+ else
+ case sub (s, 0) of
+ #"<" =>
+ if isLast #">" then
+ IAFromRef $ stringCut s
+ else
+ --> "expected > at #include argument end"
+ | #"\"" =>
+ if isLast #"\"" then
+ IARel $ stringCut s
+ else
+ --> "expected \" at #include argument end"
+ | _ => --> "#include argument should start with \" or <"
+ end
+
+ val (arg, stream) = Stream.readline stream handle
+ Stream.LineWithoutNl =>
+ raise TkErrorAug (getLinePos (),
+ "#include line does not end with \\n")
+ in
+ (determineType $ trim arg, stream)
+ end
+
+ fun postprocessCppDir tk tkl stream =
+ let
+ val isCppDir =
+ (fn Hash => true | _ => false) (#2 $ hd tkl)
+ andalso Stream.isFirstOnLine (#1 $ hd tkl) stream
+ handle Empty => false
+ val (pos, tk') = tk
+
+ fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl)
+ in
+ if isCppDir andalso tk' = Id "include" then
+ let
+ val (arg, stream) = readIncludeArg stream
+ in
+ (conv $ CppInclude arg, stream)
+ end
+ else if isCppDir then
+ (conv $ formCppDir tk', stream) handle
+ ExpectedCppDir =>
+ raise TkErrorAug (Stream.getPposFromPos pos stream,
+ "expected preprocessor directive")
+
+ else
+ (tk :: tkl, stream)
+ end
+
+ fun unexpectedCharRaise stream c =
+ let
+ val (id, pos) = Stream.getPosAfterCharRead stream
+ val pos = Stream.getPposFromPos (id, pos) stream
+ val repr =
+ if isPrintable c then
+ str c
+ else
+ "<" ^ Int.toString (ord c) ^ ">"
+ in
+ raise TkErrorAug (pos, "unexpected character " ^ repr)
+ end
+
+ fun skipComment stream pos =
+ let
+ fun skip prevIsAsterisk stream =
+ let
+ val (c, stream) = Stream.getchar stream
+ in
+ if prevIsAsterisk andalso c = #"/" then
+ stream
+ else
+ skip (c = #"*") stream
+ end
+ in
+ skip false stream handle
+ Stream.EndOfFile =>
+ let
+ val pos = Stream.getPposFromPos pos stream
+ in
+ raise TkErrorAug (pos, "unfinished comment")
+ end
+ end
+
+ fun handleBackslash stream =
+ let
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream handle
+ _ => (NONE, stream)
+
+ val raiseErr = fn () =>
+ let
+ val pos = Stream.getPosAfterCharRead stream
+ val pos = Stream.getPposFromPos pos stream
+ in
+ raise TkErrorAug (pos, "expected \\n after backslash")
+ end
+ in
+ case c of
+ SOME c =>
+ if c = #"\n" then
+ stream
+ else
+ raiseErr ()
+ | NONE => raiseErr ()
+ end
+
+ fun processSymbol stream tkl =
+ let
+ val (T as (p as (fid, off), tk), stream) = fsmEat stream
+ in
+ case tk of
+ CommentStart => tokenize (skipComment stream p) tkl
+ | DoubleDot => tokenize stream (((fid, off + 1), Dot) :: (p, Dot) :: tkl)
+ | _ => tokenize stream (T :: tkl)
+ end
+
+ and tokenize stream tkl =
+ let
+ fun getcharSkipEof stream = Stream.getchar stream handle
+ Stream.EndOfFile => getcharSkipEof (Stream.advanceToNewFile stream)
+
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ getcharSkipEof stream
+ handle
+ Stream.EndOfStream => (NONE, stream)
+
+ fun cont (tk, stream) = tokenize stream (tk :: tkl)
+ fun @-> parser acc = cont $ parseGeneric stream parser acc
+ in
+ case c of
+ NONE => (rev tkl, Stream.extractFilesInfo stream)
+ | SOME c =>
+ if Char.isSpace c then
+ tokenize stream tkl
+ else if isIdStart c then
+ let
+ val (tk, stream) = parseGeneric stream idParser ()
+ val (tkl, stream) = postprocessCppDir tk tkl stream
+ in
+ tokenize stream tkl
+ end
+ else if Char.isDigit c then
+ @-> numParser ()
+ else if c = #"'" then
+ @-> charParser SeqInit
+ else if c = #"\"" then
+ @-> strParser SeqInit
+ else if isStartForFsm c then
+ processSymbol stream tkl
+ else if c = #"\\" then
+ tokenize (handleBackslash stream) tkl
+ else
+ unexpectedCharRaise stream c
+ end
+
+end