From 9d724f17e813fa344d485329d33b5f5ecf8197a3 Mon Sep 17 00:00:00 2001 From: Vladimir Azarov Date: Fri, 4 Apr 2025 20:53:56 +0200 Subject: Functorization --- .gitignore | 4 +- ccross.mlb | 9 +- ccross.sig | 4 + ccross.sml | 14 +- cpp.fun | 97 ++++++ cpp.sig | 6 +- cpp.sml | 95 ------ exn_handler.fun | 42 +++ exn_handler.sig | 3 + exn_handler.sml | 46 --- tokenizer.fun | 1019 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ tokenizer.sig | 10 +- tokenizer.sml | 1018 ------------------------------------------------------ 13 files changed, 1196 insertions(+), 1171 deletions(-) create mode 100644 ccross.sig create mode 100644 cpp.fun delete mode 100644 cpp.sml create mode 100644 exn_handler.fun create mode 100644 exn_handler.sig delete mode 100644 exn_handler.sml create mode 100644 tokenizer.fun delete mode 100644 tokenizer.sml diff --git a/.gitignore b/.gitignore index 28bb688..c495a68 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ -test* +test ccross -todo* +doc/todo.txt mlmon.out* *dot diff --git a/ccross.mlb b/ccross.mlb index 76eeb47..6e36d72 100644 --- a/ccross.mlb +++ b/ccross.mlb @@ -13,11 +13,14 @@ in hashtable.sml tokenizer.sig - tokenizer.sml + tokenizer.fun cpp.sig - cpp.sml + cpp.fun - exn_handler.sml + exn_handler.sig + exn_handler.fun + + ccross.sig ccross.sml end diff --git a/ccross.sig b/ccross.sig new file mode 100644 index 0000000..3bfaf9e --- /dev/null +++ b/ccross.sig @@ -0,0 +1,4 @@ +signature CCROSS = sig + structure P: CPP + structure ExnHandler: EXN_HANDLER +end diff --git a/ccross.sml b/ccross.sml index 58f9c32..749687b 100644 --- a/ccross.sml +++ b/ccross.sml @@ -1,4 +1,16 @@ -fun main [fname] = Cpp.debugPrint fname +structure ccross:> CCROSS = struct + structure T:> TOKENIZER = + Tokenizer(structure H = Hashtable; structure S = Stream) + + structure P:> CPP = Cpp(T) + + structure ExnHandler:> EXN_HANDLER = + ExnHandler(structure T = T; structure P = P) +end + +val () = MLton.Exn.setTopLevelHandler ccross.ExnHandler.handler + +fun main [fname] = ccross.P.debugPrint fname | main _ = printLn "Expected a single argument: file name" val () = main $ CommandLine.arguments () diff --git a/cpp.fun b/cpp.fun new file mode 100644 index 0000000..424d3ca --- /dev/null +++ b/cpp.fun @@ -0,0 +1,97 @@ +functor Cpp(T: TOKENIZER): CPP = struct + + structure T = T + + type tkPos = T.S.pos + type t = + { streams: T.S.t list, fileInfo: T.S.fileInfo list, + lastPos: tkPos option, firstId: T.S.fileId }; + + datatype tkExp = + Tk of T.token | + Id | + NumConst | + StrLiteral | + UnOp | + BinOp | + Op + + type tkExpectedValue = string * tkExp list + + exception StreamTooOld + exception TkExpected of tkExpectedValue + + fun create fname = + let + val stream = T.S.create fname + val info = T.S.convert stream + in + { streams = [stream] , fileInfo = [info], lastPos = NONE, firstId = #1 info } + end + + fun getToken + ({ streams = stream :: tail, fileInfo, lastPos, firstId }: t) = + let + val (tk, stream) = T.getToken stream + in + case tk of + NONE => getToken { streams = tail, fileInfo, lastPos, firstId } + | SOME (pos, tk) => + ( tk, { streams = stream :: tail, fileInfo, + lastPos = SOME pos, firstId }) + end + | getToken + { streams = [], fileInfo, lastPos = SOME lastPos, firstId } = + let + val pos = SOME (#1 lastPos, ~1) (* EOF *) + in + (T.EOS, {streams = [], fileInfo, lastPos = pos, firstId }) + end + | getToken { streams = [], fileInfo, lastPos = NONE, firstId } = + (T.EOS, { streams = [], fileInfo, + lastPos = SOME (firstId, ~1), firstId }) + + fun getLastPos ({ lastPos = NONE, ... }: t) = raise Unreachable + | getLastPos { lastPos = SOME p, ... } = p + + val tkExp2str = fn + (Tk tk) => T.token2str tk + | Id: tkExp => "identifier" + | NumConst => "numeric constant" + | StrLiteral => "string literal" + | UnOp => "unary operator" + | BinOp => "binary operator" + | Op => "operator" + + fun prepAndRaise (stream: t) (id, pos) expList = + let + val fileInfo = + case List.find (fn (id', _, _) => id' = id) $ #fileInfo stream of + NONE => raise StreamTooOld + | SOME fileInfo => fileInfo + val pos = T.S.ppos2str $ T.S.pos2pposWithFI (id, pos) fileInfo + in + raise TkExpected (pos, expList) + end + + fun tkExpectedPrint (pos, expList) = + let + fun tkExps2str [e] [] = tkExp2str e + | tkExps2str [e] acc = + (String.concatWith ", " acc ^ " or ") ^ tkExp2str e + | tkExps2str (e :: ec) acc = + tkExps2str ec (tkExp2str e :: acc) + | tkExps2str [] _ = raise Unreachable + in + print pos; + print ":expected "; + printLn $ tkExps2str expList [] + end + + fun debugPrint fname = + let + val stream = create fname + in + () + end +end diff --git a/cpp.sig b/cpp.sig index 378a5d4..beed8c2 100644 --- a/cpp.sig +++ b/cpp.sig @@ -1,4 +1,6 @@ signature CPP = sig + structure T: TOKENIZER + type t type tkPos type tkExpectedValue @@ -6,7 +8,7 @@ signature CPP = sig exception TkExpected of tkExpectedValue datatype tkExp = - Tk of Tokenizer.token | + Tk of T.token | Id | NumConst | StrLiteral | @@ -15,7 +17,7 @@ signature CPP = sig Op val create: string -> t - val getToken: t -> Tokenizer.token * t + val getToken: t -> T.token * t val getLastPos: t -> tkPos val prepAndRaise: t -> tkPos -> tkExp list -> 'a diff --git a/cpp.sml b/cpp.sml deleted file mode 100644 index 91f7f82..0000000 --- a/cpp.sml +++ /dev/null @@ -1,95 +0,0 @@ -structure Cpp:> CPP = struct - type tkPos = Stream.pos - - type t = - { streams: Stream.t list, fileInfo: Stream.fileInfo list, - lastPos: tkPos option, firstId: Stream.fileId }; - - datatype tkExp = - Tk of Tokenizer.token | - Id | - NumConst | - StrLiteral | - UnOp | - BinOp | - Op - - type tkExpectedValue = string * tkExp list - - exception StreamTooOld - exception TkExpected of tkExpectedValue - - fun create fname = - let - val stream = Stream.create fname - val info = Stream.convert stream - in - { streams = [stream] , fileInfo = [info], lastPos = NONE, firstId = #1 info } - end - - fun getToken - ({ streams = stream :: tail, fileInfo, lastPos, firstId }: t) = - let - val (tk, stream) = Tokenizer.getToken stream - in - case tk of - NONE => getToken { streams = tail, fileInfo, lastPos, firstId } - | SOME (pos, tk) => - ( tk, { streams = stream :: tail, fileInfo, - lastPos = SOME pos, firstId }) - end - | getToken - { streams = [], fileInfo, lastPos = SOME lastPos, firstId } = - let - val pos = SOME (#1 lastPos, ~1) (* EOF *) - in - (Tokenizer.EOS, {streams = [], fileInfo, lastPos = pos, firstId }) - end - | getToken { streams = [], fileInfo, lastPos = NONE, firstId } = - (Tokenizer.EOS, { streams = [], fileInfo, - lastPos = SOME (firstId, ~1), firstId }) - - fun getLastPos ({ lastPos = NONE, ... }: t) = raise Unreachable - | getLastPos { lastPos = SOME p, ... } = p - - val tkExp2str = fn - (Tk tk) => Tokenizer.token2str tk - | Id: tkExp => "identifier" - | NumConst => "numeric constant" - | StrLiteral => "string literal" - | UnOp => "unary operator" - | BinOp => "binary operator" - | Op => "operator" - - fun prepAndRaise (stream: t) (id, pos) expList = - let - val fileInfo = - case List.find (fn (id', _, _) => id' = id) $ #fileInfo stream of - NONE => raise StreamTooOld - | SOME fileInfo => fileInfo - val pos = Stream.ppos2str $ Stream.pos2pposWithFI (id, pos) fileInfo - in - raise TkExpected (pos, expList) - end - - fun tkExpectedPrint (pos, expList) = - let - fun tkExps2str [e] [] = tkExp2str e - | tkExps2str [e] acc = - (String.concatWith ", " acc ^ " or ") ^ tkExp2str e - | tkExps2str (e :: ec) acc = - tkExps2str ec (tkExp2str e :: acc) - | tkExps2str [] _ = raise Unreachable - in - print pos; - print ":expected "; - printLn $ tkExps2str expList [] - end - - fun debugPrint fname = - let - val stream = create fname - in - () - end -end diff --git a/exn_handler.fun b/exn_handler.fun new file mode 100644 index 0000000..6e069d6 --- /dev/null +++ b/exn_handler.fun @@ -0,0 +1,42 @@ +functor ExnHandler(structure T: TOKENIZER; structure P: CPP): + EXN_HANDLER = +struct + + fun eprint s = printLn $ "error: " ^ s + + fun otherExn e = + let + val hist = MLton.Exn.history e + in + eprint $ "exception " ^ exnMessage e ^ " was raised"; + if hist = [] then + (printLn "No stack trace is avaliable"; + printLn "Recompile with -const \"Exn.keepHistory true\"") + else + List.app (fn x => printLn $ "\t" ^ x) hist + end + + fun exit code = Posix.Process.exit $ Word8.fromInt code + + fun ioExn (IO.Io { name, function = _, cause }) = + let + val prefix = name ^ ": " + val reason = + case cause of + OS.SysErr (str, _) => str + | _ => exnMessage cause + in + printLn $ prefix ^ reason + end + | ioExn _ = (printLn "ioExn: unreachable"; exit 254) + + fun handler e = + (case e of + T.FsmTableIsTooSmall => + eprint "fsm table is too small. Increate 'maxState' value" + | IO.Io _ => ioExn e + | T.TkErrorAug (pos, msg) => eprint $ T.S.ppos2str pos ^ ": " ^ msg + | P.TkExpected v => P.tkExpectedPrint v + | _ => otherExn e; + exit 255) +end diff --git a/exn_handler.sig b/exn_handler.sig new file mode 100644 index 0000000..ac5c574 --- /dev/null +++ b/exn_handler.sig @@ -0,0 +1,3 @@ +signature EXN_HANDLER = sig + val handler: exn -> unit +end diff --git a/exn_handler.sml b/exn_handler.sml deleted file mode 100644 index 50fb8dc..0000000 --- a/exn_handler.sml +++ /dev/null @@ -1,46 +0,0 @@ -structure GlobalExnHandler: sig val handler: exn -> unit end = struct - - fun eprint s = printLn $ "error: " ^ s - - fun otherExn e = - let - val hist = MLton.Exn.history e - in - eprint $ "exception " ^ exnMessage e ^ " was raised"; - if hist = [] then - (printLn "No stack trace is avaliable"; - printLn "Recompile with -const \"Exn.keepHistory true\"") - else - List.app (fn x => printLn $ "\t" ^ x) hist - end - - fun exit code = Posix.Process.exit $ Word8.fromInt code - - fun ioExn (IO.Io { name, function = _, cause }) = - let - val prefix = name ^ ": " - val reason = - case cause of - OS.SysErr (str, _) => str - | _ => exnMessage cause - in - printLn $ prefix ^ reason - end - | ioExn _ = (printLn "ioExn: unreachable"; exit 254) - - fun handler e = - let - open Tokenizer Cpp - in - (case e of - FsmTableIsTooSmall => - eprint "fsm table is too small. Increate 'maxState' value" - | IO.Io _ => ioExn e - | TkErrorAug (pos, msg) => eprint $ Stream.ppos2str pos ^ ": " ^ msg - | TkExpected v => tkExpectedPrint v - | _ => otherExn e; - exit 255) - end -end - -val () = MLton.Exn.setTopLevelHandler GlobalExnHandler.handler diff --git a/tokenizer.fun b/tokenizer.fun new file mode 100644 index 0000000..5cca203 --- /dev/null +++ b/tokenizer.fun @@ -0,0 +1,1019 @@ +functor Tokenizer(structure H: HASHTABLE; structure S: STREAM) + : TOKENIZER = +struct + + structure S = S + + datatype intType = ItDec | ItOct | ItHex + datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL + datatype floatSfx = FsNone | FsF | FsL + + datatype numConst = + IntConst of intType * string * intSfx | + FloatConst of string * floatSfx + + datatype token = + Invalid | + EOS | + NewLine | + + Num of numConst | + + Id of string | + CharConst of string * int | + StringConst of string | + + kwBreak | + kwCase | + kwChar | + kwConst | + kwContinue | + kwDefault | + kwDouble | + kwElse | + kwEnum | + kwExtern | + kwFloat | + kwFor | + kwGoto | + kwInt | + kwLong | + kwRegister | + kwReturn | + kwShort | + kwSigned | + kwSizeof | + kwStruct | + kwSwitch | + kwTypedef | + kwUnion | + kwUnsigned | + kwVoid | + kwVolatile | + + LParen | + RParen | + LBracket | + RBracket | + LBrace | + RBrace | + + QuestionMark | + Colon | + Coma | + Semicolon | + + Arrow | + Plus | + DoublePlus| + Minus | + DoubleMinus | + Ampersand | + Asterisk | + Slash | + Tilde | + ExclMark | + Percent | + DoubleGreater | + DoubleLess | + Greater | + Less | + EqualSign | + LessEqualSign | + GreaterEqualSign | + DoubleEqualSign | + ExclMarkEqualSign | + Cap | + VerticalBar | + DoubleAmpersand | + DoubleVerticalBar | + + AsteriskEqualSign | + SlashEqualSign | + PercentEqualSign | + PlusEqualSign | + MinusEqualSign | + DoubleLessEqualSign | + DoubleGreaterEqualSign | + AmpersandEqualSign | + CapEqualSign | + VerticalBarEqualSign | + + Hash | + DoubleHash | + + Dot | + DoubleDot | + TripleDot | + + CommentStart | + + CppInclude | + CppDefine | + CppUndef | + CppIf | + CppIfdef | + CppIfndef | + CppElse | + CppElif | + CppEndif | + CppWarning | + CppError | + CppPragma + + val kwPrefix = #"@" + val cppPrefix = #"$" + + type fullToken = S.pos * token + + datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart + + exception TkError of tkErrorAuxInfo * string + exception TkErrorAug of S.ppos * string + + exception ExpectedCppDir (* handled in postprocess *) + + exception FsmTableIsTooSmall + + (* Unreachable (should be) *) + exception TokenWithoutRepr + exception SuffixWithoutRepr + + val tokenRepr = + let + fun & repr = str kwPrefix ^ repr + fun % repr = str cppPrefix ^ repr + in + [ + (NewLine, "NewLine"), + + (kwBreak, &"break"), + (kwCase, &"case"), + (kwChar, &"char"), + (kwConst, &"const"), + (kwContinue, &"continue"), + (kwDefault, &"default"), + (kwDouble, &"double"), + (kwElse, &"else"), + (kwEnum, &"enum"), + (kwExtern, &"extern"), + (kwFloat, &"float"), + (kwFor, &"for"), + (kwGoto, &"goto"), + (kwInt, &"int"), + (kwLong, &"long"), + (kwRegister, &"register"), + (kwReturn, &"return"), + (kwShort, &"short"), + (kwSigned, &"signed"), + (kwSizeof, &"sizeof"), + (kwStruct, &"struct"), + (kwSwitch, &"switch"), + (kwTypedef, &"typedef"), + (kwUnion, &"union"), + (kwUnsigned, &"unsigned"), + (kwVoid, &"void"), + (kwVolatile, &"volatile"), + + (LParen, "("), + (RParen, ")"), + (LBracket, "["), + (RBracket, "]"), + (LBrace, "{"), + (RBrace, "}"), + + (QuestionMark, "?"), + (Colon, ":"), + (Coma, ","), + (Semicolon, ";"), + + (Arrow, "->"), + (Plus, "+"), + (DoublePlus, "++"), + (Minus, "-"), + (DoubleMinus, "--"), + (Ampersand, "&"), + (Asterisk, "*"), + (Slash, "/"), + (Tilde, "~"), + (ExclMark, "!"), + (Percent, "%"), + (DoubleLess, "<<"), + (DoubleGreater, ">>"), + (Less, "<"), + (Greater, ">"), + (EqualSign, "="), + (LessEqualSign, "<="), + (GreaterEqualSign, ">="), + (DoubleEqualSign, "=="), + (ExclMarkEqualSign, "!="), + (Cap, "^"), + (VerticalBar, "|"), + (DoubleAmpersand, "&&"), + (DoubleVerticalBar, "||"), + + (AsteriskEqualSign, "*="), + (SlashEqualSign, "/="), + (PercentEqualSign, "%="), + (PlusEqualSign, "+="), + (MinusEqualSign, "-="), + (DoubleLessEqualSign, "<<="), + (DoubleGreaterEqualSign, ">>="), + (AmpersandEqualSign, "&="), + (CapEqualSign, "^="), + (VerticalBarEqualSign, "|="), + + (Hash, "#"), + (DoubleHash, "##"), + + (Dot, "."), + (DoubleDot, ".."), + (TripleDot, "..."), + + (CommentStart, "/*"), + + (CppInclude, %"include"), + (CppDefine, %"define"), + (CppUndef, %"undef"), + (CppIf, %"if"), + (CppIfdef, %"ifdef"), + (CppIfndef, %"ifndef"), + (CppElse, %"else"), + (CppElif, %"elif"), + (CppEndif, %"endif"), + (CppWarning, %"warning"), + (CppError, %"error"), + (CppPragma, %"pragma") + ] + end + + val intSuffixRepr = [ + (IsNone, ""), + (IsU, "u"), + (IsL, "l"), + (IsUL, "ul"), + (IsLL, "ll"), + (IsULL, "ull") + ] + + val floatSuffixRepr = [ + (FsNone, ""), + (FsF, "f"), + (FsL, "l") + ] + + fun getSfxRepr sfx buf onError = + case List.find (fn (sfx', _) => sfx' = sfx) buf of + NONE => onError () + | SOME (_, repr) => repr + + fun getSfxReprSimple sfx buf = + getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr) + + val token2str = fn + Id s => "id:" ^ s + | Num (IntConst (it, str, sfx)) => + let + val intType = + case it of + ItDec => "" + | ItOct => "0" + | ItHex => "0x" + in + intType ^ str ^ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`" + end + | Num (FloatConst (str, sfx)) => + str ^ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`" + | CharConst (repr, _) => repr + | StringConst s => + "\"" ^ s ^ "\"" + | v => + case List.find (fn (x, _) => x = v) tokenRepr of + SOME (_, repr) => repr + | NONE => raise TokenWithoutRepr + + fun printToken tk = print $ token2str tk + + fun isIdStart c = Char.isAlpha c orelse c = #"_" + fun isIdBody c = Char.isAlphaNum c orelse c = #"_" + + fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8" + val isDigit = Char.isDigit + val isHexDigit = Char.isHexDigit + + fun isPrintable c = Char.isPrint c andalso c <> #" " + + (* FSM for parsing symbols *) + + val maxStates = 51 + + fun fsmInsert (nextState, buf) curState tk (c :: cs) = + let + open Array + + val (_, row) = sub (buf, curState) + val potNextState = sub (row, ord c) + in + if potNextState <> ~1 then + fsmInsert (nextState, buf) potNextState tk cs + else ( + update (row, ord c, !nextState); + nextState := !nextState + 1; + fsmInsert (nextState, buf) (!nextState - 1) tk cs + ) + end + | fsmInsert (_, buf) curState tk [] = + let + open Array + val (_, row) = sub (buf, curState) + in + update (buf, curState, (tk, row)) + end + + fun isStartForFsmGen () = + let + open Array + + val lookupTable = array (128, false) + + fun firstChr s = ord $ String.sub (s, 0) + val () = List.app + (fn (_, repr) => update (lookupTable, firstChr repr, true)) + $ List.filter + (fn (tk, repr) => + let + val c = String.sub (repr, 0) + in + c <> kwPrefix andalso c <> cppPrefix + andalso tk <> NewLine + end) + tokenRepr + + in + fn c => sub (lookupTable, ord c) + end + + val isStartForFsm = isStartForFsmGen () + + fun fsmTableCreate () = + let + open Array + + val T as (nextState, buf) = + (ref 1, array (maxStates, (Invalid, array (128, ~1)))) + val r = ref 1 + + fun filterNeeded [] acc = acc + | filterNeeded ((T as (_, repr)) :: tks) acc = + filterNeeded tks + (if isStartForFsm $ String.sub (repr, 0) then + T :: acc + else + acc) + + val tokenRepr = filterNeeded tokenRepr [] + + fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle + Subscript => raise FsmTableIsTooSmall + in + while !r < length buf do ( + update (buf, !r, (Invalid, array(128, ~1))); + r := !r + 1 + ); + + List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; + if !nextState <> maxStates then + printLn $ "note: Fsm table size can be smaller: " + ^ Int.toString (!nextState) ^ " is enough" + else (); + T + end + + (* Unused right now + fun printTable (nextState, buf) = + let + fun printRow i row = + if i = length row then + print "\n" + else + let + val state = sub (row, i) + in + if state = ~1 then + () + else + print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", "); + printRow (i + 1) row + end + + fun print' rowNum buf = + if rowNum = !nextState then + () + else + let + val (tk, row) = sub (buf, rowNum) + in + print ((token2string tk) ^ ": "); + printRow 0 row; + print' (rowNum + 1) buf + end + in + print ("NextState: " ^ Int.toString (!nextState) ^ "\n"); + print' 0 buf; + print "\n" + end + *) + + val fsmTable = lazy fsmTableCreate + + fun fsmEat stream = + let + open Array + val pos = S.getPos stream + + fun get curState stream = + let + val (c, stream) = S.getchar stream + in + case c of + NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream) + | SOME c => + let + val (tk, row) = sub (#2 $ fsmTable (), curState) + val nextState = sub (row, ord c) + in + if nextState = ~1 then + (tk, S.ungetc stream) + else + get nextState stream + end + end + in + (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream + end + + fun tkError2aug stream (dx, msg) = + let + val (id, pos) = S.getPosAfterCharRead stream + val pos = S.pos2ppos (id, pos + dx) stream + in + TkErrorAug (pos, msg) + end + + fun parserWrapper stream parser acc = + let + val stream = S.ungetc stream + val P as (_, startOff) = S.getPos stream + + fun parse' stream acc = let + val (c, stream) = S.getchar stream + + val (acc, tk, stream) = parser acc (stream, startOff) c handle + TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg) + | TkError (TkiStart, msg) => + let + val startPos = S.pos2ppos P stream + in + raise TkErrorAug (startPos, msg) + end + | TkError (TkiEOF, msg) => + let + val pos = S.pposWithoutCol $ S.pos2ppos P stream + in + raise TkErrorAug (pos, msg) + end + in + case tk of + NONE => parse' stream acc + | _ => (valOf tk, stream) + end + + val (tk, stream) = parse' stream acc + in + ((P, tk): fullToken, stream) + end + + fun finishSeqRead startOff stream = + let + val (_, endOff) = S.getPos stream + val s = S.getSubstr startOff endOff stream + in + s + end + + fun keywordHashtableGen () = + let + val table = H.create 128 + val () = + List.app + (fn (tk, repr) => + if String.sub (repr, 0) = kwPrefix then + H.insert table (String.extract (repr, 1, NONE)) tk + else + ()) + tokenRepr + in + table + end + + val keywordHashtable = lazy keywordHashtableGen + + fun findKeyword str = + case H.lookup (keywordHashtable ()) str of + NONE => Id str + | SOME tk => tk + + fun idParser () (stream, startOff) c = + let + fun finalize stream = + let + val s = finishSeqRead startOff stream + val tk = findKeyword s + in + ((), SOME tk, stream) + end + in + case c of + NONE => finalize stream + | SOME c => + if isIdBody c then + ((), NONE, stream) + else + finalize (S.ungetc stream) + end + + datatype intMode = ImDec | ImOct | ImInvalidOct | ImHex + datatype floatMode = FmDot | FmExp + + datatype npState = NpInit | IntMode of intMode | FloatMode of floatMode + + fun getLongestSeq acc pred stream = + let + val (c, stream') = S.getchar stream + in + if isSome c andalso pred $ valOf c then + getLongestSeq (valOf c :: acc) pred stream' + else + (implode $ rev acc, stream) + end + + fun skipDigitSeq off stream = + let + val (res, stream) = getLongestSeq [] isDigit stream + in + if res = "" then + raise TkError (TkiDx off, "expected digit") + else + (String.size res, stream) + end + + fun getSuffixCommon buf off stream = + let + val (sfx, stream) = getLongestSeq [] Char.isAlpha stream + val sfx = String.map (fn c => Char.toLower c) sfx + val sfx = + case List.find (fn (_, repr) => repr = sfx) buf of + NONE => raise TkError (TkiDx off, "unknown suffix") + | SOME (sfx, _) => sfx + in + (sfx, stream) + end + + val getIntSuffix = getSuffixCommon intSuffixRepr 0 + fun getFloatSuffix off = getSuffixCommon floatSuffixRepr off + + (* + * It seems that there is an ambiguity in C: + * gcc/clang/cparser consider 0xe-3 as a fp constant and reject it. + * Right now we do not, but it may be reconsidered. + *) + fun numParser NpInit (stream, _) (SOME c) = + if c = #"0" then + let + val (c, stream') = S.getchar stream + in + case c of + NONE => + (NpInit, SOME $ Num $ IntConst (ItOct, "", IsNone), stream') + | SOME c => + if Char.toLower c = #"x" then + (IntMode ImHex, NONE, stream') + else + (IntMode ImOct, NONE, stream) + end + else + (IntMode ImDec, NONE, stream) + | numParser NpInit _ NONE = raise Unreachable + | numParser (IntMode mode) (stream, startOff) c = + let + val (pred, res, offset) = + case mode of + ImDec => (isDigit, ItDec, 0) + | ImOct => (isOctal, ItOct, 1) + | ImInvalidOct => (isDigit, ItOct, 1) + | ImHex => (isHexDigit, ItHex, 2) + + fun checkAndRaise m msg = + if mode = m then raise TkError (TkiStart, msg) else () + + fun finish () = + let + val () = checkAndRaise ImInvalidOct "invalid octal constant" + val stream = S.ungetc stream + val str = finishSeqRead (startOff + offset) stream + val (sfx, stream) = getIntSuffix stream + in + (IntMode mode, SOME $ Num $ IntConst (res, str, sfx), stream) + end + in + case c of + NONE => finish () + | SOME c => + if pred c then + (IntMode mode, NONE, stream) + else if c = #"." then + (FloatMode FmDot, NONE, stream) + else if Char.toLower c = #"e" then + (checkAndRaise ImHex + "floating constant can not come with 0x prefix"; + (FloatMode FmExp, NONE, stream)) + else + finish () + end + | numParser (FloatMode FmDot) (stream, startOff) _ = + let + val (len, stream) = skipDigitSeq 0 $ S.ungetc stream + + val (c, stream') = S.getchar stream + + fun finish () = + let + val str = finishSeqRead startOff stream + val (sfx, stream) = getFloatSuffix len stream + in + (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) + end + in + case c of + NONE => finish () + | SOME c => + if Char.toLower c = #"e" then + (FloatMode FmExp, NONE, stream') + else + finish () + end + | numParser (FloatMode FmExp) (stream, startOff) c = + let + val (off, stream) = + if c = NONE then + raise TkError (TkiDx 0, "expected digit") + else if valOf c <> #"+" andalso valOf c <> #"-" then + (0, S.ungetc stream) + else + (~1, stream) + + val (len, stream) = skipDigitSeq off stream + val str = finishSeqRead startOff stream + val (sfx, stream) = getFloatSuffix len stream + in + (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) + end + + fun chrIntVal c = + if isDigit c then + ord c - ord #"0" + else if ord c >= ord #"a" andalso ord c <= ord #"z" then + ord c - ord #"a" + 10 + else if ord c >= ord #"A" andalso ord c <= ord #"Z" then + ord c - ord #"A" + 10 + else + raise Unreachable + + fun parseOctalSeq stream c = + let + fun follow stream acc count = + if count = 3 then + (SOME $ chr acc, stream) + else + let + val (c, stream) = S.getchar stream + in + case c of + NONE => (SOME $ chr acc, stream) + | SOME c => + if isOctal c then + follow stream (acc * 8 + chrIntVal c) (count + 1) + else + (SOME $ chr acc, S.ungetc stream) + end + in + follow stream (chrIntVal c) 1 + end + + fun parseHexSeq stream = + let + fun follow stream acc count = + let + val (c, stream) = S.getchar stream + + val noHex = TkError (TkiDx 0, "\\x without hex digits") + in + case c of + NONE => + if count = 0 then + raise noHex + else + (SOME $ chr acc, stream) + | SOME c => + if isHexDigit c then + if count = 2 then + raise TkError (TkiDx 2, "hex sequence out of range") + else + follow stream (acc * 16 + chrIntVal c) (count + 1) + else + if count = 0 then + raise noHex + else + (SOME $ chr acc, S.ungetc stream) + end + in + follow stream 0 0 + end + + fun eatEscSeq stream = + let + fun raiseErr0 msg = raise TkError (TkiDx 0, msg) + + val (c, stream) = S.getchar stream + val c = + case c of + NONE => raiseErr0 "unfinished escape sequence" + | SOME c => c + + fun & c = (SOME c, stream) + in + case c of + #"'" => & #"'" + | #"\"" => & #"\"" + | #"?" => & #"?" + | #"\\" => & #"\\" + | #"a" => & #"\a" + | #"b" => & #"\b" + | #"f" => & #"\f" + | #"n" => & #"\n" + | #"r" => & #"\r" + | #"t" => & #"\t" + | #"v" => & #"\v" + | #"\n" => (NONE, stream) + | #"x" => parseHexSeq stream + | c => + if isOctal c then + parseOctalSeq stream c + else + raiseErr0 "unknown escape sequence" + end + + datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm + + datatype seqParseMode = SpmChr | SpmStr + + fun seqBound SpmChr = #"'" + | seqBound SpmStr = #"\"" + + fun seqExnConv mode (TkError (v, msg)) = + let + val bound = if mode = SpmChr then "'" else "\"" + val msg = + String.translate (fn c => if c = #"%" then bound else str c) msg + in + TkError (v, msg) + end + | seqExnConv _ _ = raise Unreachable + + fun unfinishedSeq SpmChr = "unfinished character constant" + | unfinishedSeq SpmStr = "unfinished string literal" + + fun seqParser mode SeqInit (stream, _) (SOME c) = + if seqBound mode = c then + (SeqStart, NONE, stream) + else + raise Unreachable + | seqParser mode SeqStart (stream, _) (SOME c) = + if c <> seqBound mode then + let + val (c, stream) = + if c <> #"\\" then (SOME c, stream) else eatEscSeq stream + in + if c = NONE then + (SeqStart, NONE, stream) + else + (SeqValue (ord $ Option.valOf c), NONE, stream) + end + else if mode = SpmStr then + (SeqTerm, SOME $ StringConst "", stream) + else + raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %") + | seqParser mode (SeqValue v) (stream, startOff) (SOME c) = + if seqBound mode = c then + let + fun term s v = + if mode = SpmChr then + CharConst (s, v) + else + StringConst $ String.extract (s, 1, SOME $ String.size s - 2) + in + (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream) + end + else if mode = SpmStr then + let + val (_, stream) = + if c <> #"\\" then (SOME c, stream) else eatEscSeq stream + in + (SeqValue v, NONE, stream) + end + else + raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value") + | seqParser _ SeqTerm _ (SOME _) = + raise Unreachable + | seqParser mode state (_, _) NONE = + raise case state of + SeqInit => Unreachable + | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) + | SeqValue _ => + seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) + | SeqTerm => Unreachable + + val charParser = seqParser SpmChr + val strParser = seqParser SpmStr + + fun formCppDir (Id s) = + let + open String + in + case List.find + (fn (_, repr) => + sub (repr, 0) = cppPrefix andalso + extract (repr, 1, NONE) = s) + tokenRepr + of + SOME (tk, _) => tk + | NONE => raise ExpectedCppDir + end + | formCppDir kwElse = CppElse + | formCppDir _ = raise ExpectedCppDir + + fun handleCppDir tk prevPos stream = + let + val (pos, tk') = tk + in + (prevPos, formCppDir tk') handle + ExpectedCppDir => + raise TkErrorAug (S.pos2ppos pos stream, + "expected preprocessor directive") + end + + fun unexpectedCharRaise stream c = + let + val pos = S.pos2ppos (S.getPosAfterCharRead stream) stream + val repr = + if isPrintable c then + str c + else + "<" ^ Int.toString (ord c) ^ ">" + in + raise TkErrorAug (pos, "unexpected character " ^ repr) + end + + fun skipComment stream pos = + let + fun skip prevIsAsterisk stream = + let + val (c, stream) = + case S.getchar stream of + (NONE, _) => + let + val pos = S.pos2ppos pos stream + in + raise TkErrorAug (pos, "unfinished comment") + end + | (SOME c, stream) => (c, stream) + in + if prevIsAsterisk andalso c = #"/" then + stream + else + skip (c = #"*") stream + end + in + skip false stream + end + + fun handleBackslash stream = + let + val (c, stream) = S.getchar stream + + val raiseErr = fn () => + let + val pos = S.getPosAfterCharRead stream + val pos = S.pos2ppos pos stream + in + raise TkErrorAug (pos, "expected \\n after backslash") + end + in + case c of + SOME c => + if c = #"\n" then + stream + else + raiseErr () + | NONE => raiseErr () + end + + fun processSymbol stream = + let + val (T as (p, tk), stream) = fsmEat $ S.ungetc stream + in + case tk of + CommentStart => getToken $ skipComment stream p + | DoubleDot => (SOME (p, Dot), S.ungetc stream) + | Hash => + if S.isFirstOnLine p stream then + let + val (tk, stream) = getToken stream + in + case tk of + NONE => + raise TkErrorAug (S.pos2ppos p stream, + "unfinished preprecessor directive") + | SOME tk => + (SOME $ handleCppDir tk p stream, stream) + end + else + (SOME T, stream) + | _ => (SOME T, stream) + end + + and getToken stream = + let + val (c, stream) = S.getchar stream + + fun @-> parser acc = + (fn (tk, s) => (SOME tk, s)) $ parserWrapper stream parser acc + in + case c of + NONE => (NONE, stream) + | SOME c => + if c = #"\n" then + (SOME (S.getPosAfterCharRead stream, NewLine), stream) + else if Char.isSpace c then + getToken stream + else if isIdStart c then + @-> idParser () + else if isDigit c then + @-> numParser NpInit + else if c = #"'" then + @-> charParser SeqInit + else if c = #"\"" then + @-> strParser SeqInit + else if isStartForFsm c then + processSymbol stream + else if c = #"\\" then + getToken $ handleBackslash stream + else + unexpectedCharRaise stream c + end + + fun tokenize stream = + let + fun aux acc stream = + let + val (tk, stream) = getToken stream + in + case tk of + NONE => rev acc + | SOME tk => aux (tk :: acc) stream + end + in + aux [] stream + end + + fun debugPrint tkl fname = + let + fun print' line _ ((_, NewLine) :: tks) = + print' (line + 1) true tks + | print' line firstOnLine ((_, tk) :: tks) = ( + if firstOnLine then ( + print "\n"; + printLn $ fname ^ ":" ^ Int.toString line; + print "\t") + else + (); + printToken tk; + print " "; + print' line false tks + ) + | print' _ _ [] = () + in + print' 1 true tkl; + print "\n" + end +end diff --git a/tokenizer.sig b/tokenizer.sig index 245cfbe..c9626e0 100644 --- a/tokenizer.sig +++ b/tokenizer.sig @@ -1,4 +1,6 @@ signature TOKENIZER = sig + structure S: STREAM + datatype intType = ItDec | ItOct | ItHex datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL datatype floatSfx = FsNone | FsF | FsL @@ -116,15 +118,15 @@ signature TOKENIZER = sig CppError | CppPragma - type fullToken = Stream.pos * token + type fullToken = S.pos * token (* Fatal. both may be thrown by tokenize *) exception FsmTableIsTooSmall - exception TkErrorAug of Stream.ppos * string + exception TkErrorAug of S.ppos * string - val getToken: Stream.t -> fullToken option * Stream.t + val getToken: S.t -> fullToken option * S.t - val tokenize: Stream.t -> fullToken list + val tokenize: S.t -> fullToken list val token2str: token -> string val printToken: token -> unit diff --git a/tokenizer.sml b/tokenizer.sml deleted file mode 100644 index 10c2e77..0000000 --- a/tokenizer.sml +++ /dev/null @@ -1,1018 +0,0 @@ -structure Tokenizer:> TOKENIZER = struct - - datatype intType = ItDec | ItOct | ItHex - datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL - datatype floatSfx = FsNone | FsF | FsL - - datatype numConst = - IntConst of intType * string * intSfx | - FloatConst of string * floatSfx - - datatype token = - Invalid | - EOS | - NewLine | - - Num of numConst | - - Id of string | - CharConst of string * int | - StringConst of string | - - kwBreak | - kwCase | - kwChar | - kwConst | - kwContinue | - kwDefault | - kwDouble | - kwElse | - kwEnum | - kwExtern | - kwFloat | - kwFor | - kwGoto | - kwInt | - kwLong | - kwRegister | - kwReturn | - kwShort | - kwSigned | - kwSizeof | - kwStruct | - kwSwitch | - kwTypedef | - kwUnion | - kwUnsigned | - kwVoid | - kwVolatile | - - LParen | - RParen | - LBracket | - RBracket | - LBrace | - RBrace | - - QuestionMark | - Colon | - Coma | - Semicolon | - - Arrow | - Plus | - DoublePlus| - Minus | - DoubleMinus | - Ampersand | - Asterisk | - Slash | - Tilde | - ExclMark | - Percent | - DoubleGreater | - DoubleLess | - Greater | - Less | - EqualSign | - LessEqualSign | - GreaterEqualSign | - DoubleEqualSign | - ExclMarkEqualSign | - Cap | - VerticalBar | - DoubleAmpersand | - DoubleVerticalBar | - - AsteriskEqualSign | - SlashEqualSign | - PercentEqualSign | - PlusEqualSign | - MinusEqualSign | - DoubleLessEqualSign | - DoubleGreaterEqualSign | - AmpersandEqualSign | - CapEqualSign | - VerticalBarEqualSign | - - Hash | - DoubleHash | - - Dot | - DoubleDot | - TripleDot | - - CommentStart | - - CppInclude | - CppDefine | - CppUndef | - CppIf | - CppIfdef | - CppIfndef | - CppElse | - CppElif | - CppEndif | - CppWarning | - CppError | - CppPragma - - val kwPrefix = #"@" - val cppPrefix = #"$" - - type fullToken = Stream.pos * token - - datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart - - exception TkError of tkErrorAuxInfo * string - exception TkErrorAug of Stream.ppos * string - - exception ExpectedCppDir (* handled in postprocess *) - - exception FsmTableIsTooSmall - - (* Unreachable (should be) *) - exception TokenWithoutRepr - exception SuffixWithoutRepr - - val tokenRepr = - let - fun & repr = str kwPrefix ^ repr - fun % repr = str cppPrefix ^ repr - in - [ - (NewLine, "NewLine"), - - (kwBreak, &"break"), - (kwCase, &"case"), - (kwChar, &"char"), - (kwConst, &"const"), - (kwContinue, &"continue"), - (kwDefault, &"default"), - (kwDouble, &"double"), - (kwElse, &"else"), - (kwEnum, &"enum"), - (kwExtern, &"extern"), - (kwFloat, &"float"), - (kwFor, &"for"), - (kwGoto, &"goto"), - (kwInt, &"int"), - (kwLong, &"long"), - (kwRegister, &"register"), - (kwReturn, &"return"), - (kwShort, &"short"), - (kwSigned, &"signed"), - (kwSizeof, &"sizeof"), - (kwStruct, &"struct"), - (kwSwitch, &"switch"), - (kwTypedef, &"typedef"), - (kwUnion, &"union"), - (kwUnsigned, &"unsigned"), - (kwVoid, &"void"), - (kwVolatile, &"volatile"), - - (LParen, "("), - (RParen, ")"), - (LBracket, "["), - (RBracket, "]"), - (LBrace, "{"), - (RBrace, "}"), - - (QuestionMark, "?"), - (Colon, ":"), - (Coma, ","), - (Semicolon, ";"), - - (Arrow, "->"), - (Plus, "+"), - (DoublePlus, "++"), - (Minus, "-"), - (DoubleMinus, "--"), - (Ampersand, "&"), - (Asterisk, "*"), - (Slash, "/"), - (Tilde, "~"), - (ExclMark, "!"), - (Percent, "%"), - (DoubleLess, "<<"), - (DoubleGreater, ">>"), - (Less, "<"), - (Greater, ">"), - (EqualSign, "="), - (LessEqualSign, "<="), - (GreaterEqualSign, ">="), - (DoubleEqualSign, "=="), - (ExclMarkEqualSign, "!="), - (Cap, "^"), - (VerticalBar, "|"), - (DoubleAmpersand, "&&"), - (DoubleVerticalBar, "||"), - - (AsteriskEqualSign, "*="), - (SlashEqualSign, "/="), - (PercentEqualSign, "%="), - (PlusEqualSign, "+="), - (MinusEqualSign, "-="), - (DoubleLessEqualSign, "<<="), - (DoubleGreaterEqualSign, ">>="), - (AmpersandEqualSign, "&="), - (CapEqualSign, "^="), - (VerticalBarEqualSign, "|="), - - (Hash, "#"), - (DoubleHash, "##"), - - (Dot, "."), - (DoubleDot, ".."), - (TripleDot, "..."), - - (CommentStart, "/*"), - - (CppInclude, %"include"), - (CppDefine, %"define"), - (CppUndef, %"undef"), - (CppIf, %"if"), - (CppIfdef, %"ifdef"), - (CppIfndef, %"ifndef"), - (CppElse, %"else"), - (CppElif, %"elif"), - (CppEndif, %"endif"), - (CppWarning, %"warning"), - (CppError, %"error"), - (CppPragma, %"pragma") - ] - end - - val intSuffixRepr = [ - (IsNone, ""), - (IsU, "u"), - (IsL, "l"), - (IsUL, "ul"), - (IsLL, "ll"), - (IsULL, "ull") - ] - - val floatSuffixRepr = [ - (FsNone, ""), - (FsF, "f"), - (FsL, "l") - ] - - fun getSfxRepr sfx buf onError = - case List.find (fn (sfx', _) => sfx' = sfx) buf of - NONE => onError () - | SOME (_, repr) => repr - - fun getSfxReprSimple sfx buf = - getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr) - - val token2str = fn - Id s => "id:" ^ s - | Num (IntConst (it, str, sfx)) => - let - val intType = - case it of - ItDec => "" - | ItOct => "0" - | ItHex => "0x" - in - intType ^ str ^ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`" - end - | Num (FloatConst (str, sfx)) => - str ^ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`" - | CharConst (repr, _) => repr - | StringConst s => - "\"" ^ s ^ "\"" - | v => - case List.find (fn (x, _) => x = v) tokenRepr of - SOME (_, repr) => repr - | NONE => raise TokenWithoutRepr - - fun printToken tk = print $ token2str tk - - fun isIdStart c = Char.isAlpha c orelse c = #"_" - fun isIdBody c = Char.isAlphaNum c orelse c = #"_" - - fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8" - val isDigit = Char.isDigit - val isHexDigit = Char.isHexDigit - - fun isPrintable c = Char.isPrint c andalso c <> #" " - - (* FSM for parsing symbols *) - - val maxStates = 51 - - fun fsmInsert (nextState, buf) curState tk (c :: cs) = - let - open Array - - val (_, row) = sub (buf, curState) - val potNextState = sub (row, ord c) - in - if potNextState <> ~1 then - fsmInsert (nextState, buf) potNextState tk cs - else ( - update (row, ord c, !nextState); - nextState := !nextState + 1; - fsmInsert (nextState, buf) (!nextState - 1) tk cs - ) - end - | fsmInsert (_, buf) curState tk [] = - let - open Array - val (_, row) = sub (buf, curState) - in - update (buf, curState, (tk, row)) - end - - fun isStartForFsmGen () = - let - open Array - - val lookupTable = array (128, false) - - fun firstChr s = ord $ String.sub (s, 0) - val () = List.app - (fn (_, repr) => update (lookupTable, firstChr repr, true)) - $ List.filter - (fn (tk, repr) => - let - val c = String.sub (repr, 0) - in - c <> kwPrefix andalso c <> cppPrefix - andalso tk <> NewLine - end) - tokenRepr - - in - fn c => sub (lookupTable, ord c) - end - - val isStartForFsm = isStartForFsmGen () - - fun fsmTableCreate () = - let - open Array - - val T as (nextState, buf) = - (ref 1, array (maxStates, (Invalid, array (128, ~1)))) - val r = ref 1 - - fun filterNeeded [] acc = acc - | filterNeeded ((T as (_, repr)) :: tks) acc = - filterNeeded tks - (if isStartForFsm $ String.sub (repr, 0) then - T :: acc - else - acc) - - val tokenRepr = filterNeeded tokenRepr [] - - fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle - Subscript => raise FsmTableIsTooSmall - in - while !r < length buf do ( - update (buf, !r, (Invalid, array(128, ~1))); - r := !r + 1 - ); - - List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; - if !nextState <> maxStates then - printLn $ "note: Fsm table size can be smaller: " - ^ Int.toString (!nextState) ^ " is enough" - else (); - T - end - - (* Unused right now - fun printTable (nextState, buf) = - let - fun printRow i row = - if i = length row then - print "\n" - else - let - val state = sub (row, i) - in - if state = ~1 then - () - else - print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", "); - printRow (i + 1) row - end - - fun print' rowNum buf = - if rowNum = !nextState then - () - else - let - val (tk, row) = sub (buf, rowNum) - in - print ((token2string tk) ^ ": "); - printRow 0 row; - print' (rowNum + 1) buf - end - in - print ("NextState: " ^ Int.toString (!nextState) ^ "\n"); - print' 0 buf; - print "\n" - end - *) - - val fsmTable = lazy fsmTableCreate - - fun fsmEat stream = - let - open Array - val pos = Stream.getPos stream - - fun get curState stream = - let - val (c, stream) = Stream.getchar stream - in - case c of - NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream) - | SOME c => - let - val (tk, row) = sub (#2 $ fsmTable (), curState) - val nextState = sub (row, ord c) - in - if nextState = ~1 then - (tk, Stream.ungetc stream) - else - get nextState stream - end - end - in - (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream - end - - fun tkError2aug stream (dx, msg) = - let - val (id, pos) = Stream.getPosAfterCharRead stream - val pos = Stream.pos2ppos (id, pos + dx) stream - in - TkErrorAug (pos, msg) - end - - fun parseGeneric stream parser acc = - let - val stream = Stream.ungetc stream - val P as (_, startOff) = Stream.getPos stream - - fun parse' stream acc = let - val (c, stream) = Stream.getchar stream - - val (acc, tk, stream) = parser acc (stream, startOff) c handle - TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg) - | TkError (TkiStart, msg) => - let - val startPos = Stream.pos2ppos P stream - in - raise TkErrorAug (startPos, msg) - end - | TkError (TkiEOF, msg) => - let - open Stream - val pos = pposWithoutCol $ pos2ppos P stream - in - raise TkErrorAug (pos, msg) - end - in - case tk of - NONE => parse' stream acc - | _ => (valOf tk, stream) - end - - val (tk, stream) = parse' stream acc - in - ((P, tk): fullToken, stream) - end - - fun finishSeqRead startOff stream = - let - val (_, endOff) = Stream.getPos stream - val s = Stream.getSubstr startOff endOff stream - in - s - end - - fun keywordHashtableGen () = - let - open Hashtable - val table = create 128 - val () = - List.app - (fn (tk, repr) => - if String.sub (repr, 0) = kwPrefix then - insert table (String.extract (repr, 1, NONE)) tk - else - ()) - tokenRepr - in - table - end - - val keywordHashtable = lazy keywordHashtableGen - - fun findKeyword str = - case Hashtable.lookup (keywordHashtable ()) str of - NONE => Id str - | SOME tk => tk - - fun idParser () (stream, startOff) c = - let - fun finalize stream = - let - val s = finishSeqRead startOff stream - val tk = findKeyword s - in - ((), SOME tk, stream) - end - in - case c of - NONE => finalize stream - | SOME c => - if isIdBody c then - ((), NONE, stream) - else - finalize (Stream.ungetc stream) - end - - datatype intMode = ImDec | ImOct | ImInvalidOct | ImHex - datatype floatMode = FmDot | FmExp - - datatype npState = NpInit | IntMode of intMode | FloatMode of floatMode - - fun getLongestSeq acc pred stream = - let - val (c, stream') = Stream.getchar stream - in - if isSome c andalso pred $ valOf c then - getLongestSeq (valOf c :: acc) pred stream' - else - (implode $ rev acc, stream) - end - - fun skipDigitSeq off stream = - let - val (res, stream) = getLongestSeq [] isDigit stream - in - if res = "" then - raise TkError (TkiDx off, "expected digit") - else - (String.size res, stream) - end - - fun getSuffixCommon buf off stream = - let - val (sfx, stream) = getLongestSeq [] Char.isAlpha stream - val sfx = String.map (fn c => Char.toLower c) sfx - val sfx = - case List.find (fn (_, repr) => repr = sfx) buf of - NONE => raise TkError (TkiDx off, "unknown suffix") - | SOME (sfx, _) => sfx - in - (sfx, stream) - end - - val getIntSuffix = getSuffixCommon intSuffixRepr 0 - fun getFloatSuffix off = getSuffixCommon floatSuffixRepr off - - (* - * It seems that there is an ambiguity in C: - * gcc/clang/cparser consider 0xe-3 as a fp constant and reject it. - * Right now we do not, but it may be reconsidered. - *) - fun numParser NpInit (stream, _) (SOME c) = - if c = #"0" then - let - val (c, stream') = Stream.getchar stream - in - case c of - NONE => - (NpInit, SOME $ Num $ IntConst (ItOct, "", IsNone), stream') - | SOME c => - if Char.toLower c = #"x" then - (IntMode ImHex, NONE, stream') - else - (IntMode ImOct, NONE, stream) - end - else - (IntMode ImDec, NONE, stream) - | numParser NpInit _ NONE = raise Unreachable - | numParser (IntMode mode) (stream, startOff) c = - let - val (pred, res, offset) = - case mode of - ImDec => (isDigit, ItDec, 0) - | ImOct => (isOctal, ItOct, 1) - | ImInvalidOct => (isDigit, ItOct, 1) - | ImHex => (isHexDigit, ItHex, 2) - - fun checkAndRaise m msg = - if mode = m then raise TkError (TkiStart, msg) else () - - fun finish () = - let - val () = checkAndRaise ImInvalidOct "invalid octal constant" - val stream = Stream.ungetc stream - val str = finishSeqRead (startOff + offset) stream - val (sfx, stream) = getIntSuffix stream - in - (IntMode mode, SOME $ Num $ IntConst (res, str, sfx), stream) - end - in - case c of - NONE => finish () - | SOME c => - if pred c then - (IntMode mode, NONE, stream) - else if c = #"." then - (FloatMode FmDot, NONE, stream) - else if Char.toLower c = #"e" then - (checkAndRaise ImHex - "floating constant can not come with 0x prefix"; - (FloatMode FmExp, NONE, stream)) - else - finish () - end - | numParser (FloatMode FmDot) (stream, startOff) _ = - let - val (len, stream) = skipDigitSeq 0 $ Stream.ungetc stream - - val (c, stream') = Stream.getchar stream - - fun finish () = - let - val str = finishSeqRead startOff stream - val (sfx, stream) = getFloatSuffix len stream - in - (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) - end - in - case c of - NONE => finish () - | SOME c => - if Char.toLower c = #"e" then - (FloatMode FmExp, NONE, stream') - else - finish () - end - | numParser (FloatMode FmExp) (stream, startOff) c = - let - val (off, stream) = - if c = NONE then - raise TkError (TkiDx 0, "expected digit") - else if valOf c <> #"+" andalso valOf c <> #"-" then - (0, Stream.ungetc stream) - else - (~1, stream) - - val (len, stream) = skipDigitSeq off stream - val str = finishSeqRead startOff stream - val (sfx, stream) = getFloatSuffix len stream - in - (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) - end - - fun chrIntVal c = - if isDigit c then - ord c - ord #"0" - else if ord c >= ord #"a" andalso ord c <= ord #"z" then - ord c - ord #"a" + 10 - else if ord c >= ord #"A" andalso ord c <= ord #"Z" then - ord c - ord #"A" + 10 - else - raise Unreachable - - fun parseOctalSeq stream c = - let - fun follow stream acc count = - if count = 3 then - (SOME $ chr acc, stream) - else - let - val (c, stream) = Stream.getchar stream - in - case c of - NONE => (SOME $ chr acc, stream) - | SOME c => - if isOctal c then - follow stream (acc * 8 + chrIntVal c) (count + 1) - else - (SOME $ chr acc, Stream.ungetc stream) - end - in - follow stream (chrIntVal c) 1 - end - - fun parseHexSeq stream = - let - fun follow stream acc count = - let - val (c, stream) = Stream.getchar stream - - val noHex = TkError (TkiDx 0, "\\x without hex digits") - in - case c of - NONE => - if count = 0 then - raise noHex - else - (SOME $ chr acc, stream) - | SOME c => - if isHexDigit c then - if count = 2 then - raise TkError (TkiDx 2, "hex sequence out of range") - else - follow stream (acc * 16 + chrIntVal c) (count + 1) - else - if count = 0 then - raise noHex - else - (SOME $ chr acc, Stream.ungetc stream) - end - in - follow stream 0 0 - end - - fun eatEscSeq stream = - let - fun raiseErr0 msg = raise TkError (TkiDx 0, msg) - - val (c, stream) = Stream.getchar stream - val c = - case c of - NONE => raiseErr0 "unfinished escape sequence" - | SOME c => c - - fun & c = (SOME c, stream) - in - case c of - #"'" => & #"'" - | #"\"" => & #"\"" - | #"?" => & #"?" - | #"\\" => & #"\\" - | #"a" => & #"\a" - | #"b" => & #"\b" - | #"f" => & #"\f" - | #"n" => & #"\n" - | #"r" => & #"\r" - | #"t" => & #"\t" - | #"v" => & #"\v" - | #"\n" => (NONE, stream) - | #"x" => parseHexSeq stream - | c => - if isOctal c then - parseOctalSeq stream c - else - raiseErr0 "unknown escape sequence" - end - - datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm - - datatype seqParseMode = SpmChr | SpmStr - - fun seqBound SpmChr = #"'" - | seqBound SpmStr = #"\"" - - fun seqExnConv mode (TkError (v, msg)) = - let - val bound = if mode = SpmChr then "'" else "\"" - val msg = - String.translate (fn c => if c = #"%" then bound else str c) msg - in - TkError (v, msg) - end - | seqExnConv _ _ = raise Unreachable - - fun unfinishedSeq SpmChr = "unfinished character constant" - | unfinishedSeq SpmStr = "unfinished string literal" - - fun seqParser mode SeqInit (stream, _) (SOME c) = - if seqBound mode = c then - (SeqStart, NONE, stream) - else - raise Unreachable - | seqParser mode SeqStart (stream, _) (SOME c) = - if c <> seqBound mode then - let - val (c, stream) = - if c <> #"\\" then (SOME c, stream) else eatEscSeq stream - in - if c = NONE then - (SeqStart, NONE, stream) - else - (SeqValue (ord $ Option.valOf c), NONE, stream) - end - else if mode = SpmStr then - (SeqTerm, SOME $ StringConst "", stream) - else - raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %") - | seqParser mode (SeqValue v) (stream, startOff) (SOME c) = - if seqBound mode = c then - let - fun term s v = - if mode = SpmChr then - CharConst (s, v) - else - StringConst $ String.extract (s, 1, SOME $ String.size s - 2) - in - (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream) - end - else if mode = SpmStr then - let - val (_, stream) = - if c <> #"\\" then (SOME c, stream) else eatEscSeq stream - in - (SeqValue v, NONE, stream) - end - else - raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value") - | seqParser _ SeqTerm _ (SOME _) = - raise Unreachable - | seqParser mode state (_, _) NONE = - raise case state of - SeqInit => Unreachable - | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) - | SeqValue _ => - seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) - | SeqTerm => Unreachable - - val charParser = seqParser SpmChr - val strParser = seqParser SpmStr - - fun formCppDir (Id s) = - let - open String - in - case List.find - (fn (_, repr) => - sub (repr, 0) = cppPrefix andalso - extract (repr, 1, NONE) = s) - tokenRepr - of - SOME (tk, _) => tk - | NONE => raise ExpectedCppDir - end - | formCppDir kwElse = CppElse - | formCppDir _ = raise ExpectedCppDir - - fun handleCppDir tk prevPos stream = - let - val (pos, tk') = tk - in - (prevPos, formCppDir tk') handle - ExpectedCppDir => - raise TkErrorAug (Stream.pos2ppos pos stream, - "expected preprocessor directive") - end - - fun unexpectedCharRaise stream c = - let - open Stream - val pos = pos2ppos (getPosAfterCharRead stream) stream - val repr = - if isPrintable c then - str c - else - "<" ^ Int.toString (ord c) ^ ">" - in - raise TkErrorAug (pos, "unexpected character " ^ repr) - end - - fun skipComment stream pos = - let - fun skip prevIsAsterisk stream = - let - val (c, stream) = - case Stream.getchar stream of - (NONE, _) => - let - val pos = Stream.pos2ppos pos stream - in - raise TkErrorAug (pos, "unfinished comment") - end - | (SOME c, stream) => (c, stream) - in - if prevIsAsterisk andalso c = #"/" then - stream - else - skip (c = #"*") stream - end - in - skip false stream - end - - fun handleBackslash stream = - let - val (c, stream) = Stream.getchar stream - - val raiseErr = fn () => - let - val pos = Stream.getPosAfterCharRead stream - val pos = Stream.pos2ppos pos stream - in - raise TkErrorAug (pos, "expected \\n after backslash") - end - in - case c of - SOME c => - if c = #"\n" then - stream - else - raiseErr () - | NONE => raiseErr () - end - - fun processSymbol stream = - let - val (T as (p, tk), stream) = fsmEat $ Stream.ungetc stream - in - case tk of - CommentStart => getToken $ skipComment stream p - | DoubleDot => (SOME (p, Dot), Stream.ungetc stream) - | Hash => - if Stream.isFirstOnLine p stream then - let - val (tk, stream) = getToken stream - in - case tk of - NONE => - raise TkErrorAug (Stream.pos2ppos p stream, - "unfinished preprecessor directive") - | SOME tk => - (SOME $ handleCppDir tk p stream, stream) - end - else - (SOME T, stream) - | _ => (SOME T, stream) - end - - and getToken stream = - let - val (c, stream) = Stream.getchar stream - - fun @-> parser acc = - (fn (tk, s) => (SOME tk, s)) $ parseGeneric stream parser acc - in - case c of - NONE => (NONE, stream) - | SOME c => - if c = #"\n" then - (SOME (Stream.getPosAfterCharRead stream, NewLine), stream) - else if Char.isSpace c then - getToken stream - else if isIdStart c then - @-> idParser () - else if isDigit c then - @-> numParser NpInit - else if c = #"'" then - @-> charParser SeqInit - else if c = #"\"" then - @-> strParser SeqInit - else if isStartForFsm c then - processSymbol stream - else if c = #"\\" then - getToken $ handleBackslash stream - else - unexpectedCharRaise stream c - end - - fun tokenize stream = - let - fun aux acc stream = - let - val (tk, stream) = getToken stream - in - case tk of - NONE => rev acc - | SOME tk => aux (tk :: acc) stream - end - in - aux [] stream - end - - fun debugPrint tkl fname = - let - fun print' line _ ((_, NewLine) :: tks) = - print' (line + 1) true tks - | print' line firstOnLine ((_, tk) :: tks) = ( - if firstOnLine then ( - print "\n"; - printLn $ fname ^ ":" ^ Int.toString line; - print "\t") - else - (); - printToken tk; - print " "; - print' line false tks - ) - | print' _ _ [] = () - in - print' 1 true tkl; - print "\n" - end -end -- cgit v1.2.3