functor Tokenizer(structure ST: SYMTAB; structure S: STREAM) : TOKENIZER = struct structure ST = ST structure S = S datatype token = Invalid | EOS | NewLine | MacroEnd of int | Id of int | CharConst of int * Word64.word | Num of int | Strlit of int * int | kwBreak | kwCase | kwChar | kwConst | kwContinue | kwDefault | kwDo | kwDouble | kwElse | kwEnum | kwExtern | kwFloat | kwFor | kwGoto | kwIf | kwInt | kwLong | kwRegister | kwReturn | kwShort | kwSigned | kwSizeof | kwStatic | kwStruct | kwSwitch | kwTypedef | kwUnion | kwUnsigned | kwVoid | kwVolatile | kwWhile | LParen | RParen | LBracket | RBracket | LBrace | RBrace | QuestionMark | Colon | Comma | Semicolon | Arrow | Plus | DoublePlus| Minus | DoubleMinus | Ampersand | Asterisk | Slash | Tilde | ExclMark | Percent | DoubleGreater | DoubleLess | Greater | Less | EqualSign | LessEqualSign | GreaterEqualSign | DoubleEqualSign | ExclMarkEqualSign | Cap | VerticalBar | DoubleAmpersand | DoubleVerticalBar | AsteriskEqualSign | SlashEqualSign | PercentEqualSign | PlusEqualSign | MinusEqualSign | DoubleLessEqualSign | DoubleGreaterEqualSign | AmpersandEqualSign | CapEqualSign | VerticalBarEqualSign | Hash | DoubleHash | Dot | DoubleDot | TripleDot | CommentStart | PpcInclude of string * string | PpcDefine | PpcUndef | PpcIf | PpcIfdef | PpcIfndef | PpcElse | PpcElif | PpcEndif | PpcWarning | PpcError | PpcPragma exception TkError of int * string fun error pos msg = (printf `"\n" S.Ppos pos `": " `msg `"\n" %; exit 1) exception TokenWithoutRepr val kwPrefix = #"`" val ppcPrefix = #"$" val otherPrefix = #"@" val tokenRepr = let fun & repr = str kwPrefix ^ repr fun % repr = str ppcPrefix ^ repr fun ` repr = str otherPrefix ^ repr in [ (NewLine, `"NewLine"), (EOS, `"EOS"), (Invalid, `"Invalid"), (kwBreak, &"break"), (kwCase, &"case"), (kwChar, &"char"), (kwConst, &"const"), (kwContinue, &"continue"), (kwDefault, &"default"), (kwDo, &"do"), (kwDouble, &"double"), (kwElse, &"else"), (kwEnum, &"enum"), (kwExtern, &"extern"), (kwFloat, &"float"), (kwFor, &"for"), (kwGoto, &"goto"), (kwInt, &"int"), (kwIf, &"if"), (kwLong, &"long"), (kwRegister, &"register"), (kwReturn, &"return"), (kwShort, &"short"), (kwSigned, &"signed"), (kwSizeof, &"sizeof"), (kwStatic, &"static"), (kwStruct, &"struct"), (kwSwitch, &"switch"), (kwTypedef, &"typedef"), (kwUnion, &"union"), (kwUnsigned, &"unsigned"), (kwVoid, &"void"), (kwVolatile, &"volatile"), (kwWhile, &"while"), (LParen, "("), (RParen, ")"), (LBracket, "["), (RBracket, "]"), (LBrace, "{"), (RBrace, "}"), (QuestionMark, "?"), (Colon, ":"), (Comma, ","), (Semicolon, ";"), (Arrow, "->"), (Plus, "+"), (DoublePlus, "++"), (Minus, "-"), (DoubleMinus, "--"), (Ampersand, "&"), (Asterisk, "*"), (Slash, "/"), (Tilde, "~"), (ExclMark, "!"), (Percent, "%"), (DoubleLess, "<<"), (DoubleGreater, ">>"), (Less, "<"), (Greater, ">"), (EqualSign, "="), (LessEqualSign, "<="), (GreaterEqualSign, ">="), (DoubleEqualSign, "=="), (ExclMarkEqualSign, "!="), (Cap, "^"), (VerticalBar, "|"), (DoubleAmpersand, "&&"), (DoubleVerticalBar, "||"), (AsteriskEqualSign, "*="), (SlashEqualSign, "/="), (PercentEqualSign, "%="), (PlusEqualSign, "+="), (MinusEqualSign, "-="), (DoubleLessEqualSign, "<<="), (DoubleGreaterEqualSign, ">>="), (AmpersandEqualSign, "&="), (CapEqualSign, "^="), (VerticalBarEqualSign, "|="), (Hash, "#"), (DoubleHash, "##"), (Dot, "."), (DoubleDot, ".."), (TripleDot, "..."), (CommentStart, "/*"), (PpcDefine, %"define"), (PpcUndef, %"undef"), (PpcIf, %"if"), (PpcIfdef, %"ifdef"), (PpcIfndef, %"ifndef"), (PpcElse, %"else"), (PpcElif, %"elif"), (PpcEndif, %"endif"), (PpcWarning, %"warning"), (PpcError, %"error"), (PpcPragma, %"pragma") ] end fun initSymtab () = let val symtab = ST.init () fun ?s = String.sub (s, 0) fun rest s = String.extract (s, 1, NONE) fun clPpc tk (NONE, kw) = (SOME tk, kw) | clPpc _ _ = raise Unreachable fun clKw tk (ppc, NONE) = (ppc, SOME tk) | clKw _ _ = raise Unreachable val res = ST.getId symtab "0" val () = if res <> 0 then raise Unreachable else () in app (fn (tk, repr) => if ?repr = ppcPrefix then ignore $ ST.insert symtab (rest repr) (clPpc tk) else if ?repr = kwPrefix then ignore $ ST.insert symtab (rest repr) (clKw tk) else ()) tokenRepr; symtab end fun ptk symtab tk out = let val ? = fn z => let fun f id out = Printf out `(ST.getStr symtab id) % in bind A1 f end z in case tk of Id id | Num id => Printf out ?id % | MacroEnd mid => Printf out `"mend(" ?mid `")" % | NewLine => Printf out `"\\n" % | PpcInclude (dir, arg) => Printf out `"#include(" `dir `", " `arg `")" % | CharConst (repr, _) => Printf out ?repr % | Strlit (id, _) => Printf out ?id % | v => case List.find (fn (x, _) => x = v) tokenRepr of SOME (_, repr) => let val head = String.sub (repr, 0) val head = if head = ppcPrefix then #"#" else head val tail = String.extract (repr, 1, NONE) in Printf out C head `tail % end | NONE => raise TokenWithoutRepr end val Ptk = fn z => bind A2 ptk z fun isNondigit c = Char.isAlpha c orelse c = #"_" fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8" val isDigit = Char.isDigit val isHexDigit = Char.isHexDigit fun isPrintable c = Char.isPrint c andalso c <> #" " (* FSM for parsing symbols *) val maxStates = 51 fun fsmInsert (nextState, buf) curState tk (c :: cs) = let open Array val (_, row) = sub (buf, curState) val potNextState = sub (row, ord c) in if potNextState <> ~1 then fsmInsert (nextState, buf) potNextState tk cs else ( update (row, ord c, !nextState); nextState := !nextState + 1; fsmInsert (nextState, buf) (!nextState - 1) tk cs ) end | fsmInsert (_, buf) curState tk [] = let open Array val (_, row) = sub (buf, curState) in update (buf, curState, (tk, row)) end fun isStartForFsmGen () = let open Array val lookupTable = array (128, false) fun firstChr s = ord $ String.sub (s, 0) val () = List.app (fn (_, repr) => update (lookupTable, firstChr repr, true)) $ List.filter (fn (_, repr) => let val c = String.sub (repr, 0) in c <> kwPrefix andalso c <> ppcPrefix andalso c <> otherPrefix end) tokenRepr in fn c => sub (lookupTable, ord c) end val isStartForFsm = isStartForFsmGen () fun fsmTableCreate () = let open Array val T as (nextState, buf) = (ref 1, array (maxStates, (Invalid, array (128, ~1)))) val r = ref 1 fun filterNeeded [] acc = acc | filterNeeded ((T as (_, repr)) :: tks) acc = filterNeeded tks (if isStartForFsm $ String.sub (repr, 0) then T :: acc else acc) val tokenRepr = filterNeeded tokenRepr [] fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle Subscript => die 2 `"fsm table is too small. Increate 'maxState' value\n" % in while !r < length buf do ( update (buf, !r, (Invalid, array(128, ~1))); r := !r + 1 ); List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; if !nextState <> maxStates then printf `"note: Fsm table size can be smaller: " I (!nextState) `" is enough" % else (); T end fun printTable symtab (nextState, buf) = let open Array fun printRow i row = if i = length row then printf `"\n" % else let val state = sub (row, i) in if state = ~1 then () else printf C (chr i) `" -> " I state `", " %; printRow (i + 1) row end fun print' rowNum buf = if rowNum = !nextState then () else let val (tk, row) = sub (buf, rowNum) in printf `"row " I rowNum `" - " Ptk symtab tk `": \t" %; printRow 0 row; print' (rowNum + 1) buf end in printf `"FSM table:\n"; printf `"NextState: " I (!nextState) `"\n" %; print' 0 buf; printf `"\n" % end val fsmTable = fsmTableCreate () fun fsmEat stream = let open Array val (pos, stream) = S.getPos stream fun get curState stream = let val (c, stream) = S.getchar stream in if c = #"\000" then (#1 $ sub (#2 $ fsmTable, curState), stream) else let val (tk, row) = sub (#2 $ fsmTable, curState) val nextState = sub (row, ord c) in if nextState = ~1 then (tk, S.ungetc stream) else get nextState stream end end val (tk, stream) = get 0 stream in (tk, pos, stream) end datatype chr = Reg of char | EscSeqed of char | NoChar fun chrIntVal c = if isDigit c then ord c - ord #"0" else if ord c >= ord #"a" andalso ord c <= ord #"z" then ord c - ord #"a" + 10 else if ord c >= ord #"A" andalso ord c <= ord #"Z" then ord c - ord #"A" + 10 else raise Unreachable fun parseOctalSeq stream c = let fun follow stream acc count = if count = 3 then (EscSeqed $ chr acc, stream) else let val (c, stream) = S.getchar stream in if c = #"\000" then (EscSeqed $ chr acc, stream) else if isOctal c then follow stream (acc * 8 + chrIntVal c) (count + 1) else (EscSeqed $ chr acc, S.ungetc stream) end in follow stream (chrIntVal c) 1 end fun parseHexSeq stream = let fun follow stream acc count = let val (c, stream) = S.getchar stream val noHex = TkError (0, "\\x without hex digits") in if c = #"\000" then if count = 0 then raise noHex else (EscSeqed $ chr acc, stream) else if isHexDigit c then if count = 2 then raise TkError (2, "hex sequence out of range") else follow stream (acc * 16 + chrIntVal c) (count + 1) else if count = 0 then raise noHex else (EscSeqed $ chr acc, S.ungetc stream) end in follow stream 0 0 end fun eatEscSeq stream = let fun raiseErr0 msg = raise TkError (0, msg) val (c, stream) = S.getchar stream val c = if c = #"\000" then raiseErr0 "unfinished escape sequence" else c fun & c = (EscSeqed c, stream) in case c of #"'" => & #"'" | #"\"" => & #"\"" | #"?" => & #"?" | #"\\" => & #"\\" | #"a" => & #"\a" | #"b" => & #"\b" | #"f" => & #"\f" | #"n" => & #"\n" | #"r" => & #"\r" | #"t" => & #"\t" | #"v" => & #"\v" | #"\n" => (NoChar, stream) | #"x" => parseHexSeq stream | c => if isOctal c then parseOctalSeq stream c else raiseErr0 "unknown escape sequence" end handle TkError (dx, msg) => let val offset = S.getOffset stream + dx val (pos, _) = S.getPosRaw offset stream in error pos msg end fun getMaybeBackslashed stream = let fun getMaybeBackslashed' stream = let val (c, stream) = S.getchar stream in if c = #"\\" then eatEscSeq stream else (Reg c, stream) end val (c, stream) = getMaybeBackslashed' stream in case c of NoChar => getMaybeBackslashed stream | _ => (c, stream) end fun strlit2charList (s: string) = let val s = String.substring (s, 1, size s - 2) val stream = S.createFromString s fun collect acc stream = let val (c, stream) = getMaybeBackslashed stream in case c of Reg #"\000" => rev $ #"\000" :: acc | Reg c | EscSeqed c => collect (c :: acc) stream | NoChar => raise Unreachable end in collect [] stream end fun parseCharConst symtab stream = let val startOff = S.getOffset stream - 1 val (pos, stream) = S.getPosRaw startOff stream val (chr, stream1) = getMaybeBackslashed stream val v = case chr of Reg #"\000" => error (S.getPosDisc stream) "expected character" | Reg c | EscSeqed c => Word64.fromInt $ ord c | NoChar => raise Unreachable val (c, stream2) = getMaybeBackslashed stream1 val () = case c of Reg #"'" => () | _ => error (S.getPosDisc stream1) "expected '" val endOff = S.getOffset stream2 val repr = S.getSubstr startOff endOff stream2 val id = ST.getId symtab repr in (CharConst (id, v), pos, stream2) end fun parseStrlit symtab stream = let val startOff = S.getOffset stream - 1 val (pos, stream) = S.getPosRaw startOff stream fun collect size stream = let val (c, stream) = getMaybeBackslashed stream in case c of Reg #"\000" => error pos "unfinished string literal" | Reg #"\"" => (S.getOffset stream, size, stream) | _ => collect (size + 1) stream end val (endOff, size, stream) = collect 0 stream val s = S.getSubstr startOff endOff stream val id = ST.getId symtab s in (Strlit (id, size), pos, stream) end fun parseId symtab _ stream = let val startOff = S.getOffset stream - 1 fun collect stream = let val (c, stream') = S.getchar stream in if isNondigit c orelse isDigit c then collect stream' else (S.getOffset stream, stream) end val (endOff, stream) = collect stream val id = S.getSubstr startOff endOff stream val id = ST.getId symtab id in (Id id, stream) end fun parseNumber symtab dx stream = let fun collect stream = let val (c, stream') = S.getchar stream val (c1, stream2) = S.getchar stream' in if isDigit c orelse c = #"." then collect stream' else if Char.toLower c = #"e" andalso (c1 = #"+" orelse c1 = #"-") then collect stream2 else if isNondigit c then collect stream' else (S.getOffset stream, stream) end val startOff = S.getOffset stream - dx val (pos, stream) = S.getPosRaw startOff stream val (endOff, stream) = collect stream val s = S.getSubstr startOff endOff stream val id = ST.getId symtab s in (Num id, pos, stream) end fun getDir stream = OS.Path.getParent o S.getFname $ stream fun completePpcInclude (S.Pos (fname, line, _)) stream = let val pos = S.Pos (fname, line, 1) val (line, stream) = S.getLine stream in case line of SOME line => (PpcInclude (getDir stream, line), stream) | NONE => error pos "line does not end with '\\n'" end fun isPpcDir (PpcInclude _) = true | isPpcDir tk = case List.find (fn (tk', _) => tk' = tk) tokenRepr of SOME (_, repr) => String.sub (repr, 0) = ppcPrefix | NONE => false fun handlePpcDir symtab (tk, pos) stream = let open String val error = fn () => error pos "expected preprocessor directive" fun getById id = case ST.isPpcDir symtab id of SOME tk => (tk, stream) | NONE => if ST.getStr symtab id = "include" then completePpcInclude pos stream else error () in case tk of Id id => getById id | _ => error () end fun unexpectedCharRaise stream c = let val (pos, _) = S.getPosAfterChar stream val repr = if isPrintable c then str c else "<" ^ Int.toString (ord c) ^ ">" in error pos ("unexpected character " ^ repr) end fun skipComment stream pos = let fun skip prevIsAsterisk stream = let val (c, stream) = S.getchar stream val () = if c = #"\000" then error pos "unfinished comment" else () in if prevIsAsterisk andalso c = #"/" then stream else skip (c = #"*") stream end in skip false stream end fun handleBackslash stream = let val (c, stream) = S.getchar stream val raiseErr = fn () => let val (pos, _) = S.getPosAfterChar stream in error pos "expected \\n after backslash" end in if c = #"\000" then raiseErr () else if c = #"\n" then stream else raiseErr () end fun processSymbol symtab stream = let val (tk, pos, stream) = fsmEat $ S.ungetc stream in case tk of CommentStart => getToken symtab $ skipComment stream pos | DoubleDot => (Dot, pos, S.ungetc stream) | Hash => if S.isFirstOnLine stream (S.getOffset stream - 1) then let val (tk, pos', stream) = getToken symtab stream in if tk = EOS then error pos "unfinished preprecessor directive" else let val (tk, stream) = handlePpcDir symtab (tk, pos') stream in (tk, pos, stream) end end else (tk, pos, stream) | _ => (tk, pos, stream) end and getToken symtab stream = let val (c, stream) = S.getchar stream val (c1, _) = S.getchar stream fun conv tk (pos, stream) = (tk, pos, stream) fun @-> parser = let val (pos, stream) = S.getPosAfterChar stream val (tk, stream) = parser pos stream in (tk, pos, stream) end in if c = #"\000" then conv EOS $ S.EOFpos stream else if c = #"\n" then conv NewLine $ S.getPosAfterChar stream else if Char.isSpace c then getToken symtab stream else if isNondigit c then @-> $ parseId symtab else if isDigit c then parseNumber symtab 1 stream else if c = #"." andalso isDigit c1 then parseNumber symtab 2 stream else if c = #"'" then parseCharConst symtab stream else if c = #"\"" then parseStrlit symtab stream else if isStartForFsm c then processSymbol symtab stream else if c = #"\\" then getToken symtab $ handleBackslash stream else unexpectedCharRaise stream c end fun debugPrint fname = let val stream = S.create fname val symtab = initSymtab () fun print line stream = let val (tk, S.Pos (_, line', col'), stream) = getToken symtab stream in if line <> line' then printf `"\nline " I line' `": \t" % else (); printf I col' `":" Ptk symtab tk `" "; if tk = EOS then () else print line' stream end in printTable symtab $ fsmTable; printf `"Tokenizing file: " `fname; print 0 stream; printf `"\n" % end end