structure Tokenizer:> TOKENIZER = struct datatype intType = ItDec | ItOct | ItHex datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL datatype floatSfx = FsNone | FsF | FsL datatype numConst = IntConst of intType * string * intSfx | FloatConst of string * floatSfx datatype token = Invalid | NewLine | Num of numConst | Id of string | CharConst of string * int | StringConst of string | kwBreak | kwCase | kwChar | kwConst | kwContinue | kwDefault | kwDouble | kwElse | kwEnum | kwExtern | kwFloat | kwFor | kwGoto | kwInt | kwLong | kwRegister | kwReturn | kwShort | kwSigned | kwSizeof | kwStruct | kwSwitch | kwTypedef | kwUnion | kwUnsigned | kwVoid | kwVolatile | LParen | RParen | LBracket | RBracket | LBrace | RBrace | QuestionMark | Colon | Coma | Semicolon | Arrow | Plus | DoublePlus| Minus | DoubleMinus | Ampersand | Asterisk | Slash | Tilde | ExclMark | Percent | DoubleGreater | DoubleLess | Greater | Less | EqualSign | LessEqualSign | GreaterEqualSign | DoubleEqualSign | ExclMarkEqualSign | Cap | VerticalBar | DoubleAmpersand | DoubleVerticalBar | AsteriskEqualSign | SlashEqualSign | PercentEqualSign | PlusEqualSign | MinusEqualSign | DoubleLessEqualSign | DoubleGreaterEqualSign | AmpersandEqualSign | CapEqualSign | VerticalBarEqualSign | Hash | DoubleHash | Dot | DoubleDot | TripleDot | CommentStart | CppInclude | CppDefine | CppUndef | CppIf | CppIfdef | CppIfndef | CppElse | CppElif | CppEndif | CppWarning | CppError | CppPragma val kwPrefix = #"@" val cppPrefix = #"$" type fullToken = Stream.pos * token datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart exception TkError of tkErrorAuxInfo * string exception TkErrorAug of Stream.ppos * string exception ExpectedCppDir (* handled in postprocess *) exception FsmTableIsTooSmall (* Unreachable (should be) *) exception TokenWithoutRepr exception SuffixWithoutRepr val tokenRepr = let fun & repr = str kwPrefix ^ repr fun % repr = str cppPrefix ^ repr in [ (NewLine, "NewLine"), (kwBreak, &"break"), (kwCase, &"case"), (kwChar, &"char"), (kwConst, &"const"), (kwContinue, &"continue"), (kwDefault, &"default"), (kwDouble, &"double"), (kwElse, &"else"), (kwEnum, &"enum"), (kwExtern, &"extern"), (kwFloat, &"float"), (kwFor, &"for"), (kwGoto, &"goto"), (kwInt, &"int"), (kwLong, &"long"), (kwRegister, &"register"), (kwReturn, &"return"), (kwShort, &"short"), (kwSigned, &"signed"), (kwSizeof, &"sizeof"), (kwStruct, &"struct"), (kwSwitch, &"switch"), (kwTypedef, &"typedef"), (kwUnion, &"union"), (kwUnsigned, &"unsigned"), (kwVoid, &"void"), (kwVolatile, &"volatile"), (LParen, "("), (RParen, ")"), (LBracket, "["), (RBracket, "]"), (LBrace, "{"), (RBrace, "}"), (QuestionMark, "?"), (Colon, ":"), (Coma, ","), (Semicolon, ";"), (Arrow, "->"), (Plus, "+"), (DoublePlus, "++"), (Minus, "-"), (DoubleMinus, "--"), (Ampersand, "&"), (Asterisk, "*"), (Slash, "/"), (Tilde, "~"), (ExclMark, "!"), (Percent, "%"), (DoubleLess, "<<"), (DoubleGreater, ">>"), (Less, "<"), (Greater, ">"), (EqualSign, "="), (LessEqualSign, "<="), (GreaterEqualSign, ">="), (DoubleEqualSign, "=="), (ExclMarkEqualSign, "!="), (Cap, "^"), (VerticalBar, "|"), (DoubleAmpersand, "&&"), (DoubleVerticalBar, "||"), (AsteriskEqualSign, "*="), (SlashEqualSign, "/="), (PercentEqualSign, "%="), (PlusEqualSign, "+="), (MinusEqualSign, "-="), (DoubleLessEqualSign, "<<="), (DoubleGreaterEqualSign, ">>="), (AmpersandEqualSign, "&="), (CapEqualSign, "^="), (VerticalBarEqualSign, "|="), (Hash, "#"), (DoubleHash, "##"), (Dot, "."), (DoubleDot, ".."), (TripleDot, "..."), (CommentStart, "/*"), (CppInclude, %"include"), (CppDefine, %"define"), (CppUndef, %"undef"), (CppIf, %"if"), (CppIfdef, %"ifdef"), (CppIfndef, %"ifndef"), (CppElse, %"else"), (CppElif, %"elif"), (CppEndif, %"endif"), (CppWarning, %"warning"), (CppError, %"error"), (CppPragma, %"pragma") ] end val intSuffixRepr = [ (IsNone, ""), (IsU, "u"), (IsL, "l"), (IsUL, "ul"), (IsLL, "ll"), (IsULL, "ull") ] val floatSuffixRepr = [ (FsNone, ""), (FsF, "f"), (FsL, "l") ] fun getSfxRepr sfx buf onError = case List.find (fn (sfx', _) => sfx' = sfx) buf of NONE => onError () | SOME (_, repr) => repr fun getSfxReprSimple sfx buf = getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr) val printToken = fn Id s => print $ "id:" ^ s | Num (IntConst (it, str, sfx)) => let val intType = case it of ItDec => "" | ItOct => "0" | ItHex => "0x" in print intType; print str; print $ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`" end | Num (FloatConst (str, sfx)) => ( print str; print $ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`" ) | CharConst (repr, _) => print repr | StringConst s => print $ "\"" ^ s ^ "\"" | v => case List.find (fn (x, _) => x = v) tokenRepr of SOME (_, repr) => print repr | NONE => raise TokenWithoutRepr fun isIdStart c = Char.isAlpha c orelse c = #"_" fun isIdBody c = Char.isAlphaNum c orelse c = #"_" fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8" val isDigit = Char.isDigit val isHexDigit = Char.isHexDigit fun isPrintable c = Char.isPrint c andalso c <> #" " (* FSM for parsing symbols *) val maxStates = 51 fun fsmInsert (nextState, buf) curState tk (c :: cs) = let open Array val (_, row) = sub (buf, curState) val potNextState = sub (row, ord c) in if potNextState <> ~1 then fsmInsert (nextState, buf) potNextState tk cs else ( update (row, ord c, !nextState); nextState := !nextState + 1; fsmInsert (nextState, buf) (!nextState - 1) tk cs ) end | fsmInsert (_, buf) curState tk [] = let open Array val (_, row) = sub (buf, curState) in update (buf, curState, (tk, row)) end fun isStartForFsmGen () = let open Array val lookupTable = array (128, false) fun firstChr s = ord $ String.sub (s, 0) val () = List.app (fn (_, repr) => update (lookupTable, firstChr repr, true)) $ List.filter (fn (tk, repr) => let val c = String.sub (repr, 0) in c <> kwPrefix andalso c <> cppPrefix andalso tk <> NewLine end) tokenRepr in fn c => sub (lookupTable, ord c) end val isStartForFsm = isStartForFsmGen () fun fsmTableCreate () = let open Array val T as (nextState, buf) = (ref 1, array (maxStates, (Invalid, array (128, ~1)))) val r = ref 1 fun filterNeeded [] acc = acc | filterNeeded ((T as (_, repr)) :: tks) acc = filterNeeded tks (if isStartForFsm $ String.sub (repr, 0) then T :: acc else acc) val tokenRepr = filterNeeded tokenRepr [] fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle Subscript => raise FsmTableIsTooSmall in while !r < length buf do ( update (buf, !r, (Invalid, array(128, ~1))); r := !r + 1 ); List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; if !nextState <> maxStates then printLn $ "note: Fsm table size can be smaller: " ^ Int.toString (!nextState) ^ " is enough" else (); T end (* Unused right now fun printTable (nextState, buf) = let fun printRow i row = if i = length row then print "\n" else let val state = sub (row, i) in if state = ~1 then () else print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", "); printRow (i + 1) row end fun print' rowNum buf = if rowNum = !nextState then () else let val (tk, row) = sub (buf, rowNum) in print ((token2string tk) ^ ": "); printRow 0 row; print' (rowNum + 1) buf end in print ("NextState: " ^ Int.toString (!nextState) ^ "\n"); print' 0 buf; print "\n" end *) val fsmTable = lazy fsmTableCreate fun fsmEat stream = let open Array val pos = Stream.getPos stream fun get curState stream = let val (c, stream) = Stream.getchar stream in case c of NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream) | SOME c => let val (tk, row) = sub (#2 $ fsmTable (), curState) val nextState = sub (row, ord c) in if nextState = ~1 then (tk, Stream.ungetc stream) else get nextState stream end end in (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream end fun tkError2aug stream (dx, msg) = let val (id, pos) = Stream.getPosAfterCharRead stream val pos = Stream.pos2ppos (id, pos + dx) stream in TkErrorAug (pos, msg) end fun parseGeneric stream parser acc = let val stream = Stream.ungetc stream val P as (_, startOff) = Stream.getPos stream fun parse' stream acc = let val (c, stream) = Stream.getchar stream val (acc, tk, stream) = parser acc (stream, startOff) c handle TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg) | TkError (TkiStart, msg) => let val startPos = Stream.pos2ppos P stream in raise TkErrorAug (startPos, msg) end | TkError (TkiEOF, msg) => let open Stream val pos = pposWithoutCol $ pos2ppos P stream in raise TkErrorAug (pos, msg) end in case tk of NONE => parse' stream acc | _ => (valOf tk, stream) end val (tk, stream) = parse' stream acc in ((P, tk): fullToken, stream) end fun finishSeqRead startOff stream = let val (_, endOff) = Stream.getPos stream val s = Stream.getSubstr startOff endOff stream in s end fun keywordHashtableGen () = let open Hashtable val table = create 128 val () = List.app (fn (tk, repr) => if String.sub (repr, 0) = kwPrefix then insert table (String.extract (repr, 1, NONE)) tk else ()) tokenRepr in table end val keywordHashtable = lazy keywordHashtableGen fun findKeyword str = case Hashtable.lookup (keywordHashtable ()) str of NONE => Id str | SOME tk => tk fun idParser () (stream, startOff) c = let fun finalize stream = let val s = finishSeqRead startOff stream val tk = findKeyword s in ((), SOME tk, stream) end in case c of NONE => finalize stream | SOME c => if isIdBody c then ((), NONE, stream) else finalize (Stream.ungetc stream) end datatype intMode = ImDec | ImOct | ImInvalidOct | ImHex datatype floatMode = FmDot | FmExp datatype npState = NpInit | IntMode of intMode | FloatMode of floatMode fun getLongestSeq acc pred stream = let val (c, stream') = Stream.getchar stream in if isSome c andalso pred $ valOf c then getLongestSeq (valOf c :: acc) pred stream' else (implode $ rev acc, stream) end fun skipDigitSeq off stream = let val (res, stream) = getLongestSeq [] isDigit stream in if res = "" then raise TkError (TkiDx off, "expected digit") else (String.size res, stream) end fun getSuffixCommon buf off stream = let val (sfx, stream) = getLongestSeq [] Char.isAlpha stream val sfx = String.map (fn c => Char.toLower c) sfx val sfx = case List.find (fn (_, repr) => repr = sfx) buf of NONE => raise TkError (TkiDx off, "unknown suffix") | SOME (sfx, _) => sfx in (sfx, stream) end val getIntSuffix = getSuffixCommon intSuffixRepr 0 fun getFloatSuffix off = getSuffixCommon floatSuffixRepr off (* * It seems that there is an ambiguity in C: * gcc/clang/cparser consider 0xe-3 as a fp constant and reject it. * Right now we do not, but it may be reconsidered. *) fun numParser NpInit (stream, _) (SOME c) = if c = #"0" then let val (c, stream') = Stream.getchar stream in case c of NONE => (NpInit, SOME $ Num $ IntConst (ItOct, "", IsNone), stream') | SOME c => if Char.toLower c = #"x" then (IntMode ImHex, NONE, stream') else (IntMode ImOct, NONE, stream) end else (IntMode ImDec, NONE, stream) | numParser NpInit _ NONE = raise Unreachable | numParser (IntMode mode) (stream, startOff) c = let val (pred, res, offset) = case mode of ImDec => (isDigit, ItDec, 0) | ImOct => (isOctal, ItOct, 1) | ImInvalidOct => (isDigit, ItOct, 1) | ImHex => (isHexDigit, ItHex, 2) fun checkAndRaise m msg = if mode = m then raise TkError (TkiStart, msg) else () fun finish () = let val () = checkAndRaise ImInvalidOct "invalid octal constant" val stream = Stream.ungetc stream val str = finishSeqRead (startOff + offset) stream val (sfx, stream) = getIntSuffix stream in (IntMode mode, SOME $ Num $ IntConst (res, str, sfx), stream) end in case c of NONE => finish () | SOME c => if pred c then (IntMode mode, NONE, stream) else if c = #"." then (FloatMode FmDot, NONE, stream) else if Char.toLower c = #"e" then (checkAndRaise ImHex "floating constant can not come with 0x prefix"; (FloatMode FmExp, NONE, stream)) else finish () end | numParser (FloatMode FmDot) (stream, startOff) _ = let val (len, stream) = skipDigitSeq 0 $ Stream.ungetc stream val (c, stream') = Stream.getchar stream fun finish () = let val str = finishSeqRead startOff stream val (sfx, stream) = getFloatSuffix len stream in (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) end in case c of NONE => finish () | SOME c => if Char.toLower c = #"e" then (FloatMode FmExp, NONE, stream') else finish () end | numParser (FloatMode FmExp) (stream, startOff) c = let val (off, stream) = if c = NONE then raise TkError (TkiDx 0, "expected digit") else if valOf c <> #"+" andalso valOf c <> #"-" then (0, Stream.ungetc stream) else (~1, stream) val (len, stream) = skipDigitSeq off stream val str = finishSeqRead startOff stream val (sfx, stream) = getFloatSuffix len stream in (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) end fun chrIntVal c = if isDigit c then ord c - ord #"0" else if ord c >= ord #"a" andalso ord c <= ord #"z" then ord c - ord #"a" + 10 else if ord c >= ord #"A" andalso ord c <= ord #"Z" then ord c - ord #"A" + 10 else raise Unreachable fun parseOctalSeq stream c = let fun follow stream acc count = if count = 3 then (SOME $ chr acc, stream) else let val (c, stream) = Stream.getchar stream in case c of NONE => (SOME $ chr acc, stream) | SOME c => if isOctal c then follow stream (acc * 8 + chrIntVal c) (count + 1) else (SOME $ chr acc, Stream.ungetc stream) end in follow stream (chrIntVal c) 1 end fun parseHexSeq stream = let fun follow stream acc count = let val (c, stream) = Stream.getchar stream val noHex = TkError (TkiDx 0, "\\x without hex digits") in case c of NONE => if count = 0 then raise noHex else (SOME $ chr acc, stream) | SOME c => if isHexDigit c then if count = 2 then raise TkError (TkiDx 2, "hex sequence out of range") else follow stream (acc * 16 + chrIntVal c) (count + 1) else if count = 0 then raise noHex else (SOME $ chr acc, Stream.ungetc stream) end in follow stream 0 0 end fun eatEscSeq stream = let fun raiseErr0 msg = raise TkError (TkiDx 0, msg) val (c, stream) = Stream.getchar stream val c = case c of NONE => raiseErr0 "unfinished escape sequence" | SOME c => c fun & c = (SOME c, stream) in case c of #"'" => & #"'" | #"\"" => & #"\"" | #"?" => & #"?" | #"\\" => & #"\\" | #"a" => & #"\a" | #"b" => & #"\b" | #"f" => & #"\f" | #"n" => & #"\n" | #"r" => & #"\r" | #"t" => & #"\t" | #"v" => & #"\v" | #"\n" => (NONE, stream) | #"x" => parseHexSeq stream | c => if isOctal c then parseOctalSeq stream c else raiseErr0 "unknown escape sequence" end fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2) datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm datatype seqParseMode = SpmChr | SpmStr fun seqBound SpmChr = #"'" | seqBound SpmStr = #"\"" fun seqExnConv mode (TkError (v, msg)) = let val bound = if mode = SpmChr then "'" else "\"" val msg = String.translate (fn c => if c = #"%" then bound else str c) msg in TkError (v, msg) end | seqExnConv _ _ = raise Unreachable fun unfinishedSeq SpmChr = "unfinished character constant" | unfinishedSeq SpmStr = "unfinished string literal" fun seqParser mode SeqInit (stream, _) (SOME c) = if seqBound mode = c then (SeqStart, NONE, stream) else raise Unreachable | seqParser mode SeqStart (stream, _) (SOME c) = if c <> seqBound mode then let val (c, stream) = if c <> #"\\" then (SOME c, stream) else eatEscSeq stream in if c = NONE then (SeqStart, NONE, stream) else (SeqValue (ord $ Option.valOf c), NONE, stream) end else if mode = SpmStr then (SeqTerm, SOME $ StringConst "", stream) else raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %") | seqParser mode (SeqValue v) (stream, startOff) (SOME c) = if seqBound mode = c then let fun term s v = if mode = SpmChr then CharConst (s, v) else StringConst $ stringCut s in (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream) end else if mode = SpmStr then let val (_, stream) = if c <> #"\\" then (SOME c, stream) else eatEscSeq stream in (SeqValue v, NONE, stream) end else raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value") | seqParser _ SeqTerm _ (SOME _) = raise Unreachable | seqParser mode state (_, _) NONE = raise case state of SeqInit => Unreachable | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) | SeqValue _ => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) | SeqTerm => Unreachable val charParser = seqParser SpmChr val strParser = seqParser SpmStr fun formCppDir (Id s) = let open String in case List.find (fn (_, repr) => sub (repr, 0) = cppPrefix andalso extract (repr, 1, NONE) = s) tokenRepr of SOME (tk, _) => tk | NONE => raise ExpectedCppDir end | formCppDir kwElse = CppElse | formCppDir _ = raise ExpectedCppDir fun handleCppDir tk prevPos stream = let val (pos, tk') = tk in (prevPos, formCppDir tk') handle ExpectedCppDir => raise TkErrorAug (Stream.pos2ppos pos stream, "expected preprocessor directive") end fun unexpectedCharRaise stream c = let open Stream val pos = pos2ppos (getPosAfterCharRead stream) stream val repr = if isPrintable c then str c else "<" ^ Int.toString (ord c) ^ ">" in raise TkErrorAug (pos, "unexpected character " ^ repr) end fun skipComment stream pos = let fun skip prevIsAsterisk stream = let val (c, stream) = case Stream.getchar stream of (NONE, _) => let val pos = Stream.pos2ppos pos stream in raise TkErrorAug (pos, "unfinished comment") end | (SOME c, stream) => (c, stream) in if prevIsAsterisk andalso c = #"/" then stream else skip (c = #"*") stream end in skip false stream end fun handleBackslash stream = let val (c, stream) = Stream.getchar stream val raiseErr = fn () => let val pos = Stream.getPosAfterCharRead stream val pos = Stream.pos2ppos pos stream in raise TkErrorAug (pos, "expected \\n after backslash") end in case c of SOME c => if c = #"\n" then stream else raiseErr () | NONE => raiseErr () end fun processSymbol stream = let val (T as (p, tk), stream) = fsmEat $ Stream.ungetc stream in case tk of CommentStart => getToken $ skipComment stream p | DoubleDot => (SOME (p, Dot), Stream.ungetc stream) | Hash => if Stream.isFirstOnLine p stream then let val (tk, stream) = getToken stream in case tk of NONE => raise TkErrorAug (Stream.pos2ppos p stream, "unfinished preprecessor directive") | SOME tk => (SOME $ handleCppDir tk p stream, stream) end else (SOME T, stream) | _ => (SOME T, stream) end and getToken stream = let val (c, stream) = Stream.getchar stream fun @-> parser acc = (fn (tk, s) => (SOME tk, s)) $ parseGeneric stream parser acc in case c of NONE => (NONE, stream) | SOME c => if c = #"\n" then (SOME (Stream.getPosAfterCharRead stream, NewLine), stream) else if Char.isSpace c then getToken stream else if isIdStart c then @-> idParser () else if isDigit c then @-> numParser NpInit else if c = #"'" then @-> charParser SeqInit else if c = #"\"" then @-> strParser SeqInit else if isStartForFsm c then processSymbol stream else if c = #"\\" then getToken $ handleBackslash stream else unexpectedCharRaise stream c end fun tokenize stream = let fun aux acc stream = let val (tk, stream) = getToken stream in case tk of NONE => rev acc | SOME tk => aux (tk :: acc) stream end in aux [] stream end fun printTokens tkl fname = let fun print' line _ ((_, NewLine) :: tks) = print' (line + 1) true tks | print' line firstOnLine ((_, tk) :: tks) = ( if firstOnLine then ( print "\n"; printLn $ fname ^ ":" ^ Int.toString line; print "\t") else (); printToken tk; print " "; print' line false tks ) | print' _ _ [] = () in print' 1 true tkl; print "\n" end end