fun $ (x, y) = x y infixr 0 $ fun printLn s = (print s; print "\n") datatype includeArg = IARel of string | IAFromRef of string datatype token = Invalid | Number of string | Id of string | CharConst of string * int | kwBreak | kwCase | kwChar | kwConst | kwContinue | kwDefault | kwDouble | kwElse | kwEnum | kwExtern | kwFloat | kwFor | kwGoto | kwInt | kwLong | kwRegister | kwReturn | kwShort | kwSigned | kwSizeof | kwStruct | kwSwitch | kwTypedef | kwUnion | kwUnsigned | kwVoid | kwVolatile | OParen | CParen | Plus | DoublePlus| Minus | DoubleMinus | Semicolon | EqualSign | DoubleEqualSign | ExclMark | ExclMarkEqualSign | QuestionMark | Colon | Hash | DoubleHash | CppInclude of includeArg | CppDefine val kwPrefix = #"@" val cppPrefix = #"$" type fileId = int type fileOffset = int type pos = fileId * fileOffset type convPos = string * int * int option type fullToken = pos * token exception TkError of int * string exception TkErrorAug of convPos * string (* main exception *) exception ExpectedCppDir (* handled in postprocess *) (* handled in tokenizer *) exception EndOfStream exception EndOfFile (* Unreachable (should be) *) exception Unreachable exception FsmTableIsTooSmall exception InvalidStream exception InvalidStreamAdvance exception TokenWithoutRepr (* handled in readIncludeArg *) exception LineWithoutNl fun pos2str (pos, line, col) = let val % = Int.toString in case col of SOME col => pos ^ ":" ^ %line ^ ":" ^ %col | NONE => pos ^ ":" ^ %line end fun eprint s = printLn $ "error: " ^ s fun otherExn e = let val hist = MLton.Exn.history e in eprint $ "exception " ^ exnMessage e ^ " was raised"; if hist = [] then (printLn "No stack trace is avaliable"; printLn "Recompile with -const \"Exn.keepHistory true\"") else List.app (fn x => printLn $ "\t" ^ x) hist end fun exit code = Posix.Process.exit $ Word8.fromInt code fun ioExn (IO.Io { name, function = _, cause }) = let open OS val prefix = name ^ ": " val reason = case cause of SysErr (str, _) => str | _ => exnMessage cause in printLn $ prefix ^ reason end | ioExn _ = (printLn "ioExn: unreachable"; exit 254) fun globalExnHandler e = (case e of FsmTableIsTooSmall => eprint "fsm table is too small. Increate 'maxState' value" | IO.Io _ => ioExn e | TkErrorAug (pos, msg) => eprint $ pos2str pos ^ ": " ^ msg | _ => otherExn e; exit 255) val () = MLton.Exn.setTopLevelHandler globalExnHandler val tokenRepr = let fun & repr = str kwPrefix ^ repr fun % repr = str cppPrefix ^ repr in [ (OParen, "("), (CParen, ")"), (kwBreak, &"break"), (kwCase, &"case"), (kwChar, &"char"), (kwConst, &"const"), (kwContinue, &"continue"), (kwDefault, &"default"), (kwDouble, &"double"), (kwElse, &"else"), (kwEnum, &"enum"), (kwExtern, &"extern"), (kwFloat, &"float"), (kwFor, &"for"), (kwGoto, &"goto"), (kwInt, &"int"), (kwLong, &"long"), (kwRegister, &"register"), (kwReturn, &"return"), (kwShort, &"short"), (kwSigned, &"signed"), (kwSizeof, &"sizeof"), (kwStruct, &"struct"), (kwSwitch, &"switch"), (kwTypedef, &"typedef"), (kwUnion, &"union"), (kwUnsigned, &"unsigned"), (kwVoid, &"void"), (kwVolatile, &"volatile"), (Plus, "+"), (DoublePlus, "++"), (Minus, "-"), (DoubleMinus, "--"), (Semicolon, ";"), (EqualSign, "="), (DoubleEqualSign, "=="), (ExclMark, "!"), (ExclMarkEqualSign, "!="), (QuestionMark, "?"), (Colon, ":"), (Hash, "#"), (DoubleHash, "##"), (CppDefine, % "define") ] end val printToken = fn Number s => printLn $ "Num: " ^ s | Id s => printLn $ "Id: " ^ s | CharConst (repr, _) => printLn repr | CppInclude arg => let val (start, end', arg) = case arg of IARel v => ("\"", "\"", v) | IAFromRef v => ("<", ">", v) in printLn $ (str cppPrefix) ^ "include " ^ start ^ arg ^ end' end | v => case List.find (fn (x, _) => x = v) tokenRepr of SOME (_, repr) => printLn repr | NONE => raise TokenWithoutRepr type stream = { (* list of file ids, file names and file contents *) allFiles: (fileId * string * string) list, (* stack of file ids, file offsets and file contents *) stack: (fileId * fileOffset * string) list } fun calcFilePos s offset = let fun calc s cur offset (line, col) = if cur = offset then (line, col) else calc s (cur + 1) offset (if String.sub (s, cur) = #"\n" then (line + 1, 1) else (line, col + 1)) in calc s 0 offset (1, 1) end fun printPos fileList (id, pos) = let val triple = List.find (fn (fid, _, _) => fid = id) fileList in case triple of NONE => raise InvalidStream | SOME (_, fname, contents) => let val (line, col) = calcFilePos contents pos val line = Int.toString line val col = Int.toString col in print $ fname ^ ":" ^ line ^ ":" ^ col ^ ": " end end fun readFile fname = let open TextIO val h = openIn fname val s = inputAll h val () = closeIn h in s end fun getchar ({ stack = (id, off, contents) :: rest, allFiles }: stream) : (char * stream) = if off < String.size contents then (String.sub (contents, off), { stack = (id, off + 1, contents) :: rest, allFiles }) else raise EndOfFile | getchar _ = raise EndOfStream fun ungetc ({ stack = (id, off, contents) :: rest, allFiles }) = if off = 0 then raise InvalidStream else { stack = (id, off - 1, contents) :: rest, allFiles } | ungetc _ = raise InvalidStream fun readline { stack = (fid, off, contents) :: rest, allFiles } = let open String fun read offset = if offset = size contents then raise LineWithoutNl else if sub (contents, offset) = #"\n" then (String.extract (contents, off, SOME $ offset - off), offset + 1) else read (offset + 1) val (arg, newOffset) = read off in (arg, { stack = (fid, newOffset, contents) :: rest, allFiles }) end | readline _ = raise InvalidStream fun getOffset ({ stack = (_, off, _) :: _, ... }: stream) = off | getOffset _ = raise InvalidStream fun getPosAfterCharRead ({ stack = (id, off, _) :: _, ... }: stream) = (id, off - 1) | getPosAfterCharRead _ = raise InvalidStream fun getPposFromPos (id, pos) { stack = (_, _, _) :: _, allFiles} = let val (fname, contents) = case List.find (fn (fid, _, _) => fid = id) allFiles of NONE => raise InvalidStream | SOME (_, fname, contents) => (fname, contents) val (line, col) = calcFilePos contents pos in (fname, line, SOME col) end | getPposFromPos _ _ = raise InvalidStream fun getPos ({ stack = (id, off, _) :: _, ... }: stream) = (id, off) | getPos _ = raise InvalidStream fun getSubstr startOff endOff ({ stack = (_, _, contents) :: _, ... }: stream) = String.substring (contents, startOff, endOff - startOff) | getSubstr _ _ _ = raise InvalidStream fun advanceToNewFile ({ stack = (_, off, contents) :: rest, allFiles }: stream) = if off = String.size contents then { stack = rest, allFiles } else raise InvalidStreamAdvance | advanceToNewFile _ = raise InvalidStreamAdvance fun streamInit fname = let val contents = readFile fname in { allFiles = [(0, fname, contents)], stack = [(0, 0, contents)] } end fun isSpace c = c = #" " orelse c = #"\t" orelse c = #"\n" fun isLetter c = ord c >= ord #"a" andalso ord c <= ord #"z" orelse ord c >= ord #"A" andalso ord c <= ord #"Z" fun isIdStart c = c = #"_" orelse isLetter c fun isDigit c = ord c >= ord #"0" andalso ord c <= ord #"9" fun isIdBody c = isIdStart c orelse isDigit c fun isPrintable c = Char.isPrint c andalso c <> #" " (* FSM for parsing symbols *) val maxStates = 40 fun fsmInsert (nextState, buf) curState tk (c :: cs) = let open Array val (_, row) = sub (buf, curState) val potNextState = sub (row, ord c) in if potNextState <> ~1 then fsmInsert (nextState, buf) potNextState tk cs else ( update (row, ord c, !nextState); nextState := !nextState + 1; fsmInsert (nextState, buf) (!nextState - 1) tk cs ) end | fsmInsert (_, buf) curState tk [] = let open Array val (_, row) = sub (buf, curState) in update (buf, curState, (tk, row)) end fun isStartForFsmGen () = let open Array val lookupTable = array (128, false) fun firstChr s = ord $ String.sub (s, 0) val () = List.app (fn (_, repr) => update (lookupTable, firstChr repr, true)) $ List.filter (fn (_, repr) => let val c = String.sub (repr, 0) in c <> kwPrefix andalso c <> cppPrefix end) tokenRepr in fn c => sub (lookupTable, ord c) end val isStartForFsm = isStartForFsmGen () fun fsmTableCreate () = let open Array val T as (nextState, buf) = (ref 1, array (maxStates, (Invalid, array (128, ~1)))) val r = ref 1 fun filterNeeded [] acc = acc | filterNeeded ((T as (_, repr)) :: tks) acc = filterNeeded tks (if isStartForFsm $ String.sub (repr, 0) then T :: acc else acc) val tokenRepr = filterNeeded tokenRepr [] fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle Subscript => raise FsmTableIsTooSmall in while !r < length buf do ( update (buf, !r, (Invalid, array(128, ~1))); r := !r + 1 ); List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; if !nextState <> maxStates then printLn $ "note: Fsm table size can be smaller: " ^ Int.toString (!nextState) ^ " is enough" else (); T end (* Unused right now fun printTable (nextState, buf) = let fun printRow i row = if i = length row then print "\n" else let val state = sub (row, i) in if state = ~1 then () else print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", "); printRow (i + 1) row end fun print' rowNum buf = if rowNum = !nextState then () else let val (tk, row) = sub (buf, rowNum) in print ((token2string tk) ^ ": "); printRow 0 row; print' (rowNum + 1) buf end in print ("NextState: " ^ Int.toString (!nextState) ^ "\n"); print' 0 buf; print "\n" end *) val fsmTable = fsmTableCreate () fun fsmEat stream = let open Array val stream = ungetc stream val pos = getPos stream fun get curState stream = let val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle _ => (NONE, stream) in case c of NONE => (#1 $ sub (#2 fsmTable, curState), stream) | SOME c => let val (tk, row) = sub (#2 fsmTable, curState) val nextState = sub (row, ord c) in if nextState = ~1 then (tk, ungetc stream) else get nextState stream end end in (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream end fun isFirstOnLine (tk: fullToken) (stream: stream) = let val (id, offset) = #1 tk in case List.find (fn (fid, _, _) => id = fid) (#allFiles stream) of NONE => raise InvalidStream | SOME (_, _, contents) => let fun returnToNL ~1 = true | returnToNL offset = let val chr = String.sub (contents, offset) in if chr = #"\n" then true else if isSpace chr then returnToNL (offset - 1) else false end in returnToNL (offset - 1) end end fun tkError2aug stream (TkError (dx, msg)) = let val (id, pos) = getPosAfterCharRead stream val pos = getPposFromPos (id, pos + dx) stream in TkErrorAug (pos, msg) end | tkError2aug _ _ = raise Unreachable fun parseGeneric stream parser acc = let val stream = ungetc stream val P as (_, startOff) = getPos stream fun parse' stream acc = let val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle _ => (NONE, stream) val (acc, tk, stream) = parser acc (stream, startOff) c handle e as TkError _ => raise tkError2aug stream e in case tk of NONE => parse' stream acc | _ => (valOf tk, stream) end val (tk, stream) = parse' stream acc in ((P, tk): fullToken, stream) end fun finishSeqRead startOff stream = let val (_, endOff) = getPos stream val s = getSubstr startOff endOff stream in s end fun findKeyword s = case List.find (fn (_, repr) => String.sub (repr, 0) = kwPrefix andalso String.extract (repr, 1, NONE) = s) tokenRepr of SOME (tk, _) => tk | NONE => Id s fun idParser () (stream, startOff) c = let fun finalize stream = let val s = finishSeqRead startOff stream val tk = findKeyword s in ((), SOME tk, stream) end in case c of NONE => finalize stream | SOME c => if isIdBody c then ((), NONE, stream) else finalize (ungetc stream) end fun formCppDir (Id s) = let open String in case List.find (fn (_, repr) => sub (repr, 0) = cppPrefix andalso extract (repr, 1, NONE) = s) tokenRepr of SOME (tk, _) => tk | NONE => raise ExpectedCppDir end | formCppDir _ = raise ExpectedCppDir fun numParser () (stream, startOff) c = let fun finalize stream = ((), SOME $ Number (finishSeqRead startOff stream), stream) in case c of NONE => finalize stream | SOME c => if isDigit c then ((), NONE, stream) else finalize (ungetc stream) end fun eatEscSeq stream = let val (c, stream) = getchar stream handle _ => raise TkError (0, "unfinished escape sequence") in (case c of #"\\" => #"\\" | #"t" => #"\t" | #"n" => #"\n" | c => c, stream) end datatype CharParseState = ChInit | ChStart | ChValue of int | ChTerm val chExnExpValue = TkError (0, "expected value after '") val chExnExpTerm = TkError (0, "expected ' after value") val chExnStart = Unreachable val chExnAfterTerm = Unreachable fun charParser ChInit (stream, _) (SOME c) = if c = #"'" then (ChStart, NONE, stream) else raise chExnStart | charParser ChStart (stream, _) (SOME c) = if c <> #"'" then let val (c, stream) = if c <> #"\\" then (c, stream) else eatEscSeq stream in (ChValue $ ord c, NONE, stream) end else raise chExnExpValue | charParser (ChValue v) (stream, startOff) (SOME c) = if c = #"'" then (ChTerm, SOME $ CharConst (finishSeqRead startOff stream, v), stream) else raise chExnExpTerm | charParser ChTerm _ (SOME _) = raise chExnAfterTerm | charParser state (_, _) NONE = raise case state of ChInit => chExnStart | ChStart => chExnExpValue | ChValue _ => chExnExpTerm | ChTerm => chExnAfterTerm fun readIncludeArg stream = let open String fun triml s idx = if idx = size s then "" else if isSpace $ sub (s, idx) then triml s (idx + 1) else extract (s, idx, NONE) fun trimr s idx = if idx = 0 then "" else if isSpace $ sub (s, idx) then trimr s (idx - 1) else extract (s, 0, SOME $ idx + 1) fun trim s = triml (trimr s (size s - 1)) 0 fun getLinePos () = let val (fname, line, _) = getPposFromPos (getPos stream) stream in (fname, line, NONE) end fun determineType s = let fun --> msg = raise TkErrorAug (getLinePos (), msg) fun isLast c = sub (s, size s - 1) = c fun cut () = extract (s, 1, SOME $ size s - 2) in if s = "" then --> "#include argument is empty" else case sub (s, 0) of #"<" => if isLast #">" then IAFromRef $ cut () else --> "expected > at #include argument end" | #"\"" => if isLast #"\"" then IARel $ cut () else --> "expected \" at #include argument end" | _ => --> "#include argument should start with \" or <" end val (arg, stream) = readline stream handle LineWithoutNl => raise TkErrorAug (getLinePos (), "#include line does not end with \\n") in (determineType $ trim arg, stream) end fun postprocessCppDir tk tkl stream = let val isCppDir = (fn Hash => true | _ => false) (#2 $ hd tkl) handle Empty => false val (pos, tk') = tk fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl) in if isCppDir andalso tk' = Id "include" then let val (arg, stream) = readIncludeArg stream in (conv $ CppInclude arg, stream) end else if isCppDir then (conv $ formCppDir tk', stream) handle ExpectedCppDir => raise TkErrorAug (getPposFromPos pos stream, "expected preprocessor directive") else (tk :: tkl, stream) end fun unexpectedCharRaise stream c = let val (id, pos) = getPosAfterCharRead stream val pos = getPposFromPos (id, pos) stream val repr = if isPrintable c then str c else "<" ^ Int.toString (ord c) ^ ">" in raise TkErrorAug (pos, "unexpected character " ^ repr) end fun tokenize stream tkl = let fun getcharSkipEof stream = getchar stream handle EndOfFile => getcharSkipEof (advanceToNewFile stream) val (c, stream) = (fn (c, s) => (SOME c, s)) $ getcharSkipEof stream handle EndOfStream => (NONE, stream) fun cont (tk, stream) = tokenize stream (tk :: tkl) fun @-> parser acc = cont $ parseGeneric stream parser acc in case c of NONE => (rev tkl, #allFiles stream) | SOME c => if isSpace c then tokenize stream tkl else if isIdStart c then let val (tk, stream) = parseGeneric stream idParser () val (tkl, stream) = postprocessCppDir tk tkl stream in tokenize stream tkl end else if isDigit c then @-> numParser () else if c = #"'" then @-> charParser ChInit else if isStartForFsm c then cont $ fsmEat stream else unexpectedCharRaise stream c end fun main [fname] = let val stream = streamInit fname val (tkl, fileList) = tokenize stream [] in List.app (fn (p, x) => (printPos fileList p; printToken x)) tkl end | main _ = printLn "Expected a signle argument: file name" val () = main $ CommandLine.arguments ()