diff options
author | Vladimir Azarov <avm@intermediate-node.net> | 2025-03-24 21:51:16 +0100 |
---|---|---|
committer | Vladimir Azarov <avm@intermediate-node.net> | 2025-03-24 21:51:16 +0100 |
commit | 87217fe5ba58f5199d30586b5d9bec104dece445 (patch) | |
tree | 00a43d11ebdfbb65750e80758ce1925e4c6a1a3e | |
parent | e40727b58e357f123256557af50666aa42c2caa4 (diff) |
Partition into structures
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | cpp.mlb | 8 | ||||
-rw-r--r-- | cpp.sml | 1023 | ||||
-rw-r--r-- | exn_handler.sml | 46 | ||||
-rw-r--r-- | general.sml | 26 | ||||
-rw-r--r-- | stream.sig | 36 | ||||
-rw-r--r-- | stream.sml | 185 | ||||
-rw-r--r-- | tokenizer.sig | 12 | ||||
-rw-r--r-- | tokenizer.sml | 787 |
9 files changed, 1105 insertions, 1019 deletions
@@ -2,3 +2,4 @@ test* cpp todo* mlmon.out* +*dot @@ -4,6 +4,14 @@ ann in $(SML_LIB)/basis/basis.mlb $(SML_LIB)/basis/mlton.mlb + general.sml + stream.sig + stream.sml + + tokenizer.sig + tokenizer.sml + + exn_handler.sml cpp.sml end @@ -1,1025 +1,10 @@ -fun $ (x, y) = x y -infixr 0 $ - -fun printLn s = (print s; print "\n") - -datatype includeArg = IARel of string | IAFromRef of string - -datatype token = - Invalid | - Number of string | - Id of string | - CharConst of string * int | - StringConst of string | - - kwBreak | - kwCase | - kwChar | - kwConst | - kwContinue | - kwDefault | - kwDouble | - kwElse | - kwEnum | - kwExtern | - kwFloat | - kwFor | - kwGoto | - kwInt | - kwLong | - kwRegister | - kwReturn | - kwShort | - kwSigned | - kwSizeof | - kwStruct | - kwSwitch | - kwTypedef | - kwUnion | - kwUnsigned | - kwVoid | - kwVolatile | - - LParen | - RParen | - LBracket | - RBracket | - LBrace | - RBrace | - - QuestionMark | - Colon | - Coma | - Semicolon | - - Arrow | - Plus | - DoublePlus| - Minus | - DoubleMinus | - Ampersand | - Asterisk | - Slash | - Tilde | - ExclMark | - Percent | - DoubleGreater | - DoubleLess | - Greater | - Less | - EqualSign | - LessEqualSign | - GreaterEqualSign | - DoubleEqualSign | - ExclMarkEqualSign | - Cap | - VerticalBar | - DoubleAmpersand | - DoubleVerticalBar | - - AsteriskEqualSign | - SlashEqualSign | - PercentEqualSign | - PlusEqualSign | - MinusEqualSign | - DoubleLessEqualSign | - DoubleGreaterEqualSign | - AmpersandEqualSign | - CapEqualSign | - VerticalBarEqualSign | - - Hash | - DoubleHash | - - Dot | - DoubleDot | - TripleDot | - - CommentStart | - - CppInclude of includeArg | - CppDefine | - CppUndef | - CppIf | - CppIfdef | - CppIfndef | - CppElse | - CppElif | - CppEndif | - CppPragma - -val kwPrefix = #"@" -val cppPrefix = #"$" - -type fileId = int -type fileOffset = int -type pos = fileId * fileOffset -type convPos = string * int * int option -type fullToken = pos * token - -datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart - -exception TkError of tkErrorAuxInfo * string -exception TkErrorAug of convPos * string (* main exception *) - -exception ExpectedCppDir (* handled in postprocess *) - -(* handled in tokenizer *) -exception EndOfStream -exception EndOfFile - -(* Unreachable (should be) *) -exception Unreachable -exception FsmTableIsTooSmall -exception InvalidStream -exception InvalidStreamAdvance -exception TokenWithoutRepr - -(* handled in readIncludeArg *) -exception LineWithoutNl - -fun pos2str (pos, line, col) = -let - val % = Int.toString -in - case col of - SOME col => pos ^ ":" ^ %line ^ ":" ^ %col - | NONE => pos ^ ":" ^ %line -end -fun eprint s = printLn $ "error: " ^ s - -fun otherExn e = -let - val hist = MLton.Exn.history e -in - eprint $ "exception " ^ exnMessage e ^ " was raised"; - if hist = [] then - (printLn "No stack trace is avaliable"; - printLn "Recompile with -const \"Exn.keepHistory true\"") - else - List.app (fn x => printLn $ "\t" ^ x) hist -end - -fun exit code = Posix.Process.exit $ Word8.fromInt code - -fun ioExn (IO.Io { name, function = _, cause }) = -let - open OS - val prefix = name ^ ": " - val reason = - case cause of - SysErr (str, _) => str - | _ => exnMessage cause -in - printLn $ prefix ^ reason -end - | ioExn _ = (printLn "ioExn: unreachable"; exit 254) - -fun globalExnHandler e = - (case e of - FsmTableIsTooSmall => - eprint "fsm table is too small. Increate 'maxState' value" - | IO.Io _ => ioExn e - | TkErrorAug (pos, msg) => eprint $ pos2str pos ^ ": " ^ msg - | _ => otherExn e; - exit 255) - -val () = MLton.Exn.setTopLevelHandler globalExnHandler - -val tokenRepr = -let - fun & repr = str kwPrefix ^ repr - fun % repr = str cppPrefix ^ repr -in - [ - (kwBreak, &"break"), - (kwCase, &"case"), - (kwChar, &"char"), - (kwConst, &"const"), - (kwContinue, &"continue"), - (kwDefault, &"default"), - (kwDouble, &"double"), - (kwElse, &"else"), - (kwEnum, &"enum"), - (kwExtern, &"extern"), - (kwFloat, &"float"), - (kwFor, &"for"), - (kwGoto, &"goto"), - (kwInt, &"int"), - (kwLong, &"long"), - (kwRegister, &"register"), - (kwReturn, &"return"), - (kwShort, &"short"), - (kwSigned, &"signed"), - (kwSizeof, &"sizeof"), - (kwStruct, &"struct"), - (kwSwitch, &"switch"), - (kwTypedef, &"typedef"), - (kwUnion, &"union"), - (kwUnsigned, &"unsigned"), - (kwVoid, &"void"), - (kwVolatile, &"volatile"), - - (LParen, "("), - (RParen, ")"), - (LBracket, "["), - (RBracket, "]"), - (LBrace, "{"), - (RBrace, "}"), - - (QuestionMark, "?"), - (Colon, ":"), - (Coma, ","), - (Semicolon, ";"), - - (Arrow, "->"), - (Plus, "+"), - (DoublePlus, "++"), - (Minus, "-"), - (DoubleMinus, "--"), - (Ampersand, "&"), - (Asterisk, "*"), - (Slash, "/"), - (Tilde, "~"), - (ExclMark, "!"), - (Percent, "%"), - (DoubleLess, "<<"), - (DoubleGreater, ">>"), - (Less, "<"), - (Greater, ">"), - (EqualSign, "="), - (LessEqualSign, "<="), - (GreaterEqualSign, ">="), - (DoubleEqualSign, "=="), - (ExclMarkEqualSign, "!="), - (Cap, "^"), - (VerticalBar, "|"), - (DoubleAmpersand, "&&"), - (DoubleVerticalBar, "||"), - - (AsteriskEqualSign, "*="), - (SlashEqualSign, "/="), - (PercentEqualSign, "%="), - (PlusEqualSign, "+="), - (MinusEqualSign, "-="), - (DoubleLessEqualSign, "<<="), - (DoubleGreaterEqualSign, ">>="), - (AmpersandEqualSign, "&="), - (CapEqualSign, "^="), - (VerticalBarEqualSign, "|="), - - (Hash, "#"), - (DoubleHash, "##"), - - (Dot, "."), - (DoubleDot, ".."), - (TripleDot, "..."), - - (CommentStart, "/*"), - - (CppDefine, %"define"), - (CppUndef, %"undef"), - (CppIf, %"if"), - (CppIfdef, %"ifdef"), - (CppIfndef, %"ifndef"), - (CppElse, %"else"), - (CppElif, %"elif"), - (CppEndif, %"endif"), - (CppPragma, %"pragma") -] -end - -val printToken = fn - Number s => printLn $ "Num: " ^ s - | Id s => printLn $ "Id: " ^ s - | CharConst (repr, _) => printLn repr - | CppInclude arg => - let - val (start, end', arg) = - case arg of - IARel v => ("\"", "\"", v) - | IAFromRef v => ("<", ">", v) - in - printLn $ (str cppPrefix) ^ "include " ^ start ^ arg ^ end' - end - | StringConst s => - printLn $ "\"" ^ s ^ "\"" - | v => - case List.find (fn (x, _) => x = v) tokenRepr of - SOME (_, repr) => printLn repr - | NONE => raise TokenWithoutRepr - -type stream = { - (* list of file ids, file names and file contents *) - allFiles: (fileId * string * string) list, - (* stack of file ids, file offsets and file contents *) - stack: (fileId * fileOffset * string) list -} - -fun calcFilePos s offset = -let - fun calc s cur offset (line, col) = - if cur = offset then - (line, col) - else - calc s (cur + 1) offset - (if String.sub (s, cur) = #"\n" then (line + 1, 1) - else (line, col + 1)) -in - calc s 0 offset (1, 1) -end - -fun printPos fileList (id, pos) = -let - val triple = List.find (fn (fid, _, _) => fid = id) fileList -in - case triple of - NONE => raise InvalidStream - | SOME (_, fname, contents) => - let - val (line, col) = calcFilePos contents pos - val line = Int.toString line - val col = Int.toString col - in - print $ fname ^ ":" ^ line ^ ":" ^ col ^ ": " - end -end - -fun readFile fname = -let - open TextIO - val h = openIn fname - val s = inputAll h - val () = closeIn h -in - s -end - -fun getchar - ({ stack = (id, off, contents) :: rest, allFiles }: stream) - : (char * stream) = - if off < String.size contents then - (String.sub (contents, off), - { stack = (id, off + 1, contents) :: rest, allFiles }) - else - raise EndOfFile - | getchar _ = raise EndOfStream - -fun ungetc - ({ stack = (id, off, contents) :: rest, allFiles }) = - if off = 0 then - raise InvalidStream - else - { stack = (id, off - 1, contents) :: rest, allFiles } - | ungetc _ = raise InvalidStream - -fun readline { stack = (fid, off, contents) :: rest, allFiles } = - let - val prevIsSlash = - off > 0 andalso String.sub (contents, off - 1) = #"\\" - - open String - fun read prevIsSlash offset acc = - let - val c = sub (contents, offset) - in - if offset = size contents then - raise LineWithoutNl - else if c = #"\n" then - if prevIsSlash then - read (c = #"\\") (offset + 1) (#" " :: tl acc) - else - (implode $ rev acc, offset + 1) - else - read (c = #"\\") (offset + 1) (c :: acc) - end - - val (arg, newOffset) = read prevIsSlash off [] - in - (arg, { stack = (fid, newOffset, contents) :: rest, allFiles }) - end - | readline _ = raise InvalidStream - -fun getOffset ({ stack = (_, off, _) :: _, ... }: stream) = off - | getOffset _ = raise InvalidStream - -fun getPosAfterCharRead - ({ stack = (id, off, _) :: _, ... }: stream) = (id, off - 1) - | getPosAfterCharRead _ = raise InvalidStream - -fun getPposFromPos (id, pos) - { stack = (_, _, _) :: _, allFiles} = -let - val (fname, contents) = - case List.find (fn (fid, _, _) => fid = id) allFiles of - NONE => raise InvalidStream - | SOME (_, fname, contents) => (fname, contents) - - val (line, col) = calcFilePos contents pos -in - (fname, line, SOME col) -end - | getPposFromPos _ _ = raise InvalidStream - -fun getPos ({ stack = (id, off, _) :: _, ... }: stream) = (id, off) - | getPos _ = raise InvalidStream - -fun getSubstr startOff endOff - ({ stack = (_, _, contents) :: _, ... }: stream) = - String.substring (contents, startOff, endOff - startOff) - | getSubstr _ _ _ = raise InvalidStream - -fun advanceToNewFile - ({ stack = (_, off, contents) :: rest, allFiles }: stream) = - if off = String.size contents then - { stack = rest, allFiles } - else - raise InvalidStreamAdvance - | advanceToNewFile _ = raise InvalidStreamAdvance - -fun streamInit fname = -let - val contents = readFile fname -in - { allFiles = [(0, fname, contents)], stack = [(0, 0, contents)] } -end - -fun isSpace c = c = #" " orelse c = #"\t" orelse c = #"\n" -fun isLetter c = - ord c >= ord #"a" andalso ord c <= ord #"z" orelse - ord c >= ord #"A" andalso ord c <= ord #"Z" -fun isIdStart c = c = #"_" orelse isLetter c -fun isDigit c = ord c >= ord #"0" andalso ord c <= ord #"9" -fun isIdBody c = isIdStart c orelse isDigit c - -fun isPrintable c = Char.isPrint c andalso c <> #" " - -(* FSM for parsing symbols *) - -val maxStates = 51 - -fun fsmInsert (nextState, buf) curState tk (c :: cs) = -let - open Array - - val (_, row) = sub (buf, curState) - val potNextState = sub (row, ord c) -in - if potNextState <> ~1 then - fsmInsert (nextState, buf) potNextState tk cs - else ( - update (row, ord c, !nextState); - nextState := !nextState + 1; - fsmInsert (nextState, buf) (!nextState - 1) tk cs - ) -end - | fsmInsert (_, buf) curState tk [] = -let - open Array - val (_, row) = sub (buf, curState) -in - update (buf, curState, (tk, row)) -end - -fun isStartForFsmGen () = -let - open Array - - val lookupTable = array (128, false) - - fun firstChr s = ord $ String.sub (s, 0) - val () = List.app - (fn (_, repr) => update (lookupTable, firstChr repr, true)) - $ List.filter - (fn (_, repr) => - let - val c = String.sub (repr, 0) - in - c <> kwPrefix andalso c <> cppPrefix - end) - tokenRepr - -in - fn c => sub (lookupTable, ord c) -end - -val isStartForFsm = isStartForFsmGen () - -fun fsmTableCreate () = -let - open Array - - val T as (nextState, buf) = - (ref 1, array (maxStates, (Invalid, array (128, ~1)))) - val r = ref 1 - - fun filterNeeded [] acc = acc - | filterNeeded ((T as (_, repr)) :: tks) acc = - filterNeeded tks - (if isStartForFsm $ String.sub (repr, 0) then - T :: acc - else - acc) - - val tokenRepr = filterNeeded tokenRepr [] - - fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle - Subscript => raise FsmTableIsTooSmall -in - while !r < length buf do ( - update (buf, !r, (Invalid, array(128, ~1))); - r := !r + 1 - ); - - List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; - if !nextState <> maxStates then - printLn $ "note: Fsm table size can be smaller: " - ^ Int.toString (!nextState) ^ " is enough" - else (); - T -end - -(* Unused right now -fun printTable (nextState, buf) = -let - fun printRow i row = - if i = length row then - print "\n" - else - let - val state = sub (row, i) - in - if state = ~1 then - () - else - print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", "); - printRow (i + 1) row - end - - fun print' rowNum buf = - if rowNum = !nextState then - () - else - let - val (tk, row) = sub (buf, rowNum) - in - print ((token2string tk) ^ ": "); - printRow 0 row; - print' (rowNum + 1) buf - end -in - print ("NextState: " ^ Int.toString (!nextState) ^ "\n"); - print' 0 buf; - print "\n" -end -*) - -val fsmTable = fsmTableCreate () - -fun fsmEat stream = -let - open Array - val stream = ungetc stream - val pos = getPos stream - - fun get curState stream = - let - val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle - _ => (NONE, stream) - in - case c of - NONE => (#1 $ sub (#2 fsmTable, curState), stream) - | SOME c => - let - val (tk, row) = sub (#2 fsmTable, curState) - val nextState = sub (row, ord c) - in - if nextState = ~1 then - (tk, ungetc stream) - else - get nextState stream - end - end -in - (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream -end - -fun isFirstOnLine (tk: fullToken) (stream: stream) = -let - val (id, offset) = #1 tk -in - case List.find (fn (fid, _, _) => id = fid) (#allFiles stream) of - NONE => raise InvalidStream - | SOME (_, _, contents) => - let - fun returnToNL ~1 = true - | returnToNL offset = - let - val chr = String.sub (contents, offset) - in - if chr = #"\n" then - true - else if isSpace chr then - returnToNL (offset - 1) - else - false - end - in - returnToNL (offset - 1) - end -end - -fun tkError2aug stream (dx, msg) = -let - val (id, pos) = getPosAfterCharRead stream - val pos = getPposFromPos (id, pos + dx) stream -in - TkErrorAug (pos, msg) -end - -fun parseGeneric stream parser acc = -let - val stream = ungetc stream - val P as (_, startOff) = getPos stream - - fun parse' stream acc = let - val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle - _ => (NONE, stream) - - val (acc, tk, stream) = parser acc (stream, startOff) c handle - TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg) - | TkError (TkiStart, msg) => - let - val startPos = getPposFromPos P stream - in - raise TkErrorAug (startPos, msg) - end - | TkError (TkiEOF, msg) => - let - val (file, line, _) = getPposFromPos P stream - in - raise TkErrorAug ((file, line, NONE), msg) - end - in - case tk of - NONE => parse' stream acc - | _ => (valOf tk, stream) - end - - val (tk, stream) = parse' stream acc -in - ((P, tk): fullToken, stream) -end - -fun finishSeqRead startOff stream = -let - val (_, endOff) = getPos stream - val s = getSubstr startOff endOff stream -in - s -end - -fun findKeyword s = - case List.find - (fn (_, repr) => - String.sub (repr, 0) = kwPrefix andalso - String.extract (repr, 1, NONE) = s) - tokenRepr - of - SOME (tk, _) => tk - | NONE => Id s - -fun idParser () (stream, startOff) c = -let - fun finalize stream = - let - val s = finishSeqRead startOff stream - val tk = findKeyword s - in - ((), SOME tk, stream) - end -in - case c of - NONE => finalize stream - | SOME c => - if isIdBody c then - ((), NONE, stream) - else - finalize (ungetc stream) -end - -fun formCppDir (Id s) = -let - open String -in - case List.find - (fn (_, repr) => - sub (repr, 0) = cppPrefix andalso - extract (repr, 1, NONE) = s) - tokenRepr - of - SOME (tk, _) => tk - | NONE => raise ExpectedCppDir -end - | formCppDir kwElse = CppElse - | formCppDir _ = raise ExpectedCppDir - -fun numParser () (stream, startOff) c = -let - fun finalize stream = - ((), SOME $ Number (finishSeqRead startOff stream), stream) -in - case c of - NONE => finalize stream - | SOME c => - if isDigit c then - ((), NONE, stream) - else - finalize (ungetc stream) -end - -fun eatEscSeq stream = -let - val (c, stream) = getchar stream handle - _ => raise TkError (TkiDx 0, "unfinished escape sequence") -in - (case c of - #"\\" => #"\\" - | #"t" => #"\t" - | #"n" => #"\n" - | c => c, - stream) -end - -fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2) - -datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm - -datatype seqParseMode = SpmChr | SpmStr - -fun seqBound SpmChr = #"'" - | seqBound SpmStr = #"\"" - -fun seqExnConv mode (TkError (v, msg)) = -let - val bound = if mode = SpmChr then "'" else "\"" - val msg = - String.translate (fn c => if c = #"%" then bound else str c) msg -in - TkError (v, msg) -end - | seqExnConv _ _ = raise Unreachable - -fun unfinishedSeq SpmChr = "unfinished character constant" - | unfinishedSeq SpmStr = "unfinished string literal" - -fun seqParser mode SeqInit (stream, _) (SOME c) = - if seqBound mode = c then - (SeqStart, NONE, stream) - else - raise Unreachable - | seqParser mode SeqStart (stream, _) (SOME c) = - if c <> seqBound mode then - let - val (c, stream) = - if c <> #"\\" then (c, stream) else eatEscSeq stream - in - (SeqValue (ord c), NONE, stream) - end - else if mode = SpmStr then - (SeqTerm, SOME $ StringConst "", stream) - else - raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %") - | seqParser mode (SeqValue v) (stream, startOff) (SOME c) = - if seqBound mode = c then - let - fun term s v = - if mode = SpmChr then - CharConst (s, v) - else - StringConst $ stringCut s - in - (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream) - end - else if mode = SpmStr then - let - val (_, stream) = - if c <> #"\\" then (c, stream) else eatEscSeq stream - in - (SeqValue v, NONE, stream) - end - else - raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value") - | seqParser _ SeqTerm _ (SOME _) = - raise Unreachable - | seqParser mode state (_, _) NONE = - raise case state of - SeqInit => Unreachable - | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) - | SeqValue _ => - seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) - | SeqTerm => Unreachable - -val charParser = seqParser SpmChr -val strParser = seqParser SpmStr - -fun readIncludeArg stream = -let - open String - - fun triml s idx = - if idx = size s then - "" - else if isSpace $ sub (s, idx) then - triml s (idx + 1) - else - extract (s, idx, NONE) - - fun trimr s idx = - if idx = 0 then - "" - else if isSpace $ sub (s, idx) then - trimr s (idx - 1) - else - extract (s, 0, SOME $ idx + 1) - - fun trim s = triml (trimr s (size s - 1)) 0 - - fun getLinePos () = - let - val (fname, line, _) = getPposFromPos (getPos stream) stream - in - (fname, line, NONE) - end - - fun determineType s = - let - fun --> msg = raise TkErrorAug (getLinePos (), msg) - fun isLast c = sub (s, size s - 1) = c - in - if s = "" then - --> "#include argument is empty" - else - case sub (s, 0) of - #"<" => - if isLast #">" then - IAFromRef $ stringCut s - else - --> "expected > at #include argument end" - | #"\"" => - if isLast #"\"" then - IARel $ stringCut s - else - --> "expected \" at #include argument end" - | _ => --> "#include argument should start with \" or <" - end - - val (arg, stream) = readline stream handle - LineWithoutNl => - raise TkErrorAug (getLinePos (), - "#include line does not end with \\n") -in - (determineType $ trim arg, stream) -end - -fun postprocessCppDir tk tkl stream = -let - val isCppDir = - (fn Hash => true | _ => false) (#2 $ hd tkl) - andalso isFirstOnLine (hd tkl) stream - handle Empty => false - val (pos, tk') = tk - - fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl) -in - if isCppDir andalso tk' = Id "include" then - let - val (arg, stream) = readIncludeArg stream - in - (conv $ CppInclude arg, stream) - end - else if isCppDir then - (conv $ formCppDir tk', stream) handle - ExpectedCppDir => - raise TkErrorAug (getPposFromPos pos stream, - "expected preprocessor directive") - - else - (tk :: tkl, stream) -end - -fun unexpectedCharRaise stream c = -let - val (id, pos) = getPosAfterCharRead stream - val pos = getPposFromPos (id, pos) stream - val repr = - if isPrintable c then - str c - else - "<" ^ Int.toString (ord c) ^ ">" -in - raise TkErrorAug (pos, "unexpected character " ^ repr) -end - -fun skipComment stream pos = -let - fun skip prevIsAsterisk stream = - let - val (c, stream) = getchar stream - in - if prevIsAsterisk andalso c = #"/" then - stream - else - skip (c = #"*") stream - end -in - skip false stream handle - EndOfFile => - let - val pos = getPposFromPos pos stream - in - raise TkErrorAug (pos, "unfinished comment") - end -end - -fun handleBackslash stream = -let - val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle - _ => (NONE, stream) - - val raiseErr = fn () => - let - val pos = getPosAfterCharRead stream - val pos = getPposFromPos pos stream - in - raise TkErrorAug (pos, "expected \\n after backslash") - end -in - case c of - SOME c => - if c = #"\n" then - stream - else - raiseErr () - | NONE => raiseErr () -end - -fun processSymbol stream tkl = -let - val (T as (p as (fid, off), tk), stream) = fsmEat stream -in - case tk of - CommentStart => tokenize (skipComment stream p) tkl - | DoubleDot => tokenize stream (((fid, off + 1), Dot) :: (p, Dot) :: tkl) - | _ => tokenize stream (T :: tkl) -end - -and tokenize stream tkl = -let - fun getcharSkipEof stream = getchar stream handle - EndOfFile => getcharSkipEof (advanceToNewFile stream) - - val (c, stream) = (fn (c, s) => (SOME c, s)) $ getcharSkipEof stream - handle - EndOfStream => (NONE, stream) - - fun cont (tk, stream) = tokenize stream (tk :: tkl) - fun @-> parser acc = cont $ parseGeneric stream parser acc -in - case c of - NONE => (rev tkl, #allFiles stream) - | SOME c => - if isSpace c then - tokenize stream tkl - else if isIdStart c then - let - val (tk, stream) = parseGeneric stream idParser () - val (tkl, stream) = postprocessCppDir tk tkl stream - in - tokenize stream tkl - end - else if isDigit c then - @-> numParser () - else if c = #"'" then - @-> charParser SeqInit - else if c = #"\"" then - @-> strParser SeqInit - else if isStartForFsm c then - processSymbol stream tkl - else if c = #"\\" then - tokenize (handleBackslash stream) tkl - else if c = #"\f" then - tokenize stream tkl - else - unexpectedCharRaise stream c -end - fun main [fname] = let - val stream = streamInit fname - val (tkl, fileList) = tokenize stream [] + val stream = Stream.streamInit fname + val (tkl, fileList) = Tokenizer.tokenize stream [] in - List.app (fn (p, x) => (printPos fileList p; printToken x)) tkl + List.app + (fn (p, x) => (Stream.printPos fileList p; Tokenizer.printToken x)) tkl end | main _ = printLn "Expected a signle argument: file name" diff --git a/exn_handler.sml b/exn_handler.sml new file mode 100644 index 0000000..2e3a0b2 --- /dev/null +++ b/exn_handler.sml @@ -0,0 +1,46 @@ +structure GlobalExnHandler: sig val handler: exn -> unit end = struct + + fun eprint s = printLn $ "error: " ^ s + + fun otherExn e = + let + val hist = MLton.Exn.history e + in + eprint $ "exception " ^ exnMessage e ^ " was raised"; + if hist = [] then + (printLn "No stack trace is avaliable"; + printLn "Recompile with -const \"Exn.keepHistory true\"") + else + List.app (fn x => printLn $ "\t" ^ x) hist + end + + fun exit code = Posix.Process.exit $ Word8.fromInt code + + fun ioExn (IO.Io { name, function = _, cause }) = + let + open OS + val prefix = name ^ ": " + val reason = + case cause of + SysErr (str, _) => str + | _ => exnMessage cause + in + printLn $ prefix ^ reason + end + | ioExn _ = (printLn "ioExn: unreachable"; exit 254) + + fun handler e = + let + open Tokenizer + in + (case e of + FsmTableIsTooSmall => + eprint "fsm table is too small. Increate 'maxState' value" + | IO.Io _ => ioExn e + | TkErrorAug (pos, msg) => eprint $ Stream.pos2str pos ^ ": " ^ msg + | _ => otherExn e; + exit 255) + end +end + +val () = MLton.Exn.setTopLevelHandler GlobalExnHandler.handler diff --git a/general.sml b/general.sml new file mode 100644 index 0000000..cde0514 --- /dev/null +++ b/general.sml @@ -0,0 +1,26 @@ +fun $ (x, y) = x y +infixr 0 $ + +fun printLn s = (print s; print "\n") + +fun lazy thunk = +let + datatype 'a value = + Unevaluated of unit -> 'a | + Evaluated of 'a | + Exn of exn + + val value = ref $ Unevaluated thunk +in + fn () => + case !value of + Unevaluated th => + let + val x = th () handle e => (value := Exn e; raise e) + val () = value := Evaluated x + in + x + end + | Evaluated v => v + | Exn e => raise e +end diff --git a/stream.sig b/stream.sig new file mode 100644 index 0000000..98cfb65 --- /dev/null +++ b/stream.sig @@ -0,0 +1,36 @@ +signature STREAM = sig + type fileId = int + type fileOffset = int + type pos = fileId * fileOffset + type convPos = string * int * int option + + type filesInfo = (fileId * string * string) list + + exception EndOfStream + exception EndOfFile + + exception LineWithoutNl + + type t + + val extractFilesInfo: t -> filesInfo + + val pos2str: convPos -> string + val printPos: (fileId * string * string) list -> pos -> unit + + val getchar: t -> char * t + val ungetc: t -> t + val readline: t -> string * t + + val getOffset: t -> fileOffset + val getPos: t -> pos + val getPosAfterCharRead: t -> pos + val getPposFromPos: pos -> t -> convPos + + val getSubstr: fileOffset -> fileOffset -> t -> string + val isFirstOnLine: pos -> t -> bool + + val advanceToNewFile: t -> t + + val streamInit: string -> t +end diff --git a/stream.sml b/stream.sml new file mode 100644 index 0000000..10b02cc --- /dev/null +++ b/stream.sml @@ -0,0 +1,185 @@ +structure Stream :> STREAM = struct + type fileId = int + type fileOffset = int + type pos = fileId * fileOffset + type convPos = string * int * int option + + type filesInfo = (fileId * string * string) list + + exception EndOfStream + exception EndOfFile + + (* unreachable *) + exception InvalidStream + exception InvalidStreamAdvance + + exception LineWithoutNl + + fun pos2str (pos, line, col) = + let + val % = Int.toString + in + case col of + SOME col => pos ^ ":" ^ %line ^ ":" ^ %col + | NONE => pos ^ ":" ^ %line + end + + type t = { + (* stack of file ids, file offsets and file contents *) + stack: (fileId * fileOffset * string) list, + (* list of file ids, file names and file contents *) + allFiles: filesInfo + } + + fun extractFilesInfo (s: t) = #allFiles s + + fun calcFilePos s offset = + let + fun calc s cur offset (line, col) = + if cur = offset then + (line, col) + else + calc s (cur + 1) offset + (if String.sub (s, cur) = #"\n" then (line + 1, 1) + else (line, col + 1)) + in + calc s 0 offset (1, 1) + end + + fun printPos fileList (id, pos) = + let + val triple = List.find (fn (fid, _, _) => fid = id) fileList + in + case triple of + NONE => raise InvalidStream + | SOME (_, fname, contents) => + let + val (line, col) = calcFilePos contents pos + val line = Int.toString line + val col = Int.toString col + in + print $ fname ^ ":" ^ line ^ ":" ^ col ^ ": " + end + end + + fun readFile fname = + let + open TextIO + val h = openIn fname + val s = inputAll h + val () = closeIn h + in + s + end + + fun getchar + ({ stack = (id, off, contents) :: rest, allFiles }: t) + : (char * t) = + if off < String.size contents then + (String.sub (contents, off), + { stack = (id, off + 1, contents) :: rest, allFiles }) + else + raise EndOfFile + | getchar _ = raise EndOfStream + + fun ungetc + ({ stack = (id, off, contents) :: rest, allFiles }) = + if off = 0 then + raise InvalidStream + else + { stack = (id, off - 1, contents) :: rest, allFiles } + | ungetc _ = raise InvalidStream + + fun readline { stack = (fid, off, contents) :: rest, allFiles } = + let + val prevIsSlash = + off > 0 andalso String.sub (contents, off - 1) = #"\\" + + open String + fun read prevIsSlash offset acc = + let + val c = sub (contents, offset) + in + if offset = size contents then + raise LineWithoutNl + else if c = #"\n" then + if prevIsSlash then + read (c = #"\\") (offset + 1) (#" " :: tl acc) + else + (implode $ rev acc, offset + 1) + else + read (c = #"\\") (offset + 1) (c :: acc) + end + + val (arg, newOffset) = read prevIsSlash off [] + in + (arg, { stack = (fid, newOffset, contents) :: rest, allFiles }) + end + | readline _ = raise InvalidStream + + fun getOffset ({ stack = (_, off, _) :: _, ... }: t) = off + | getOffset _ = raise InvalidStream + + fun getPosAfterCharRead + ({ stack = (id, off, _) :: _, ... }: t) = (id, off - 1) + | getPosAfterCharRead _ = raise InvalidStream + + fun getPposFromPos (id, pos) + { stack = (_, _, _) :: _, allFiles} = + let + val (fname, contents) = + case List.find (fn (fid, _, _) => fid = id) allFiles of + NONE => raise InvalidStream + | SOME (_, fname, contents) => (fname, contents) + + val (line, col) = calcFilePos contents pos + in + (fname, line, SOME col) + end + | getPposFromPos _ _ = raise InvalidStream + + fun getPos ({ stack = (id, off, _) :: _, ... }: t) = (id, off) + | getPos _ = raise InvalidStream + + fun getSubstr startOff endOff + ({ stack = (_, _, contents) :: _, ... }: t) = + String.substring (contents, startOff, endOff - startOff) + | getSubstr _ _ _ = raise InvalidStream + + fun advanceToNewFile + ({ stack = (_, off, contents) :: rest, allFiles }: t) = + if off = String.size contents then + { stack = rest, allFiles } + else + raise InvalidStreamAdvance + | advanceToNewFile _ = raise InvalidStreamAdvance + + fun streamInit fname = + let + val contents = readFile fname + in + { allFiles = [(0, fname, contents)], stack = [(0, 0, contents)] } + end + + fun isFirstOnLine (id, offset) (stream: t) = + case List.find (fn (fid, _, _) => id = fid) (#allFiles stream) of + NONE => raise InvalidStream + | SOME (_, _, contents) => + let + fun returnToNL ~1 = true + | returnToNL offset = + let + val chr = String.sub (contents, offset) + in + if chr = #"\n" then + true + else if Char.isSpace chr then + returnToNL (offset - 1) + else + false + end + in + returnToNL (offset - 1) + end + +end diff --git a/tokenizer.sig b/tokenizer.sig new file mode 100644 index 0000000..c31a7e6 --- /dev/null +++ b/tokenizer.sig @@ -0,0 +1,12 @@ +signature TOKENIZER = sig + type token + type fullToken = Stream.pos * token + + (* Fatal. both may be thrown by tokenize *) + exception FsmTableIsTooSmall + exception TkErrorAug of Stream.convPos * string + + val tokenize: Stream.t -> fullToken list -> fullToken list * Stream.filesInfo + + val printToken: token -> unit +end diff --git a/tokenizer.sml b/tokenizer.sml new file mode 100644 index 0000000..53bb396 --- /dev/null +++ b/tokenizer.sml @@ -0,0 +1,787 @@ +structure Tokenizer:> TOKENIZER = struct + datatype includeArg = IARel of string | IAFromRef of string + + datatype token = + Invalid | + Number of string | + Id of string | + CharConst of string * int | + StringConst of string | + + kwBreak | + kwCase | + kwChar | + kwConst | + kwContinue | + kwDefault | + kwDouble | + kwElse | + kwEnum | + kwExtern | + kwFloat | + kwFor | + kwGoto | + kwInt | + kwLong | + kwRegister | + kwReturn | + kwShort | + kwSigned | + kwSizeof | + kwStruct | + kwSwitch | + kwTypedef | + kwUnion | + kwUnsigned | + kwVoid | + kwVolatile | + + LParen | + RParen | + LBracket | + RBracket | + LBrace | + RBrace | + + QuestionMark | + Colon | + Coma | + Semicolon | + + Arrow | + Plus | + DoublePlus| + Minus | + DoubleMinus | + Ampersand | + Asterisk | + Slash | + Tilde | + ExclMark | + Percent | + DoubleGreater | + DoubleLess | + Greater | + Less | + EqualSign | + LessEqualSign | + GreaterEqualSign | + DoubleEqualSign | + ExclMarkEqualSign | + Cap | + VerticalBar | + DoubleAmpersand | + DoubleVerticalBar | + + AsteriskEqualSign | + SlashEqualSign | + PercentEqualSign | + PlusEqualSign | + MinusEqualSign | + DoubleLessEqualSign | + DoubleGreaterEqualSign | + AmpersandEqualSign | + CapEqualSign | + VerticalBarEqualSign | + + Hash | + DoubleHash | + + Dot | + DoubleDot | + TripleDot | + + CommentStart | + + CppInclude of includeArg | + CppDefine | + CppUndef | + CppIf | + CppIfdef | + CppIfndef | + CppElse | + CppElif | + CppEndif | + CppPragma + + val kwPrefix = #"@" + val cppPrefix = #"$" + + type fullToken = Stream.pos * token + + datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart + + exception TkError of tkErrorAuxInfo * string + exception TkErrorAug of Stream.convPos * string + + exception ExpectedCppDir (* handled in postprocess *) + + exception FsmTableIsTooSmall + + (* Unreachable (should be) *) + exception Unreachable + exception TokenWithoutRepr + + val tokenRepr = + let + fun & repr = str kwPrefix ^ repr + fun % repr = str cppPrefix ^ repr + in + [ + (kwBreak, &"break"), + (kwCase, &"case"), + (kwChar, &"char"), + (kwConst, &"const"), + (kwContinue, &"continue"), + (kwDefault, &"default"), + (kwDouble, &"double"), + (kwElse, &"else"), + (kwEnum, &"enum"), + (kwExtern, &"extern"), + (kwFloat, &"float"), + (kwFor, &"for"), + (kwGoto, &"goto"), + (kwInt, &"int"), + (kwLong, &"long"), + (kwRegister, &"register"), + (kwReturn, &"return"), + (kwShort, &"short"), + (kwSigned, &"signed"), + (kwSizeof, &"sizeof"), + (kwStruct, &"struct"), + (kwSwitch, &"switch"), + (kwTypedef, &"typedef"), + (kwUnion, &"union"), + (kwUnsigned, &"unsigned"), + (kwVoid, &"void"), + (kwVolatile, &"volatile"), + + (LParen, "("), + (RParen, ")"), + (LBracket, "["), + (RBracket, "]"), + (LBrace, "{"), + (RBrace, "}"), + + (QuestionMark, "?"), + (Colon, ":"), + (Coma, ","), + (Semicolon, ";"), + + (Arrow, "->"), + (Plus, "+"), + (DoublePlus, "++"), + (Minus, "-"), + (DoubleMinus, "--"), + (Ampersand, "&"), + (Asterisk, "*"), + (Slash, "/"), + (Tilde, "~"), + (ExclMark, "!"), + (Percent, "%"), + (DoubleLess, "<<"), + (DoubleGreater, ">>"), + (Less, "<"), + (Greater, ">"), + (EqualSign, "="), + (LessEqualSign, "<="), + (GreaterEqualSign, ">="), + (DoubleEqualSign, "=="), + (ExclMarkEqualSign, "!="), + (Cap, "^"), + (VerticalBar, "|"), + (DoubleAmpersand, "&&"), + (DoubleVerticalBar, "||"), + + (AsteriskEqualSign, "*="), + (SlashEqualSign, "/="), + (PercentEqualSign, "%="), + (PlusEqualSign, "+="), + (MinusEqualSign, "-="), + (DoubleLessEqualSign, "<<="), + (DoubleGreaterEqualSign, ">>="), + (AmpersandEqualSign, "&="), + (CapEqualSign, "^="), + (VerticalBarEqualSign, "|="), + + (Hash, "#"), + (DoubleHash, "##"), + + (Dot, "."), + (DoubleDot, ".."), + (TripleDot, "..."), + + (CommentStart, "/*"), + + (CppDefine, %"define"), + (CppUndef, %"undef"), + (CppIf, %"if"), + (CppIfdef, %"ifdef"), + (CppIfndef, %"ifndef"), + (CppElse, %"else"), + (CppElif, %"elif"), + (CppEndif, %"endif"), + (CppPragma, %"pragma") + ] + end + + val printToken = fn + Number s => printLn $ "Num: " ^ s + | Id s => printLn $ "Id: " ^ s + | CharConst (repr, _) => printLn repr + | CppInclude arg => + let + val (start, end', arg) = + case arg of + IARel v => ("\"", "\"", v) + | IAFromRef v => ("<", ">", v) + in + printLn $ (str cppPrefix) ^ "include " ^ start ^ arg ^ end' + end + | StringConst s => + printLn $ "\"" ^ s ^ "\"" + | v => + case List.find (fn (x, _) => x = v) tokenRepr of + SOME (_, repr) => printLn repr + | NONE => raise TokenWithoutRepr + + fun isIdStart c = Char.isAlpha c orelse c = #"_" + fun isIdBody c = Char.isAlphaNum c orelse c = #"_" + + fun isPrintable c = Char.isPrint c andalso c <> #" " + + (* FSM for parsing symbols *) + + val maxStates = 51 + + fun fsmInsert (nextState, buf) curState tk (c :: cs) = + let + open Array + + val (_, row) = sub (buf, curState) + val potNextState = sub (row, ord c) + in + if potNextState <> ~1 then + fsmInsert (nextState, buf) potNextState tk cs + else ( + update (row, ord c, !nextState); + nextState := !nextState + 1; + fsmInsert (nextState, buf) (!nextState - 1) tk cs + ) + end + | fsmInsert (_, buf) curState tk [] = + let + open Array + val (_, row) = sub (buf, curState) + in + update (buf, curState, (tk, row)) + end + + fun isStartForFsmGen () = + let + open Array + + val lookupTable = array (128, false) + + fun firstChr s = ord $ String.sub (s, 0) + val () = List.app + (fn (_, repr) => update (lookupTable, firstChr repr, true)) + $ List.filter + (fn (_, repr) => + let + val c = String.sub (repr, 0) + in + c <> kwPrefix andalso c <> cppPrefix + end) + tokenRepr + + in + fn c => sub (lookupTable, ord c) + end + + val isStartForFsm = isStartForFsmGen () + + fun fsmTableCreate () = + let + open Array + + val T as (nextState, buf) = + (ref 1, array (maxStates, (Invalid, array (128, ~1)))) + val r = ref 1 + + fun filterNeeded [] acc = acc + | filterNeeded ((T as (_, repr)) :: tks) acc = + filterNeeded tks + (if isStartForFsm $ String.sub (repr, 0) then + T :: acc + else + acc) + + val tokenRepr = filterNeeded tokenRepr [] + + fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle + Subscript => raise FsmTableIsTooSmall + in + while !r < length buf do ( + update (buf, !r, (Invalid, array(128, ~1))); + r := !r + 1 + ); + + List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; + if !nextState <> maxStates then + printLn $ "note: Fsm table size can be smaller: " + ^ Int.toString (!nextState) ^ " is enough" + else (); + T + end + + (* Unused right now + fun printTable (nextState, buf) = + let + fun printRow i row = + if i = length row then + print "\n" + else + let + val state = sub (row, i) + in + if state = ~1 then + () + else + print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", "); + printRow (i + 1) row + end + + fun print' rowNum buf = + if rowNum = !nextState then + () + else + let + val (tk, row) = sub (buf, rowNum) + in + print ((token2string tk) ^ ": "); + printRow 0 row; + print' (rowNum + 1) buf + end + in + print ("NextState: " ^ Int.toString (!nextState) ^ "\n"); + print' 0 buf; + print "\n" + end + *) + + val fsmTable = lazy fsmTableCreate + + fun fsmEat stream = + let + open Array + val stream = Stream.ungetc stream + val pos = Stream.getPos stream + + fun get curState stream = + let + val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream + handle + _ => (NONE, stream) + in + case c of + NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream) + | SOME c => + let + val (tk, row) = sub (#2 $ fsmTable (), curState) + val nextState = sub (row, ord c) + in + if nextState = ~1 then + (tk, Stream.ungetc stream) + else + get nextState stream + end + end + in + (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream + end + + fun tkError2aug stream (dx, msg) = + let + val (id, pos) = Stream.getPosAfterCharRead stream + val pos = Stream.getPposFromPos (id, pos + dx) stream + in + TkErrorAug (pos, msg) + end + + fun parseGeneric stream parser acc = + let + val stream = Stream.ungetc stream + val P as (_, startOff) = Stream.getPos stream + + fun parse' stream acc = let + val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream handle + _ => (NONE, stream) + + val (acc, tk, stream) = parser acc (stream, startOff) c handle + TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg) + | TkError (TkiStart, msg) => + let + val startPos = Stream.getPposFromPos P stream + in + raise TkErrorAug (startPos, msg) + end + | TkError (TkiEOF, msg) => + let + val (file, line, _) = Stream.getPposFromPos P stream + in + raise TkErrorAug ((file, line, NONE), msg) + end + in + case tk of + NONE => parse' stream acc + | _ => (valOf tk, stream) + end + + val (tk, stream) = parse' stream acc + in + ((P, tk): fullToken, stream) + end + + fun finishSeqRead startOff stream = + let + val (_, endOff) = Stream.getPos stream + val s = Stream.getSubstr startOff endOff stream + in + s + end + + fun findKeyword s = + case List.find + (fn (_, repr) => + String.sub (repr, 0) = kwPrefix andalso + String.extract (repr, 1, NONE) = s) + tokenRepr + of + SOME (tk, _) => tk + | NONE => Id s + + fun idParser () (stream, startOff) c = + let + fun finalize stream = + let + val s = finishSeqRead startOff stream + val tk = findKeyword s + in + ((), SOME tk, stream) + end + in + case c of + NONE => finalize stream + | SOME c => + if isIdBody c then + ((), NONE, stream) + else + finalize (Stream.ungetc stream) + end + + fun formCppDir (Id s) = + let + open String + in + case List.find + (fn (_, repr) => + sub (repr, 0) = cppPrefix andalso + extract (repr, 1, NONE) = s) + tokenRepr + of + SOME (tk, _) => tk + | NONE => raise ExpectedCppDir + end + | formCppDir kwElse = CppElse + | formCppDir _ = raise ExpectedCppDir + + fun numParser () (stream, startOff) c = + let + fun finalize stream = + ((), SOME $ Number (finishSeqRead startOff stream), stream) + in + case c of + NONE => finalize stream + | SOME c => + if Char.isDigit c then + ((), NONE, stream) + else + finalize (Stream.ungetc stream) + end + + fun eatEscSeq stream = + let + val (c, stream) = Stream.getchar stream handle + _ => raise TkError (TkiDx 0, "unfinished escape sequence") + in + (case c of + #"\\" => #"\\" + | #"t" => #"\t" + | #"n" => #"\n" + | c => c, + stream) + end + + fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2) + + datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm + + datatype seqParseMode = SpmChr | SpmStr + + fun seqBound SpmChr = #"'" + | seqBound SpmStr = #"\"" + + fun seqExnConv mode (TkError (v, msg)) = + let + val bound = if mode = SpmChr then "'" else "\"" + val msg = + String.translate (fn c => if c = #"%" then bound else str c) msg + in + TkError (v, msg) + end + | seqExnConv _ _ = raise Unreachable + + fun unfinishedSeq SpmChr = "unfinished character constant" + | unfinishedSeq SpmStr = "unfinished string literal" + + fun seqParser mode SeqInit (stream, _) (SOME c) = + if seqBound mode = c then + (SeqStart, NONE, stream) + else + raise Unreachable + | seqParser mode SeqStart (stream, _) (SOME c) = + if c <> seqBound mode then + let + val (c, stream) = + if c <> #"\\" then (c, stream) else eatEscSeq stream + in + (SeqValue (ord c), NONE, stream) + end + else if mode = SpmStr then + (SeqTerm, SOME $ StringConst "", stream) + else + raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %") + | seqParser mode (SeqValue v) (stream, startOff) (SOME c) = + if seqBound mode = c then + let + fun term s v = + if mode = SpmChr then + CharConst (s, v) + else + StringConst $ stringCut s + in + (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream) + end + else if mode = SpmStr then + let + val (_, stream) = + if c <> #"\\" then (c, stream) else eatEscSeq stream + in + (SeqValue v, NONE, stream) + end + else + raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value") + | seqParser _ SeqTerm _ (SOME _) = + raise Unreachable + | seqParser mode state (_, _) NONE = + raise case state of + SeqInit => Unreachable + | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) + | SeqValue _ => + seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) + | SeqTerm => Unreachable + + val charParser = seqParser SpmChr + val strParser = seqParser SpmStr + + fun readIncludeArg stream = + let + open String + + fun triml s idx = + if idx = size s then + "" + else if Char.isSpace $ sub (s, idx) then + triml s (idx + 1) + else + extract (s, idx, NONE) + + fun trimr s idx = + if idx = 0 then + "" + else if Char.isSpace $ sub (s, idx) then + trimr s (idx - 1) + else + extract (s, 0, SOME $ idx + 1) + + fun trim s = triml (trimr s (size s - 1)) 0 + + fun getLinePos () = + let + val (fname, line, _) = Stream.getPposFromPos (Stream.getPos stream) stream + in + (fname, line, NONE) + end + + fun determineType s = + let + fun --> msg = raise TkErrorAug (getLinePos (), msg) + fun isLast c = sub (s, size s - 1) = c + in + if s = "" then + --> "#include argument is empty" + else + case sub (s, 0) of + #"<" => + if isLast #">" then + IAFromRef $ stringCut s + else + --> "expected > at #include argument end" + | #"\"" => + if isLast #"\"" then + IARel $ stringCut s + else + --> "expected \" at #include argument end" + | _ => --> "#include argument should start with \" or <" + end + + val (arg, stream) = Stream.readline stream handle + Stream.LineWithoutNl => + raise TkErrorAug (getLinePos (), + "#include line does not end with \\n") + in + (determineType $ trim arg, stream) + end + + fun postprocessCppDir tk tkl stream = + let + val isCppDir = + (fn Hash => true | _ => false) (#2 $ hd tkl) + andalso Stream.isFirstOnLine (#1 $ hd tkl) stream + handle Empty => false + val (pos, tk') = tk + + fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl) + in + if isCppDir andalso tk' = Id "include" then + let + val (arg, stream) = readIncludeArg stream + in + (conv $ CppInclude arg, stream) + end + else if isCppDir then + (conv $ formCppDir tk', stream) handle + ExpectedCppDir => + raise TkErrorAug (Stream.getPposFromPos pos stream, + "expected preprocessor directive") + + else + (tk :: tkl, stream) + end + + fun unexpectedCharRaise stream c = + let + val (id, pos) = Stream.getPosAfterCharRead stream + val pos = Stream.getPposFromPos (id, pos) stream + val repr = + if isPrintable c then + str c + else + "<" ^ Int.toString (ord c) ^ ">" + in + raise TkErrorAug (pos, "unexpected character " ^ repr) + end + + fun skipComment stream pos = + let + fun skip prevIsAsterisk stream = + let + val (c, stream) = Stream.getchar stream + in + if prevIsAsterisk andalso c = #"/" then + stream + else + skip (c = #"*") stream + end + in + skip false stream handle + Stream.EndOfFile => + let + val pos = Stream.getPposFromPos pos stream + in + raise TkErrorAug (pos, "unfinished comment") + end + end + + fun handleBackslash stream = + let + val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream handle + _ => (NONE, stream) + + val raiseErr = fn () => + let + val pos = Stream.getPosAfterCharRead stream + val pos = Stream.getPposFromPos pos stream + in + raise TkErrorAug (pos, "expected \\n after backslash") + end + in + case c of + SOME c => + if c = #"\n" then + stream + else + raiseErr () + | NONE => raiseErr () + end + + fun processSymbol stream tkl = + let + val (T as (p as (fid, off), tk), stream) = fsmEat stream + in + case tk of + CommentStart => tokenize (skipComment stream p) tkl + | DoubleDot => tokenize stream (((fid, off + 1), Dot) :: (p, Dot) :: tkl) + | _ => tokenize stream (T :: tkl) + end + + and tokenize stream tkl = + let + fun getcharSkipEof stream = Stream.getchar stream handle + Stream.EndOfFile => getcharSkipEof (Stream.advanceToNewFile stream) + + val (c, stream) = (fn (c, s) => (SOME c, s)) $ getcharSkipEof stream + handle + Stream.EndOfStream => (NONE, stream) + + fun cont (tk, stream) = tokenize stream (tk :: tkl) + fun @-> parser acc = cont $ parseGeneric stream parser acc + in + case c of + NONE => (rev tkl, Stream.extractFilesInfo stream) + | SOME c => + if Char.isSpace c then + tokenize stream tkl + else if isIdStart c then + let + val (tk, stream) = parseGeneric stream idParser () + val (tkl, stream) = postprocessCppDir tk tkl stream + in + tokenize stream tkl + end + else if Char.isDigit c then + @-> numParser () + else if c = #"'" then + @-> charParser SeqInit + else if c = #"\"" then + @-> strParser SeqInit + else if isStartForFsm c then + processSymbol stream tkl + else if c = #"\\" then + tokenize (handleBackslash stream) tkl + else + unexpectedCharRaise stream c + end + +end |