diff options
Diffstat (limited to 'tokenizer.sml')
-rw-r--r-- | tokenizer.sml | 787 |
1 files changed, 787 insertions, 0 deletions
diff --git a/tokenizer.sml b/tokenizer.sml new file mode 100644 index 0000000..53bb396 --- /dev/null +++ b/tokenizer.sml @@ -0,0 +1,787 @@ +structure Tokenizer:> TOKENIZER = struct + datatype includeArg = IARel of string | IAFromRef of string + + datatype token = + Invalid | + Number of string | + Id of string | + CharConst of string * int | + StringConst of string | + + kwBreak | + kwCase | + kwChar | + kwConst | + kwContinue | + kwDefault | + kwDouble | + kwElse | + kwEnum | + kwExtern | + kwFloat | + kwFor | + kwGoto | + kwInt | + kwLong | + kwRegister | + kwReturn | + kwShort | + kwSigned | + kwSizeof | + kwStruct | + kwSwitch | + kwTypedef | + kwUnion | + kwUnsigned | + kwVoid | + kwVolatile | + + LParen | + RParen | + LBracket | + RBracket | + LBrace | + RBrace | + + QuestionMark | + Colon | + Coma | + Semicolon | + + Arrow | + Plus | + DoublePlus| + Minus | + DoubleMinus | + Ampersand | + Asterisk | + Slash | + Tilde | + ExclMark | + Percent | + DoubleGreater | + DoubleLess | + Greater | + Less | + EqualSign | + LessEqualSign | + GreaterEqualSign | + DoubleEqualSign | + ExclMarkEqualSign | + Cap | + VerticalBar | + DoubleAmpersand | + DoubleVerticalBar | + + AsteriskEqualSign | + SlashEqualSign | + PercentEqualSign | + PlusEqualSign | + MinusEqualSign | + DoubleLessEqualSign | + DoubleGreaterEqualSign | + AmpersandEqualSign | + CapEqualSign | + VerticalBarEqualSign | + + Hash | + DoubleHash | + + Dot | + DoubleDot | + TripleDot | + + CommentStart | + + CppInclude of includeArg | + CppDefine | + CppUndef | + CppIf | + CppIfdef | + CppIfndef | + CppElse | + CppElif | + CppEndif | + CppPragma + + val kwPrefix = #"@" + val cppPrefix = #"$" + + type fullToken = Stream.pos * token + + datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart + + exception TkError of tkErrorAuxInfo * string + exception TkErrorAug of Stream.convPos * string + + exception ExpectedCppDir (* handled in postprocess *) + + exception FsmTableIsTooSmall + + (* Unreachable (should be) *) + exception Unreachable + exception TokenWithoutRepr + + val tokenRepr = + let + fun & repr = str kwPrefix ^ repr + fun % repr = str cppPrefix ^ repr + in + [ + (kwBreak, &"break"), + (kwCase, &"case"), + (kwChar, &"char"), + (kwConst, &"const"), + (kwContinue, &"continue"), + (kwDefault, &"default"), + (kwDouble, &"double"), + (kwElse, &"else"), + (kwEnum, &"enum"), + (kwExtern, &"extern"), + (kwFloat, &"float"), + (kwFor, &"for"), + (kwGoto, &"goto"), + (kwInt, &"int"), + (kwLong, &"long"), + (kwRegister, &"register"), + (kwReturn, &"return"), + (kwShort, &"short"), + (kwSigned, &"signed"), + (kwSizeof, &"sizeof"), + (kwStruct, &"struct"), + (kwSwitch, &"switch"), + (kwTypedef, &"typedef"), + (kwUnion, &"union"), + (kwUnsigned, &"unsigned"), + (kwVoid, &"void"), + (kwVolatile, &"volatile"), + + (LParen, "("), + (RParen, ")"), + (LBracket, "["), + (RBracket, "]"), + (LBrace, "{"), + (RBrace, "}"), + + (QuestionMark, "?"), + (Colon, ":"), + (Coma, ","), + (Semicolon, ";"), + + (Arrow, "->"), + (Plus, "+"), + (DoublePlus, "++"), + (Minus, "-"), + (DoubleMinus, "--"), + (Ampersand, "&"), + (Asterisk, "*"), + (Slash, "/"), + (Tilde, "~"), + (ExclMark, "!"), + (Percent, "%"), + (DoubleLess, "<<"), + (DoubleGreater, ">>"), + (Less, "<"), + (Greater, ">"), + (EqualSign, "="), + (LessEqualSign, "<="), + (GreaterEqualSign, ">="), + (DoubleEqualSign, "=="), + (ExclMarkEqualSign, "!="), + (Cap, "^"), + (VerticalBar, "|"), + (DoubleAmpersand, "&&"), + (DoubleVerticalBar, "||"), + + (AsteriskEqualSign, "*="), + (SlashEqualSign, "/="), + (PercentEqualSign, "%="), + (PlusEqualSign, "+="), + (MinusEqualSign, "-="), + (DoubleLessEqualSign, "<<="), + (DoubleGreaterEqualSign, ">>="), + (AmpersandEqualSign, "&="), + (CapEqualSign, "^="), + (VerticalBarEqualSign, "|="), + + (Hash, "#"), + (DoubleHash, "##"), + + (Dot, "."), + (DoubleDot, ".."), + (TripleDot, "..."), + + (CommentStart, "/*"), + + (CppDefine, %"define"), + (CppUndef, %"undef"), + (CppIf, %"if"), + (CppIfdef, %"ifdef"), + (CppIfndef, %"ifndef"), + (CppElse, %"else"), + (CppElif, %"elif"), + (CppEndif, %"endif"), + (CppPragma, %"pragma") + ] + end + + val printToken = fn + Number s => printLn $ "Num: " ^ s + | Id s => printLn $ "Id: " ^ s + | CharConst (repr, _) => printLn repr + | CppInclude arg => + let + val (start, end', arg) = + case arg of + IARel v => ("\"", "\"", v) + | IAFromRef v => ("<", ">", v) + in + printLn $ (str cppPrefix) ^ "include " ^ start ^ arg ^ end' + end + | StringConst s => + printLn $ "\"" ^ s ^ "\"" + | v => + case List.find (fn (x, _) => x = v) tokenRepr of + SOME (_, repr) => printLn repr + | NONE => raise TokenWithoutRepr + + fun isIdStart c = Char.isAlpha c orelse c = #"_" + fun isIdBody c = Char.isAlphaNum c orelse c = #"_" + + fun isPrintable c = Char.isPrint c andalso c <> #" " + + (* FSM for parsing symbols *) + + val maxStates = 51 + + fun fsmInsert (nextState, buf) curState tk (c :: cs) = + let + open Array + + val (_, row) = sub (buf, curState) + val potNextState = sub (row, ord c) + in + if potNextState <> ~1 then + fsmInsert (nextState, buf) potNextState tk cs + else ( + update (row, ord c, !nextState); + nextState := !nextState + 1; + fsmInsert (nextState, buf) (!nextState - 1) tk cs + ) + end + | fsmInsert (_, buf) curState tk [] = + let + open Array + val (_, row) = sub (buf, curState) + in + update (buf, curState, (tk, row)) + end + + fun isStartForFsmGen () = + let + open Array + + val lookupTable = array (128, false) + + fun firstChr s = ord $ String.sub (s, 0) + val () = List.app + (fn (_, repr) => update (lookupTable, firstChr repr, true)) + $ List.filter + (fn (_, repr) => + let + val c = String.sub (repr, 0) + in + c <> kwPrefix andalso c <> cppPrefix + end) + tokenRepr + + in + fn c => sub (lookupTable, ord c) + end + + val isStartForFsm = isStartForFsmGen () + + fun fsmTableCreate () = + let + open Array + + val T as (nextState, buf) = + (ref 1, array (maxStates, (Invalid, array (128, ~1)))) + val r = ref 1 + + fun filterNeeded [] acc = acc + | filterNeeded ((T as (_, repr)) :: tks) acc = + filterNeeded tks + (if isStartForFsm $ String.sub (repr, 0) then + T :: acc + else + acc) + + val tokenRepr = filterNeeded tokenRepr [] + + fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle + Subscript => raise FsmTableIsTooSmall + in + while !r < length buf do ( + update (buf, !r, (Invalid, array(128, ~1))); + r := !r + 1 + ); + + List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; + if !nextState <> maxStates then + printLn $ "note: Fsm table size can be smaller: " + ^ Int.toString (!nextState) ^ " is enough" + else (); + T + end + + (* Unused right now + fun printTable (nextState, buf) = + let + fun printRow i row = + if i = length row then + print "\n" + else + let + val state = sub (row, i) + in + if state = ~1 then + () + else + print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", "); + printRow (i + 1) row + end + + fun print' rowNum buf = + if rowNum = !nextState then + () + else + let + val (tk, row) = sub (buf, rowNum) + in + print ((token2string tk) ^ ": "); + printRow 0 row; + print' (rowNum + 1) buf + end + in + print ("NextState: " ^ Int.toString (!nextState) ^ "\n"); + print' 0 buf; + print "\n" + end + *) + + val fsmTable = lazy fsmTableCreate + + fun fsmEat stream = + let + open Array + val stream = Stream.ungetc stream + val pos = Stream.getPos stream + + fun get curState stream = + let + val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream + handle + _ => (NONE, stream) + in + case c of + NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream) + | SOME c => + let + val (tk, row) = sub (#2 $ fsmTable (), curState) + val nextState = sub (row, ord c) + in + if nextState = ~1 then + (tk, Stream.ungetc stream) + else + get nextState stream + end + end + in + (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream + end + + fun tkError2aug stream (dx, msg) = + let + val (id, pos) = Stream.getPosAfterCharRead stream + val pos = Stream.getPposFromPos (id, pos + dx) stream + in + TkErrorAug (pos, msg) + end + + fun parseGeneric stream parser acc = + let + val stream = Stream.ungetc stream + val P as (_, startOff) = Stream.getPos stream + + fun parse' stream acc = let + val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream handle + _ => (NONE, stream) + + val (acc, tk, stream) = parser acc (stream, startOff) c handle + TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg) + | TkError (TkiStart, msg) => + let + val startPos = Stream.getPposFromPos P stream + in + raise TkErrorAug (startPos, msg) + end + | TkError (TkiEOF, msg) => + let + val (file, line, _) = Stream.getPposFromPos P stream + in + raise TkErrorAug ((file, line, NONE), msg) + end + in + case tk of + NONE => parse' stream acc + | _ => (valOf tk, stream) + end + + val (tk, stream) = parse' stream acc + in + ((P, tk): fullToken, stream) + end + + fun finishSeqRead startOff stream = + let + val (_, endOff) = Stream.getPos stream + val s = Stream.getSubstr startOff endOff stream + in + s + end + + fun findKeyword s = + case List.find + (fn (_, repr) => + String.sub (repr, 0) = kwPrefix andalso + String.extract (repr, 1, NONE) = s) + tokenRepr + of + SOME (tk, _) => tk + | NONE => Id s + + fun idParser () (stream, startOff) c = + let + fun finalize stream = + let + val s = finishSeqRead startOff stream + val tk = findKeyword s + in + ((), SOME tk, stream) + end + in + case c of + NONE => finalize stream + | SOME c => + if isIdBody c then + ((), NONE, stream) + else + finalize (Stream.ungetc stream) + end + + fun formCppDir (Id s) = + let + open String + in + case List.find + (fn (_, repr) => + sub (repr, 0) = cppPrefix andalso + extract (repr, 1, NONE) = s) + tokenRepr + of + SOME (tk, _) => tk + | NONE => raise ExpectedCppDir + end + | formCppDir kwElse = CppElse + | formCppDir _ = raise ExpectedCppDir + + fun numParser () (stream, startOff) c = + let + fun finalize stream = + ((), SOME $ Number (finishSeqRead startOff stream), stream) + in + case c of + NONE => finalize stream + | SOME c => + if Char.isDigit c then + ((), NONE, stream) + else + finalize (Stream.ungetc stream) + end + + fun eatEscSeq stream = + let + val (c, stream) = Stream.getchar stream handle + _ => raise TkError (TkiDx 0, "unfinished escape sequence") + in + (case c of + #"\\" => #"\\" + | #"t" => #"\t" + | #"n" => #"\n" + | c => c, + stream) + end + + fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2) + + datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm + + datatype seqParseMode = SpmChr | SpmStr + + fun seqBound SpmChr = #"'" + | seqBound SpmStr = #"\"" + + fun seqExnConv mode (TkError (v, msg)) = + let + val bound = if mode = SpmChr then "'" else "\"" + val msg = + String.translate (fn c => if c = #"%" then bound else str c) msg + in + TkError (v, msg) + end + | seqExnConv _ _ = raise Unreachable + + fun unfinishedSeq SpmChr = "unfinished character constant" + | unfinishedSeq SpmStr = "unfinished string literal" + + fun seqParser mode SeqInit (stream, _) (SOME c) = + if seqBound mode = c then + (SeqStart, NONE, stream) + else + raise Unreachable + | seqParser mode SeqStart (stream, _) (SOME c) = + if c <> seqBound mode then + let + val (c, stream) = + if c <> #"\\" then (c, stream) else eatEscSeq stream + in + (SeqValue (ord c), NONE, stream) + end + else if mode = SpmStr then + (SeqTerm, SOME $ StringConst "", stream) + else + raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %") + | seqParser mode (SeqValue v) (stream, startOff) (SOME c) = + if seqBound mode = c then + let + fun term s v = + if mode = SpmChr then + CharConst (s, v) + else + StringConst $ stringCut s + in + (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream) + end + else if mode = SpmStr then + let + val (_, stream) = + if c <> #"\\" then (c, stream) else eatEscSeq stream + in + (SeqValue v, NONE, stream) + end + else + raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value") + | seqParser _ SeqTerm _ (SOME _) = + raise Unreachable + | seqParser mode state (_, _) NONE = + raise case state of + SeqInit => Unreachable + | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) + | SeqValue _ => + seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) + | SeqTerm => Unreachable + + val charParser = seqParser SpmChr + val strParser = seqParser SpmStr + + fun readIncludeArg stream = + let + open String + + fun triml s idx = + if idx = size s then + "" + else if Char.isSpace $ sub (s, idx) then + triml s (idx + 1) + else + extract (s, idx, NONE) + + fun trimr s idx = + if idx = 0 then + "" + else if Char.isSpace $ sub (s, idx) then + trimr s (idx - 1) + else + extract (s, 0, SOME $ idx + 1) + + fun trim s = triml (trimr s (size s - 1)) 0 + + fun getLinePos () = + let + val (fname, line, _) = Stream.getPposFromPos (Stream.getPos stream) stream + in + (fname, line, NONE) + end + + fun determineType s = + let + fun --> msg = raise TkErrorAug (getLinePos (), msg) + fun isLast c = sub (s, size s - 1) = c + in + if s = "" then + --> "#include argument is empty" + else + case sub (s, 0) of + #"<" => + if isLast #">" then + IAFromRef $ stringCut s + else + --> "expected > at #include argument end" + | #"\"" => + if isLast #"\"" then + IARel $ stringCut s + else + --> "expected \" at #include argument end" + | _ => --> "#include argument should start with \" or <" + end + + val (arg, stream) = Stream.readline stream handle + Stream.LineWithoutNl => + raise TkErrorAug (getLinePos (), + "#include line does not end with \\n") + in + (determineType $ trim arg, stream) + end + + fun postprocessCppDir tk tkl stream = + let + val isCppDir = + (fn Hash => true | _ => false) (#2 $ hd tkl) + andalso Stream.isFirstOnLine (#1 $ hd tkl) stream + handle Empty => false + val (pos, tk') = tk + + fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl) + in + if isCppDir andalso tk' = Id "include" then + let + val (arg, stream) = readIncludeArg stream + in + (conv $ CppInclude arg, stream) + end + else if isCppDir then + (conv $ formCppDir tk', stream) handle + ExpectedCppDir => + raise TkErrorAug (Stream.getPposFromPos pos stream, + "expected preprocessor directive") + + else + (tk :: tkl, stream) + end + + fun unexpectedCharRaise stream c = + let + val (id, pos) = Stream.getPosAfterCharRead stream + val pos = Stream.getPposFromPos (id, pos) stream + val repr = + if isPrintable c then + str c + else + "<" ^ Int.toString (ord c) ^ ">" + in + raise TkErrorAug (pos, "unexpected character " ^ repr) + end + + fun skipComment stream pos = + let + fun skip prevIsAsterisk stream = + let + val (c, stream) = Stream.getchar stream + in + if prevIsAsterisk andalso c = #"/" then + stream + else + skip (c = #"*") stream + end + in + skip false stream handle + Stream.EndOfFile => + let + val pos = Stream.getPposFromPos pos stream + in + raise TkErrorAug (pos, "unfinished comment") + end + end + + fun handleBackslash stream = + let + val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream handle + _ => (NONE, stream) + + val raiseErr = fn () => + let + val pos = Stream.getPosAfterCharRead stream + val pos = Stream.getPposFromPos pos stream + in + raise TkErrorAug (pos, "expected \\n after backslash") + end + in + case c of + SOME c => + if c = #"\n" then + stream + else + raiseErr () + | NONE => raiseErr () + end + + fun processSymbol stream tkl = + let + val (T as (p as (fid, off), tk), stream) = fsmEat stream + in + case tk of + CommentStart => tokenize (skipComment stream p) tkl + | DoubleDot => tokenize stream (((fid, off + 1), Dot) :: (p, Dot) :: tkl) + | _ => tokenize stream (T :: tkl) + end + + and tokenize stream tkl = + let + fun getcharSkipEof stream = Stream.getchar stream handle + Stream.EndOfFile => getcharSkipEof (Stream.advanceToNewFile stream) + + val (c, stream) = (fn (c, s) => (SOME c, s)) $ getcharSkipEof stream + handle + Stream.EndOfStream => (NONE, stream) + + fun cont (tk, stream) = tokenize stream (tk :: tkl) + fun @-> parser acc = cont $ parseGeneric stream parser acc + in + case c of + NONE => (rev tkl, Stream.extractFilesInfo stream) + | SOME c => + if Char.isSpace c then + tokenize stream tkl + else if isIdStart c then + let + val (tk, stream) = parseGeneric stream idParser () + val (tkl, stream) = postprocessCppDir tk tkl stream + in + tokenize stream tkl + end + else if Char.isDigit c then + @-> numParser () + else if c = #"'" then + @-> charParser SeqInit + else if c = #"\"" then + @-> strParser SeqInit + else if isStartForFsm c then + processSymbol stream tkl + else if c = #"\\" then + tokenize (handleBackslash stream) tkl + else + unexpectedCharRaise stream c + end + +end |