diff options
author | Vladimir Azarov <avm@intermediate-node.net> | 2025-04-04 20:53:56 +0200 |
---|---|---|
committer | Vladimir Azarov <avm@intermediate-node.net> | 2025-04-04 20:53:56 +0200 |
commit | 9d724f17e813fa344d485329d33b5f5ecf8197a3 (patch) | |
tree | 5061e604ea88a379db975b13c1d20688007cacc8 /tokenizer.fun | |
parent | 7b29b31648fd737e7bbc007f480b799add91bc6b (diff) |
Functorization
Diffstat (limited to 'tokenizer.fun')
-rw-r--r-- | tokenizer.fun | 1019 |
1 files changed, 1019 insertions, 0 deletions
diff --git a/tokenizer.fun b/tokenizer.fun new file mode 100644 index 0000000..5cca203 --- /dev/null +++ b/tokenizer.fun @@ -0,0 +1,1019 @@ +functor Tokenizer(structure H: HASHTABLE; structure S: STREAM) + : TOKENIZER = +struct + + structure S = S + + datatype intType = ItDec | ItOct | ItHex + datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL + datatype floatSfx = FsNone | FsF | FsL + + datatype numConst = + IntConst of intType * string * intSfx | + FloatConst of string * floatSfx + + datatype token = + Invalid | + EOS | + NewLine | + + Num of numConst | + + Id of string | + CharConst of string * int | + StringConst of string | + + kwBreak | + kwCase | + kwChar | + kwConst | + kwContinue | + kwDefault | + kwDouble | + kwElse | + kwEnum | + kwExtern | + kwFloat | + kwFor | + kwGoto | + kwInt | + kwLong | + kwRegister | + kwReturn | + kwShort | + kwSigned | + kwSizeof | + kwStruct | + kwSwitch | + kwTypedef | + kwUnion | + kwUnsigned | + kwVoid | + kwVolatile | + + LParen | + RParen | + LBracket | + RBracket | + LBrace | + RBrace | + + QuestionMark | + Colon | + Coma | + Semicolon | + + Arrow | + Plus | + DoublePlus| + Minus | + DoubleMinus | + Ampersand | + Asterisk | + Slash | + Tilde | + ExclMark | + Percent | + DoubleGreater | + DoubleLess | + Greater | + Less | + EqualSign | + LessEqualSign | + GreaterEqualSign | + DoubleEqualSign | + ExclMarkEqualSign | + Cap | + VerticalBar | + DoubleAmpersand | + DoubleVerticalBar | + + AsteriskEqualSign | + SlashEqualSign | + PercentEqualSign | + PlusEqualSign | + MinusEqualSign | + DoubleLessEqualSign | + DoubleGreaterEqualSign | + AmpersandEqualSign | + CapEqualSign | + VerticalBarEqualSign | + + Hash | + DoubleHash | + + Dot | + DoubleDot | + TripleDot | + + CommentStart | + + CppInclude | + CppDefine | + CppUndef | + CppIf | + CppIfdef | + CppIfndef | + CppElse | + CppElif | + CppEndif | + CppWarning | + CppError | + CppPragma + + val kwPrefix = #"@" + val cppPrefix = #"$" + + type fullToken = S.pos * token + + datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart + + exception TkError of tkErrorAuxInfo * string + exception TkErrorAug of S.ppos * string + + exception ExpectedCppDir (* handled in postprocess *) + + exception FsmTableIsTooSmall + + (* Unreachable (should be) *) + exception TokenWithoutRepr + exception SuffixWithoutRepr + + val tokenRepr = + let + fun & repr = str kwPrefix ^ repr + fun % repr = str cppPrefix ^ repr + in + [ + (NewLine, "NewLine"), + + (kwBreak, &"break"), + (kwCase, &"case"), + (kwChar, &"char"), + (kwConst, &"const"), + (kwContinue, &"continue"), + (kwDefault, &"default"), + (kwDouble, &"double"), + (kwElse, &"else"), + (kwEnum, &"enum"), + (kwExtern, &"extern"), + (kwFloat, &"float"), + (kwFor, &"for"), + (kwGoto, &"goto"), + (kwInt, &"int"), + (kwLong, &"long"), + (kwRegister, &"register"), + (kwReturn, &"return"), + (kwShort, &"short"), + (kwSigned, &"signed"), + (kwSizeof, &"sizeof"), + (kwStruct, &"struct"), + (kwSwitch, &"switch"), + (kwTypedef, &"typedef"), + (kwUnion, &"union"), + (kwUnsigned, &"unsigned"), + (kwVoid, &"void"), + (kwVolatile, &"volatile"), + + (LParen, "("), + (RParen, ")"), + (LBracket, "["), + (RBracket, "]"), + (LBrace, "{"), + (RBrace, "}"), + + (QuestionMark, "?"), + (Colon, ":"), + (Coma, ","), + (Semicolon, ";"), + + (Arrow, "->"), + (Plus, "+"), + (DoublePlus, "++"), + (Minus, "-"), + (DoubleMinus, "--"), + (Ampersand, "&"), + (Asterisk, "*"), + (Slash, "/"), + (Tilde, "~"), + (ExclMark, "!"), + (Percent, "%"), + (DoubleLess, "<<"), + (DoubleGreater, ">>"), + (Less, "<"), + (Greater, ">"), + (EqualSign, "="), + (LessEqualSign, "<="), + (GreaterEqualSign, ">="), + (DoubleEqualSign, "=="), + (ExclMarkEqualSign, "!="), + (Cap, "^"), + (VerticalBar, "|"), + (DoubleAmpersand, "&&"), + (DoubleVerticalBar, "||"), + + (AsteriskEqualSign, "*="), + (SlashEqualSign, "/="), + (PercentEqualSign, "%="), + (PlusEqualSign, "+="), + (MinusEqualSign, "-="), + (DoubleLessEqualSign, "<<="), + (DoubleGreaterEqualSign, ">>="), + (AmpersandEqualSign, "&="), + (CapEqualSign, "^="), + (VerticalBarEqualSign, "|="), + + (Hash, "#"), + (DoubleHash, "##"), + + (Dot, "."), + (DoubleDot, ".."), + (TripleDot, "..."), + + (CommentStart, "/*"), + + (CppInclude, %"include"), + (CppDefine, %"define"), + (CppUndef, %"undef"), + (CppIf, %"if"), + (CppIfdef, %"ifdef"), + (CppIfndef, %"ifndef"), + (CppElse, %"else"), + (CppElif, %"elif"), + (CppEndif, %"endif"), + (CppWarning, %"warning"), + (CppError, %"error"), + (CppPragma, %"pragma") + ] + end + + val intSuffixRepr = [ + (IsNone, ""), + (IsU, "u"), + (IsL, "l"), + (IsUL, "ul"), + (IsLL, "ll"), + (IsULL, "ull") + ] + + val floatSuffixRepr = [ + (FsNone, ""), + (FsF, "f"), + (FsL, "l") + ] + + fun getSfxRepr sfx buf onError = + case List.find (fn (sfx', _) => sfx' = sfx) buf of + NONE => onError () + | SOME (_, repr) => repr + + fun getSfxReprSimple sfx buf = + getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr) + + val token2str = fn + Id s => "id:" ^ s + | Num (IntConst (it, str, sfx)) => + let + val intType = + case it of + ItDec => "" + | ItOct => "0" + | ItHex => "0x" + in + intType ^ str ^ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`" + end + | Num (FloatConst (str, sfx)) => + str ^ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`" + | CharConst (repr, _) => repr + | StringConst s => + "\"" ^ s ^ "\"" + | v => + case List.find (fn (x, _) => x = v) tokenRepr of + SOME (_, repr) => repr + | NONE => raise TokenWithoutRepr + + fun printToken tk = print $ token2str tk + + fun isIdStart c = Char.isAlpha c orelse c = #"_" + fun isIdBody c = Char.isAlphaNum c orelse c = #"_" + + fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8" + val isDigit = Char.isDigit + val isHexDigit = Char.isHexDigit + + fun isPrintable c = Char.isPrint c andalso c <> #" " + + (* FSM for parsing symbols *) + + val maxStates = 51 + + fun fsmInsert (nextState, buf) curState tk (c :: cs) = + let + open Array + + val (_, row) = sub (buf, curState) + val potNextState = sub (row, ord c) + in + if potNextState <> ~1 then + fsmInsert (nextState, buf) potNextState tk cs + else ( + update (row, ord c, !nextState); + nextState := !nextState + 1; + fsmInsert (nextState, buf) (!nextState - 1) tk cs + ) + end + | fsmInsert (_, buf) curState tk [] = + let + open Array + val (_, row) = sub (buf, curState) + in + update (buf, curState, (tk, row)) + end + + fun isStartForFsmGen () = + let + open Array + + val lookupTable = array (128, false) + + fun firstChr s = ord $ String.sub (s, 0) + val () = List.app + (fn (_, repr) => update (lookupTable, firstChr repr, true)) + $ List.filter + (fn (tk, repr) => + let + val c = String.sub (repr, 0) + in + c <> kwPrefix andalso c <> cppPrefix + andalso tk <> NewLine + end) + tokenRepr + + in + fn c => sub (lookupTable, ord c) + end + + val isStartForFsm = isStartForFsmGen () + + fun fsmTableCreate () = + let + open Array + + val T as (nextState, buf) = + (ref 1, array (maxStates, (Invalid, array (128, ~1)))) + val r = ref 1 + + fun filterNeeded [] acc = acc + | filterNeeded ((T as (_, repr)) :: tks) acc = + filterNeeded tks + (if isStartForFsm $ String.sub (repr, 0) then + T :: acc + else + acc) + + val tokenRepr = filterNeeded tokenRepr [] + + fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle + Subscript => raise FsmTableIsTooSmall + in + while !r < length buf do ( + update (buf, !r, (Invalid, array(128, ~1))); + r := !r + 1 + ); + + List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; + if !nextState <> maxStates then + printLn $ "note: Fsm table size can be smaller: " + ^ Int.toString (!nextState) ^ " is enough" + else (); + T + end + + (* Unused right now + fun printTable (nextState, buf) = + let + fun printRow i row = + if i = length row then + print "\n" + else + let + val state = sub (row, i) + in + if state = ~1 then + () + else + print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", "); + printRow (i + 1) row + end + + fun print' rowNum buf = + if rowNum = !nextState then + () + else + let + val (tk, row) = sub (buf, rowNum) + in + print ((token2string tk) ^ ": "); + printRow 0 row; + print' (rowNum + 1) buf + end + in + print ("NextState: " ^ Int.toString (!nextState) ^ "\n"); + print' 0 buf; + print "\n" + end + *) + + val fsmTable = lazy fsmTableCreate + + fun fsmEat stream = + let + open Array + val pos = S.getPos stream + + fun get curState stream = + let + val (c, stream) = S.getchar stream + in + case c of + NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream) + | SOME c => + let + val (tk, row) = sub (#2 $ fsmTable (), curState) + val nextState = sub (row, ord c) + in + if nextState = ~1 then + (tk, S.ungetc stream) + else + get nextState stream + end + end + in + (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream + end + + fun tkError2aug stream (dx, msg) = + let + val (id, pos) = S.getPosAfterCharRead stream + val pos = S.pos2ppos (id, pos + dx) stream + in + TkErrorAug (pos, msg) + end + + fun parserWrapper stream parser acc = + let + val stream = S.ungetc stream + val P as (_, startOff) = S.getPos stream + + fun parse' stream acc = let + val (c, stream) = S.getchar stream + + val (acc, tk, stream) = parser acc (stream, startOff) c handle + TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg) + | TkError (TkiStart, msg) => + let + val startPos = S.pos2ppos P stream + in + raise TkErrorAug (startPos, msg) + end + | TkError (TkiEOF, msg) => + let + val pos = S.pposWithoutCol $ S.pos2ppos P stream + in + raise TkErrorAug (pos, msg) + end + in + case tk of + NONE => parse' stream acc + | _ => (valOf tk, stream) + end + + val (tk, stream) = parse' stream acc + in + ((P, tk): fullToken, stream) + end + + fun finishSeqRead startOff stream = + let + val (_, endOff) = S.getPos stream + val s = S.getSubstr startOff endOff stream + in + s + end + + fun keywordHashtableGen () = + let + val table = H.create 128 + val () = + List.app + (fn (tk, repr) => + if String.sub (repr, 0) = kwPrefix then + H.insert table (String.extract (repr, 1, NONE)) tk + else + ()) + tokenRepr + in + table + end + + val keywordHashtable = lazy keywordHashtableGen + + fun findKeyword str = + case H.lookup (keywordHashtable ()) str of + NONE => Id str + | SOME tk => tk + + fun idParser () (stream, startOff) c = + let + fun finalize stream = + let + val s = finishSeqRead startOff stream + val tk = findKeyword s + in + ((), SOME tk, stream) + end + in + case c of + NONE => finalize stream + | SOME c => + if isIdBody c then + ((), NONE, stream) + else + finalize (S.ungetc stream) + end + + datatype intMode = ImDec | ImOct | ImInvalidOct | ImHex + datatype floatMode = FmDot | FmExp + + datatype npState = NpInit | IntMode of intMode | FloatMode of floatMode + + fun getLongestSeq acc pred stream = + let + val (c, stream') = S.getchar stream + in + if isSome c andalso pred $ valOf c then + getLongestSeq (valOf c :: acc) pred stream' + else + (implode $ rev acc, stream) + end + + fun skipDigitSeq off stream = + let + val (res, stream) = getLongestSeq [] isDigit stream + in + if res = "" then + raise TkError (TkiDx off, "expected digit") + else + (String.size res, stream) + end + + fun getSuffixCommon buf off stream = + let + val (sfx, stream) = getLongestSeq [] Char.isAlpha stream + val sfx = String.map (fn c => Char.toLower c) sfx + val sfx = + case List.find (fn (_, repr) => repr = sfx) buf of + NONE => raise TkError (TkiDx off, "unknown suffix") + | SOME (sfx, _) => sfx + in + (sfx, stream) + end + + val getIntSuffix = getSuffixCommon intSuffixRepr 0 + fun getFloatSuffix off = getSuffixCommon floatSuffixRepr off + + (* + * It seems that there is an ambiguity in C: + * gcc/clang/cparser consider 0xe-3 as a fp constant and reject it. + * Right now we do not, but it may be reconsidered. + *) + fun numParser NpInit (stream, _) (SOME c) = + if c = #"0" then + let + val (c, stream') = S.getchar stream + in + case c of + NONE => + (NpInit, SOME $ Num $ IntConst (ItOct, "", IsNone), stream') + | SOME c => + if Char.toLower c = #"x" then + (IntMode ImHex, NONE, stream') + else + (IntMode ImOct, NONE, stream) + end + else + (IntMode ImDec, NONE, stream) + | numParser NpInit _ NONE = raise Unreachable + | numParser (IntMode mode) (stream, startOff) c = + let + val (pred, res, offset) = + case mode of + ImDec => (isDigit, ItDec, 0) + | ImOct => (isOctal, ItOct, 1) + | ImInvalidOct => (isDigit, ItOct, 1) + | ImHex => (isHexDigit, ItHex, 2) + + fun checkAndRaise m msg = + if mode = m then raise TkError (TkiStart, msg) else () + + fun finish () = + let + val () = checkAndRaise ImInvalidOct "invalid octal constant" + val stream = S.ungetc stream + val str = finishSeqRead (startOff + offset) stream + val (sfx, stream) = getIntSuffix stream + in + (IntMode mode, SOME $ Num $ IntConst (res, str, sfx), stream) + end + in + case c of + NONE => finish () + | SOME c => + if pred c then + (IntMode mode, NONE, stream) + else if c = #"." then + (FloatMode FmDot, NONE, stream) + else if Char.toLower c = #"e" then + (checkAndRaise ImHex + "floating constant can not come with 0x prefix"; + (FloatMode FmExp, NONE, stream)) + else + finish () + end + | numParser (FloatMode FmDot) (stream, startOff) _ = + let + val (len, stream) = skipDigitSeq 0 $ S.ungetc stream + + val (c, stream') = S.getchar stream + + fun finish () = + let + val str = finishSeqRead startOff stream + val (sfx, stream) = getFloatSuffix len stream + in + (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) + end + in + case c of + NONE => finish () + | SOME c => + if Char.toLower c = #"e" then + (FloatMode FmExp, NONE, stream') + else + finish () + end + | numParser (FloatMode FmExp) (stream, startOff) c = + let + val (off, stream) = + if c = NONE then + raise TkError (TkiDx 0, "expected digit") + else if valOf c <> #"+" andalso valOf c <> #"-" then + (0, S.ungetc stream) + else + (~1, stream) + + val (len, stream) = skipDigitSeq off stream + val str = finishSeqRead startOff stream + val (sfx, stream) = getFloatSuffix len stream + in + (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) + end + + fun chrIntVal c = + if isDigit c then + ord c - ord #"0" + else if ord c >= ord #"a" andalso ord c <= ord #"z" then + ord c - ord #"a" + 10 + else if ord c >= ord #"A" andalso ord c <= ord #"Z" then + ord c - ord #"A" + 10 + else + raise Unreachable + + fun parseOctalSeq stream c = + let + fun follow stream acc count = + if count = 3 then + (SOME $ chr acc, stream) + else + let + val (c, stream) = S.getchar stream + in + case c of + NONE => (SOME $ chr acc, stream) + | SOME c => + if isOctal c then + follow stream (acc * 8 + chrIntVal c) (count + 1) + else + (SOME $ chr acc, S.ungetc stream) + end + in + follow stream (chrIntVal c) 1 + end + + fun parseHexSeq stream = + let + fun follow stream acc count = + let + val (c, stream) = S.getchar stream + + val noHex = TkError (TkiDx 0, "\\x without hex digits") + in + case c of + NONE => + if count = 0 then + raise noHex + else + (SOME $ chr acc, stream) + | SOME c => + if isHexDigit c then + if count = 2 then + raise TkError (TkiDx 2, "hex sequence out of range") + else + follow stream (acc * 16 + chrIntVal c) (count + 1) + else + if count = 0 then + raise noHex + else + (SOME $ chr acc, S.ungetc stream) + end + in + follow stream 0 0 + end + + fun eatEscSeq stream = + let + fun raiseErr0 msg = raise TkError (TkiDx 0, msg) + + val (c, stream) = S.getchar stream + val c = + case c of + NONE => raiseErr0 "unfinished escape sequence" + | SOME c => c + + fun & c = (SOME c, stream) + in + case c of + #"'" => & #"'" + | #"\"" => & #"\"" + | #"?" => & #"?" + | #"\\" => & #"\\" + | #"a" => & #"\a" + | #"b" => & #"\b" + | #"f" => & #"\f" + | #"n" => & #"\n" + | #"r" => & #"\r" + | #"t" => & #"\t" + | #"v" => & #"\v" + | #"\n" => (NONE, stream) + | #"x" => parseHexSeq stream + | c => + if isOctal c then + parseOctalSeq stream c + else + raiseErr0 "unknown escape sequence" + end + + datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm + + datatype seqParseMode = SpmChr | SpmStr + + fun seqBound SpmChr = #"'" + | seqBound SpmStr = #"\"" + + fun seqExnConv mode (TkError (v, msg)) = + let + val bound = if mode = SpmChr then "'" else "\"" + val msg = + String.translate (fn c => if c = #"%" then bound else str c) msg + in + TkError (v, msg) + end + | seqExnConv _ _ = raise Unreachable + + fun unfinishedSeq SpmChr = "unfinished character constant" + | unfinishedSeq SpmStr = "unfinished string literal" + + fun seqParser mode SeqInit (stream, _) (SOME c) = + if seqBound mode = c then + (SeqStart, NONE, stream) + else + raise Unreachable + | seqParser mode SeqStart (stream, _) (SOME c) = + if c <> seqBound mode then + let + val (c, stream) = + if c <> #"\\" then (SOME c, stream) else eatEscSeq stream + in + if c = NONE then + (SeqStart, NONE, stream) + else + (SeqValue (ord $ Option.valOf c), NONE, stream) + end + else if mode = SpmStr then + (SeqTerm, SOME $ StringConst "", stream) + else + raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %") + | seqParser mode (SeqValue v) (stream, startOff) (SOME c) = + if seqBound mode = c then + let + fun term s v = + if mode = SpmChr then + CharConst (s, v) + else + StringConst $ String.extract (s, 1, SOME $ String.size s - 2) + in + (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream) + end + else if mode = SpmStr then + let + val (_, stream) = + if c <> #"\\" then (SOME c, stream) else eatEscSeq stream + in + (SeqValue v, NONE, stream) + end + else + raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value") + | seqParser _ SeqTerm _ (SOME _) = + raise Unreachable + | seqParser mode state (_, _) NONE = + raise case state of + SeqInit => Unreachable + | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) + | SeqValue _ => + seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) + | SeqTerm => Unreachable + + val charParser = seqParser SpmChr + val strParser = seqParser SpmStr + + fun formCppDir (Id s) = + let + open String + in + case List.find + (fn (_, repr) => + sub (repr, 0) = cppPrefix andalso + extract (repr, 1, NONE) = s) + tokenRepr + of + SOME (tk, _) => tk + | NONE => raise ExpectedCppDir + end + | formCppDir kwElse = CppElse + | formCppDir _ = raise ExpectedCppDir + + fun handleCppDir tk prevPos stream = + let + val (pos, tk') = tk + in + (prevPos, formCppDir tk') handle + ExpectedCppDir => + raise TkErrorAug (S.pos2ppos pos stream, + "expected preprocessor directive") + end + + fun unexpectedCharRaise stream c = + let + val pos = S.pos2ppos (S.getPosAfterCharRead stream) stream + val repr = + if isPrintable c then + str c + else + "<" ^ Int.toString (ord c) ^ ">" + in + raise TkErrorAug (pos, "unexpected character " ^ repr) + end + + fun skipComment stream pos = + let + fun skip prevIsAsterisk stream = + let + val (c, stream) = + case S.getchar stream of + (NONE, _) => + let + val pos = S.pos2ppos pos stream + in + raise TkErrorAug (pos, "unfinished comment") + end + | (SOME c, stream) => (c, stream) + in + if prevIsAsterisk andalso c = #"/" then + stream + else + skip (c = #"*") stream + end + in + skip false stream + end + + fun handleBackslash stream = + let + val (c, stream) = S.getchar stream + + val raiseErr = fn () => + let + val pos = S.getPosAfterCharRead stream + val pos = S.pos2ppos pos stream + in + raise TkErrorAug (pos, "expected \\n after backslash") + end + in + case c of + SOME c => + if c = #"\n" then + stream + else + raiseErr () + | NONE => raiseErr () + end + + fun processSymbol stream = + let + val (T as (p, tk), stream) = fsmEat $ S.ungetc stream + in + case tk of + CommentStart => getToken $ skipComment stream p + | DoubleDot => (SOME (p, Dot), S.ungetc stream) + | Hash => + if S.isFirstOnLine p stream then + let + val (tk, stream) = getToken stream + in + case tk of + NONE => + raise TkErrorAug (S.pos2ppos p stream, + "unfinished preprecessor directive") + | SOME tk => + (SOME $ handleCppDir tk p stream, stream) + end + else + (SOME T, stream) + | _ => (SOME T, stream) + end + + and getToken stream = + let + val (c, stream) = S.getchar stream + + fun @-> parser acc = + (fn (tk, s) => (SOME tk, s)) $ parserWrapper stream parser acc + in + case c of + NONE => (NONE, stream) + | SOME c => + if c = #"\n" then + (SOME (S.getPosAfterCharRead stream, NewLine), stream) + else if Char.isSpace c then + getToken stream + else if isIdStart c then + @-> idParser () + else if isDigit c then + @-> numParser NpInit + else if c = #"'" then + @-> charParser SeqInit + else if c = #"\"" then + @-> strParser SeqInit + else if isStartForFsm c then + processSymbol stream + else if c = #"\\" then + getToken $ handleBackslash stream + else + unexpectedCharRaise stream c + end + + fun tokenize stream = + let + fun aux acc stream = + let + val (tk, stream) = getToken stream + in + case tk of + NONE => rev acc + | SOME tk => aux (tk :: acc) stream + end + in + aux [] stream + end + + fun debugPrint tkl fname = + let + fun print' line _ ((_, NewLine) :: tks) = + print' (line + 1) true tks + | print' line firstOnLine ((_, tk) :: tks) = ( + if firstOnLine then ( + print "\n"; + printLn $ fname ^ ":" ^ Int.toString line; + print "\t") + else + (); + printToken tk; + print " "; + print' line false tks + ) + | print' _ _ [] = () + in + print' 1 true tkl; + print "\n" + end +end |