summaryrefslogtreecommitdiff
path: root/tokenizer.fun
diff options
context:
space:
mode:
authorVladimir Azarov <avm@intermediate-node.net>2025-04-04 20:53:56 +0200
committerVladimir Azarov <avm@intermediate-node.net>2025-04-04 20:53:56 +0200
commit9d724f17e813fa344d485329d33b5f5ecf8197a3 (patch)
tree5061e604ea88a379db975b13c1d20688007cacc8 /tokenizer.fun
parent7b29b31648fd737e7bbc007f480b799add91bc6b (diff)
Functorization
Diffstat (limited to 'tokenizer.fun')
-rw-r--r--tokenizer.fun1019
1 files changed, 1019 insertions, 0 deletions
diff --git a/tokenizer.fun b/tokenizer.fun
new file mode 100644
index 0000000..5cca203
--- /dev/null
+++ b/tokenizer.fun
@@ -0,0 +1,1019 @@
+functor Tokenizer(structure H: HASHTABLE; structure S: STREAM)
+ : TOKENIZER =
+struct
+
+ structure S = S
+
+ datatype intType = ItDec | ItOct | ItHex
+ datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL
+ datatype floatSfx = FsNone | FsF | FsL
+
+ datatype numConst =
+ IntConst of intType * string * intSfx |
+ FloatConst of string * floatSfx
+
+ datatype token =
+ Invalid |
+ EOS |
+ NewLine |
+
+ Num of numConst |
+
+ Id of string |
+ CharConst of string * int |
+ StringConst of string |
+
+ kwBreak |
+ kwCase |
+ kwChar |
+ kwConst |
+ kwContinue |
+ kwDefault |
+ kwDouble |
+ kwElse |
+ kwEnum |
+ kwExtern |
+ kwFloat |
+ kwFor |
+ kwGoto |
+ kwInt |
+ kwLong |
+ kwRegister |
+ kwReturn |
+ kwShort |
+ kwSigned |
+ kwSizeof |
+ kwStruct |
+ kwSwitch |
+ kwTypedef |
+ kwUnion |
+ kwUnsigned |
+ kwVoid |
+ kwVolatile |
+
+ LParen |
+ RParen |
+ LBracket |
+ RBracket |
+ LBrace |
+ RBrace |
+
+ QuestionMark |
+ Colon |
+ Coma |
+ Semicolon |
+
+ Arrow |
+ Plus |
+ DoublePlus|
+ Minus |
+ DoubleMinus |
+ Ampersand |
+ Asterisk |
+ Slash |
+ Tilde |
+ ExclMark |
+ Percent |
+ DoubleGreater |
+ DoubleLess |
+ Greater |
+ Less |
+ EqualSign |
+ LessEqualSign |
+ GreaterEqualSign |
+ DoubleEqualSign |
+ ExclMarkEqualSign |
+ Cap |
+ VerticalBar |
+ DoubleAmpersand |
+ DoubleVerticalBar |
+
+ AsteriskEqualSign |
+ SlashEqualSign |
+ PercentEqualSign |
+ PlusEqualSign |
+ MinusEqualSign |
+ DoubleLessEqualSign |
+ DoubleGreaterEqualSign |
+ AmpersandEqualSign |
+ CapEqualSign |
+ VerticalBarEqualSign |
+
+ Hash |
+ DoubleHash |
+
+ Dot |
+ DoubleDot |
+ TripleDot |
+
+ CommentStart |
+
+ CppInclude |
+ CppDefine |
+ CppUndef |
+ CppIf |
+ CppIfdef |
+ CppIfndef |
+ CppElse |
+ CppElif |
+ CppEndif |
+ CppWarning |
+ CppError |
+ CppPragma
+
+ val kwPrefix = #"@"
+ val cppPrefix = #"$"
+
+ type fullToken = S.pos * token
+
+ datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart
+
+ exception TkError of tkErrorAuxInfo * string
+ exception TkErrorAug of S.ppos * string
+
+ exception ExpectedCppDir (* handled in postprocess *)
+
+ exception FsmTableIsTooSmall
+
+ (* Unreachable (should be) *)
+ exception TokenWithoutRepr
+ exception SuffixWithoutRepr
+
+ val tokenRepr =
+ let
+ fun & repr = str kwPrefix ^ repr
+ fun % repr = str cppPrefix ^ repr
+ in
+ [
+ (NewLine, "NewLine"),
+
+ (kwBreak, &"break"),
+ (kwCase, &"case"),
+ (kwChar, &"char"),
+ (kwConst, &"const"),
+ (kwContinue, &"continue"),
+ (kwDefault, &"default"),
+ (kwDouble, &"double"),
+ (kwElse, &"else"),
+ (kwEnum, &"enum"),
+ (kwExtern, &"extern"),
+ (kwFloat, &"float"),
+ (kwFor, &"for"),
+ (kwGoto, &"goto"),
+ (kwInt, &"int"),
+ (kwLong, &"long"),
+ (kwRegister, &"register"),
+ (kwReturn, &"return"),
+ (kwShort, &"short"),
+ (kwSigned, &"signed"),
+ (kwSizeof, &"sizeof"),
+ (kwStruct, &"struct"),
+ (kwSwitch, &"switch"),
+ (kwTypedef, &"typedef"),
+ (kwUnion, &"union"),
+ (kwUnsigned, &"unsigned"),
+ (kwVoid, &"void"),
+ (kwVolatile, &"volatile"),
+
+ (LParen, "("),
+ (RParen, ")"),
+ (LBracket, "["),
+ (RBracket, "]"),
+ (LBrace, "{"),
+ (RBrace, "}"),
+
+ (QuestionMark, "?"),
+ (Colon, ":"),
+ (Coma, ","),
+ (Semicolon, ";"),
+
+ (Arrow, "->"),
+ (Plus, "+"),
+ (DoublePlus, "++"),
+ (Minus, "-"),
+ (DoubleMinus, "--"),
+ (Ampersand, "&"),
+ (Asterisk, "*"),
+ (Slash, "/"),
+ (Tilde, "~"),
+ (ExclMark, "!"),
+ (Percent, "%"),
+ (DoubleLess, "<<"),
+ (DoubleGreater, ">>"),
+ (Less, "<"),
+ (Greater, ">"),
+ (EqualSign, "="),
+ (LessEqualSign, "<="),
+ (GreaterEqualSign, ">="),
+ (DoubleEqualSign, "=="),
+ (ExclMarkEqualSign, "!="),
+ (Cap, "^"),
+ (VerticalBar, "|"),
+ (DoubleAmpersand, "&&"),
+ (DoubleVerticalBar, "||"),
+
+ (AsteriskEqualSign, "*="),
+ (SlashEqualSign, "/="),
+ (PercentEqualSign, "%="),
+ (PlusEqualSign, "+="),
+ (MinusEqualSign, "-="),
+ (DoubleLessEqualSign, "<<="),
+ (DoubleGreaterEqualSign, ">>="),
+ (AmpersandEqualSign, "&="),
+ (CapEqualSign, "^="),
+ (VerticalBarEqualSign, "|="),
+
+ (Hash, "#"),
+ (DoubleHash, "##"),
+
+ (Dot, "."),
+ (DoubleDot, ".."),
+ (TripleDot, "..."),
+
+ (CommentStart, "/*"),
+
+ (CppInclude, %"include"),
+ (CppDefine, %"define"),
+ (CppUndef, %"undef"),
+ (CppIf, %"if"),
+ (CppIfdef, %"ifdef"),
+ (CppIfndef, %"ifndef"),
+ (CppElse, %"else"),
+ (CppElif, %"elif"),
+ (CppEndif, %"endif"),
+ (CppWarning, %"warning"),
+ (CppError, %"error"),
+ (CppPragma, %"pragma")
+ ]
+ end
+
+ val intSuffixRepr = [
+ (IsNone, ""),
+ (IsU, "u"),
+ (IsL, "l"),
+ (IsUL, "ul"),
+ (IsLL, "ll"),
+ (IsULL, "ull")
+ ]
+
+ val floatSuffixRepr = [
+ (FsNone, ""),
+ (FsF, "f"),
+ (FsL, "l")
+ ]
+
+ fun getSfxRepr sfx buf onError =
+ case List.find (fn (sfx', _) => sfx' = sfx) buf of
+ NONE => onError ()
+ | SOME (_, repr) => repr
+
+ fun getSfxReprSimple sfx buf =
+ getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr)
+
+ val token2str = fn
+ Id s => "id:" ^ s
+ | Num (IntConst (it, str, sfx)) =>
+ let
+ val intType =
+ case it of
+ ItDec => ""
+ | ItOct => "0"
+ | ItHex => "0x"
+ in
+ intType ^ str ^ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`"
+ end
+ | Num (FloatConst (str, sfx)) =>
+ str ^ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`"
+ | CharConst (repr, _) => repr
+ | StringConst s =>
+ "\"" ^ s ^ "\""
+ | v =>
+ case List.find (fn (x, _) => x = v) tokenRepr of
+ SOME (_, repr) => repr
+ | NONE => raise TokenWithoutRepr
+
+ fun printToken tk = print $ token2str tk
+
+ fun isIdStart c = Char.isAlpha c orelse c = #"_"
+ fun isIdBody c = Char.isAlphaNum c orelse c = #"_"
+
+ fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8"
+ val isDigit = Char.isDigit
+ val isHexDigit = Char.isHexDigit
+
+ fun isPrintable c = Char.isPrint c andalso c <> #" "
+
+ (* FSM for parsing symbols *)
+
+ val maxStates = 51
+
+ fun fsmInsert (nextState, buf) curState tk (c :: cs) =
+ let
+ open Array
+
+ val (_, row) = sub (buf, curState)
+ val potNextState = sub (row, ord c)
+ in
+ if potNextState <> ~1 then
+ fsmInsert (nextState, buf) potNextState tk cs
+ else (
+ update (row, ord c, !nextState);
+ nextState := !nextState + 1;
+ fsmInsert (nextState, buf) (!nextState - 1) tk cs
+ )
+ end
+ | fsmInsert (_, buf) curState tk [] =
+ let
+ open Array
+ val (_, row) = sub (buf, curState)
+ in
+ update (buf, curState, (tk, row))
+ end
+
+ fun isStartForFsmGen () =
+ let
+ open Array
+
+ val lookupTable = array (128, false)
+
+ fun firstChr s = ord $ String.sub (s, 0)
+ val () = List.app
+ (fn (_, repr) => update (lookupTable, firstChr repr, true))
+ $ List.filter
+ (fn (tk, repr) =>
+ let
+ val c = String.sub (repr, 0)
+ in
+ c <> kwPrefix andalso c <> cppPrefix
+ andalso tk <> NewLine
+ end)
+ tokenRepr
+
+ in
+ fn c => sub (lookupTable, ord c)
+ end
+
+ val isStartForFsm = isStartForFsmGen ()
+
+ fun fsmTableCreate () =
+ let
+ open Array
+
+ val T as (nextState, buf) =
+ (ref 1, array (maxStates, (Invalid, array (128, ~1))))
+ val r = ref 1
+
+ fun filterNeeded [] acc = acc
+ | filterNeeded ((T as (_, repr)) :: tks) acc =
+ filterNeeded tks
+ (if isStartForFsm $ String.sub (repr, 0) then
+ T :: acc
+ else
+ acc)
+
+ val tokenRepr = filterNeeded tokenRepr []
+
+ fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle
+ Subscript => raise FsmTableIsTooSmall
+ in
+ while !r < length buf do (
+ update (buf, !r, (Invalid, array(128, ~1)));
+ r := !r + 1
+ );
+
+ List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr;
+ if !nextState <> maxStates then
+ printLn $ "note: Fsm table size can be smaller: "
+ ^ Int.toString (!nextState) ^ " is enough"
+ else ();
+ T
+ end
+
+ (* Unused right now
+ fun printTable (nextState, buf) =
+ let
+ fun printRow i row =
+ if i = length row then
+ print "\n"
+ else
+ let
+ val state = sub (row, i)
+ in
+ if state = ~1 then
+ ()
+ else
+ print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", ");
+ printRow (i + 1) row
+ end
+
+ fun print' rowNum buf =
+ if rowNum = !nextState then
+ ()
+ else
+ let
+ val (tk, row) = sub (buf, rowNum)
+ in
+ print ((token2string tk) ^ ": ");
+ printRow 0 row;
+ print' (rowNum + 1) buf
+ end
+ in
+ print ("NextState: " ^ Int.toString (!nextState) ^ "\n");
+ print' 0 buf;
+ print "\n"
+ end
+ *)
+
+ val fsmTable = lazy fsmTableCreate
+
+ fun fsmEat stream =
+ let
+ open Array
+ val pos = S.getPos stream
+
+ fun get curState stream =
+ let
+ val (c, stream) = S.getchar stream
+ in
+ case c of
+ NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream)
+ | SOME c =>
+ let
+ val (tk, row) = sub (#2 $ fsmTable (), curState)
+ val nextState = sub (row, ord c)
+ in
+ if nextState = ~1 then
+ (tk, S.ungetc stream)
+ else
+ get nextState stream
+ end
+ end
+ in
+ (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream
+ end
+
+ fun tkError2aug stream (dx, msg) =
+ let
+ val (id, pos) = S.getPosAfterCharRead stream
+ val pos = S.pos2ppos (id, pos + dx) stream
+ in
+ TkErrorAug (pos, msg)
+ end
+
+ fun parserWrapper stream parser acc =
+ let
+ val stream = S.ungetc stream
+ val P as (_, startOff) = S.getPos stream
+
+ fun parse' stream acc = let
+ val (c, stream) = S.getchar stream
+
+ val (acc, tk, stream) = parser acc (stream, startOff) c handle
+ TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg)
+ | TkError (TkiStart, msg) =>
+ let
+ val startPos = S.pos2ppos P stream
+ in
+ raise TkErrorAug (startPos, msg)
+ end
+ | TkError (TkiEOF, msg) =>
+ let
+ val pos = S.pposWithoutCol $ S.pos2ppos P stream
+ in
+ raise TkErrorAug (pos, msg)
+ end
+ in
+ case tk of
+ NONE => parse' stream acc
+ | _ => (valOf tk, stream)
+ end
+
+ val (tk, stream) = parse' stream acc
+ in
+ ((P, tk): fullToken, stream)
+ end
+
+ fun finishSeqRead startOff stream =
+ let
+ val (_, endOff) = S.getPos stream
+ val s = S.getSubstr startOff endOff stream
+ in
+ s
+ end
+
+ fun keywordHashtableGen () =
+ let
+ val table = H.create 128
+ val () =
+ List.app
+ (fn (tk, repr) =>
+ if String.sub (repr, 0) = kwPrefix then
+ H.insert table (String.extract (repr, 1, NONE)) tk
+ else
+ ())
+ tokenRepr
+ in
+ table
+ end
+
+ val keywordHashtable = lazy keywordHashtableGen
+
+ fun findKeyword str =
+ case H.lookup (keywordHashtable ()) str of
+ NONE => Id str
+ | SOME tk => tk
+
+ fun idParser () (stream, startOff) c =
+ let
+ fun finalize stream =
+ let
+ val s = finishSeqRead startOff stream
+ val tk = findKeyword s
+ in
+ ((), SOME tk, stream)
+ end
+ in
+ case c of
+ NONE => finalize stream
+ | SOME c =>
+ if isIdBody c then
+ ((), NONE, stream)
+ else
+ finalize (S.ungetc stream)
+ end
+
+ datatype intMode = ImDec | ImOct | ImInvalidOct | ImHex
+ datatype floatMode = FmDot | FmExp
+
+ datatype npState = NpInit | IntMode of intMode | FloatMode of floatMode
+
+ fun getLongestSeq acc pred stream =
+ let
+ val (c, stream') = S.getchar stream
+ in
+ if isSome c andalso pred $ valOf c then
+ getLongestSeq (valOf c :: acc) pred stream'
+ else
+ (implode $ rev acc, stream)
+ end
+
+ fun skipDigitSeq off stream =
+ let
+ val (res, stream) = getLongestSeq [] isDigit stream
+ in
+ if res = "" then
+ raise TkError (TkiDx off, "expected digit")
+ else
+ (String.size res, stream)
+ end
+
+ fun getSuffixCommon buf off stream =
+ let
+ val (sfx, stream) = getLongestSeq [] Char.isAlpha stream
+ val sfx = String.map (fn c => Char.toLower c) sfx
+ val sfx =
+ case List.find (fn (_, repr) => repr = sfx) buf of
+ NONE => raise TkError (TkiDx off, "unknown suffix")
+ | SOME (sfx, _) => sfx
+ in
+ (sfx, stream)
+ end
+
+ val getIntSuffix = getSuffixCommon intSuffixRepr 0
+ fun getFloatSuffix off = getSuffixCommon floatSuffixRepr off
+
+ (*
+ * It seems that there is an ambiguity in C:
+ * gcc/clang/cparser consider 0xe-3 as a fp constant and reject it.
+ * Right now we do not, but it may be reconsidered.
+ *)
+ fun numParser NpInit (stream, _) (SOME c) =
+ if c = #"0" then
+ let
+ val (c, stream') = S.getchar stream
+ in
+ case c of
+ NONE =>
+ (NpInit, SOME $ Num $ IntConst (ItOct, "", IsNone), stream')
+ | SOME c =>
+ if Char.toLower c = #"x" then
+ (IntMode ImHex, NONE, stream')
+ else
+ (IntMode ImOct, NONE, stream)
+ end
+ else
+ (IntMode ImDec, NONE, stream)
+ | numParser NpInit _ NONE = raise Unreachable
+ | numParser (IntMode mode) (stream, startOff) c =
+ let
+ val (pred, res, offset) =
+ case mode of
+ ImDec => (isDigit, ItDec, 0)
+ | ImOct => (isOctal, ItOct, 1)
+ | ImInvalidOct => (isDigit, ItOct, 1)
+ | ImHex => (isHexDigit, ItHex, 2)
+
+ fun checkAndRaise m msg =
+ if mode = m then raise TkError (TkiStart, msg) else ()
+
+ fun finish () =
+ let
+ val () = checkAndRaise ImInvalidOct "invalid octal constant"
+ val stream = S.ungetc stream
+ val str = finishSeqRead (startOff + offset) stream
+ val (sfx, stream) = getIntSuffix stream
+ in
+ (IntMode mode, SOME $ Num $ IntConst (res, str, sfx), stream)
+ end
+ in
+ case c of
+ NONE => finish ()
+ | SOME c =>
+ if pred c then
+ (IntMode mode, NONE, stream)
+ else if c = #"." then
+ (FloatMode FmDot, NONE, stream)
+ else if Char.toLower c = #"e" then
+ (checkAndRaise ImHex
+ "floating constant can not come with 0x prefix";
+ (FloatMode FmExp, NONE, stream))
+ else
+ finish ()
+ end
+ | numParser (FloatMode FmDot) (stream, startOff) _ =
+ let
+ val (len, stream) = skipDigitSeq 0 $ S.ungetc stream
+
+ val (c, stream') = S.getchar stream
+
+ fun finish () =
+ let
+ val str = finishSeqRead startOff stream
+ val (sfx, stream) = getFloatSuffix len stream
+ in
+ (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream)
+ end
+ in
+ case c of
+ NONE => finish ()
+ | SOME c =>
+ if Char.toLower c = #"e" then
+ (FloatMode FmExp, NONE, stream')
+ else
+ finish ()
+ end
+ | numParser (FloatMode FmExp) (stream, startOff) c =
+ let
+ val (off, stream) =
+ if c = NONE then
+ raise TkError (TkiDx 0, "expected digit")
+ else if valOf c <> #"+" andalso valOf c <> #"-" then
+ (0, S.ungetc stream)
+ else
+ (~1, stream)
+
+ val (len, stream) = skipDigitSeq off stream
+ val str = finishSeqRead startOff stream
+ val (sfx, stream) = getFloatSuffix len stream
+ in
+ (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream)
+ end
+
+ fun chrIntVal c =
+ if isDigit c then
+ ord c - ord #"0"
+ else if ord c >= ord #"a" andalso ord c <= ord #"z" then
+ ord c - ord #"a" + 10
+ else if ord c >= ord #"A" andalso ord c <= ord #"Z" then
+ ord c - ord #"A" + 10
+ else
+ raise Unreachable
+
+ fun parseOctalSeq stream c =
+ let
+ fun follow stream acc count =
+ if count = 3 then
+ (SOME $ chr acc, stream)
+ else
+ let
+ val (c, stream) = S.getchar stream
+ in
+ case c of
+ NONE => (SOME $ chr acc, stream)
+ | SOME c =>
+ if isOctal c then
+ follow stream (acc * 8 + chrIntVal c) (count + 1)
+ else
+ (SOME $ chr acc, S.ungetc stream)
+ end
+ in
+ follow stream (chrIntVal c) 1
+ end
+
+ fun parseHexSeq stream =
+ let
+ fun follow stream acc count =
+ let
+ val (c, stream) = S.getchar stream
+
+ val noHex = TkError (TkiDx 0, "\\x without hex digits")
+ in
+ case c of
+ NONE =>
+ if count = 0 then
+ raise noHex
+ else
+ (SOME $ chr acc, stream)
+ | SOME c =>
+ if isHexDigit c then
+ if count = 2 then
+ raise TkError (TkiDx 2, "hex sequence out of range")
+ else
+ follow stream (acc * 16 + chrIntVal c) (count + 1)
+ else
+ if count = 0 then
+ raise noHex
+ else
+ (SOME $ chr acc, S.ungetc stream)
+ end
+ in
+ follow stream 0 0
+ end
+
+ fun eatEscSeq stream =
+ let
+ fun raiseErr0 msg = raise TkError (TkiDx 0, msg)
+
+ val (c, stream) = S.getchar stream
+ val c =
+ case c of
+ NONE => raiseErr0 "unfinished escape sequence"
+ | SOME c => c
+
+ fun & c = (SOME c, stream)
+ in
+ case c of
+ #"'" => & #"'"
+ | #"\"" => & #"\""
+ | #"?" => & #"?"
+ | #"\\" => & #"\\"
+ | #"a" => & #"\a"
+ | #"b" => & #"\b"
+ | #"f" => & #"\f"
+ | #"n" => & #"\n"
+ | #"r" => & #"\r"
+ | #"t" => & #"\t"
+ | #"v" => & #"\v"
+ | #"\n" => (NONE, stream)
+ | #"x" => parseHexSeq stream
+ | c =>
+ if isOctal c then
+ parseOctalSeq stream c
+ else
+ raiseErr0 "unknown escape sequence"
+ end
+
+ datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm
+
+ datatype seqParseMode = SpmChr | SpmStr
+
+ fun seqBound SpmChr = #"'"
+ | seqBound SpmStr = #"\""
+
+ fun seqExnConv mode (TkError (v, msg)) =
+ let
+ val bound = if mode = SpmChr then "'" else "\""
+ val msg =
+ String.translate (fn c => if c = #"%" then bound else str c) msg
+ in
+ TkError (v, msg)
+ end
+ | seqExnConv _ _ = raise Unreachable
+
+ fun unfinishedSeq SpmChr = "unfinished character constant"
+ | unfinishedSeq SpmStr = "unfinished string literal"
+
+ fun seqParser mode SeqInit (stream, _) (SOME c) =
+ if seqBound mode = c then
+ (SeqStart, NONE, stream)
+ else
+ raise Unreachable
+ | seqParser mode SeqStart (stream, _) (SOME c) =
+ if c <> seqBound mode then
+ let
+ val (c, stream) =
+ if c <> #"\\" then (SOME c, stream) else eatEscSeq stream
+ in
+ if c = NONE then
+ (SeqStart, NONE, stream)
+ else
+ (SeqValue (ord $ Option.valOf c), NONE, stream)
+ end
+ else if mode = SpmStr then
+ (SeqTerm, SOME $ StringConst "", stream)
+ else
+ raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %")
+ | seqParser mode (SeqValue v) (stream, startOff) (SOME c) =
+ if seqBound mode = c then
+ let
+ fun term s v =
+ if mode = SpmChr then
+ CharConst (s, v)
+ else
+ StringConst $ String.extract (s, 1, SOME $ String.size s - 2)
+ in
+ (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream)
+ end
+ else if mode = SpmStr then
+ let
+ val (_, stream) =
+ if c <> #"\\" then (SOME c, stream) else eatEscSeq stream
+ in
+ (SeqValue v, NONE, stream)
+ end
+ else
+ raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value")
+ | seqParser _ SeqTerm _ (SOME _) =
+ raise Unreachable
+ | seqParser mode state (_, _) NONE =
+ raise case state of
+ SeqInit => Unreachable
+ | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
+ | SeqValue _ =>
+ seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
+ | SeqTerm => Unreachable
+
+ val charParser = seqParser SpmChr
+ val strParser = seqParser SpmStr
+
+ fun formCppDir (Id s) =
+ let
+ open String
+ in
+ case List.find
+ (fn (_, repr) =>
+ sub (repr, 0) = cppPrefix andalso
+ extract (repr, 1, NONE) = s)
+ tokenRepr
+ of
+ SOME (tk, _) => tk
+ | NONE => raise ExpectedCppDir
+ end
+ | formCppDir kwElse = CppElse
+ | formCppDir _ = raise ExpectedCppDir
+
+ fun handleCppDir tk prevPos stream =
+ let
+ val (pos, tk') = tk
+ in
+ (prevPos, formCppDir tk') handle
+ ExpectedCppDir =>
+ raise TkErrorAug (S.pos2ppos pos stream,
+ "expected preprocessor directive")
+ end
+
+ fun unexpectedCharRaise stream c =
+ let
+ val pos = S.pos2ppos (S.getPosAfterCharRead stream) stream
+ val repr =
+ if isPrintable c then
+ str c
+ else
+ "<" ^ Int.toString (ord c) ^ ">"
+ in
+ raise TkErrorAug (pos, "unexpected character " ^ repr)
+ end
+
+ fun skipComment stream pos =
+ let
+ fun skip prevIsAsterisk stream =
+ let
+ val (c, stream) =
+ case S.getchar stream of
+ (NONE, _) =>
+ let
+ val pos = S.pos2ppos pos stream
+ in
+ raise TkErrorAug (pos, "unfinished comment")
+ end
+ | (SOME c, stream) => (c, stream)
+ in
+ if prevIsAsterisk andalso c = #"/" then
+ stream
+ else
+ skip (c = #"*") stream
+ end
+ in
+ skip false stream
+ end
+
+ fun handleBackslash stream =
+ let
+ val (c, stream) = S.getchar stream
+
+ val raiseErr = fn () =>
+ let
+ val pos = S.getPosAfterCharRead stream
+ val pos = S.pos2ppos pos stream
+ in
+ raise TkErrorAug (pos, "expected \\n after backslash")
+ end
+ in
+ case c of
+ SOME c =>
+ if c = #"\n" then
+ stream
+ else
+ raiseErr ()
+ | NONE => raiseErr ()
+ end
+
+ fun processSymbol stream =
+ let
+ val (T as (p, tk), stream) = fsmEat $ S.ungetc stream
+ in
+ case tk of
+ CommentStart => getToken $ skipComment stream p
+ | DoubleDot => (SOME (p, Dot), S.ungetc stream)
+ | Hash =>
+ if S.isFirstOnLine p stream then
+ let
+ val (tk, stream) = getToken stream
+ in
+ case tk of
+ NONE =>
+ raise TkErrorAug (S.pos2ppos p stream,
+ "unfinished preprecessor directive")
+ | SOME tk =>
+ (SOME $ handleCppDir tk p stream, stream)
+ end
+ else
+ (SOME T, stream)
+ | _ => (SOME T, stream)
+ end
+
+ and getToken stream =
+ let
+ val (c, stream) = S.getchar stream
+
+ fun @-> parser acc =
+ (fn (tk, s) => (SOME tk, s)) $ parserWrapper stream parser acc
+ in
+ case c of
+ NONE => (NONE, stream)
+ | SOME c =>
+ if c = #"\n" then
+ (SOME (S.getPosAfterCharRead stream, NewLine), stream)
+ else if Char.isSpace c then
+ getToken stream
+ else if isIdStart c then
+ @-> idParser ()
+ else if isDigit c then
+ @-> numParser NpInit
+ else if c = #"'" then
+ @-> charParser SeqInit
+ else if c = #"\"" then
+ @-> strParser SeqInit
+ else if isStartForFsm c then
+ processSymbol stream
+ else if c = #"\\" then
+ getToken $ handleBackslash stream
+ else
+ unexpectedCharRaise stream c
+ end
+
+ fun tokenize stream =
+ let
+ fun aux acc stream =
+ let
+ val (tk, stream) = getToken stream
+ in
+ case tk of
+ NONE => rev acc
+ | SOME tk => aux (tk :: acc) stream
+ end
+ in
+ aux [] stream
+ end
+
+ fun debugPrint tkl fname =
+ let
+ fun print' line _ ((_, NewLine) :: tks) =
+ print' (line + 1) true tks
+ | print' line firstOnLine ((_, tk) :: tks) = (
+ if firstOnLine then (
+ print "\n";
+ printLn $ fname ^ ":" ^ Int.toString line;
+ print "\t")
+ else
+ ();
+ printToken tk;
+ print " ";
+ print' line false tks
+ )
+ | print' _ _ [] = ()
+ in
+ print' 1 true tkl;
+ print "\n"
+ end
+end