summaryrefslogtreecommitdiff
path: root/tokenizer.sml
diff options
context:
space:
mode:
Diffstat (limited to 'tokenizer.sml')
-rw-r--r--tokenizer.sml1018
1 files changed, 0 insertions, 1018 deletions
diff --git a/tokenizer.sml b/tokenizer.sml
deleted file mode 100644
index 10c2e77..0000000
--- a/tokenizer.sml
+++ /dev/null
@@ -1,1018 +0,0 @@
-structure Tokenizer:> TOKENIZER = struct
-
- datatype intType = ItDec | ItOct | ItHex
- datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL
- datatype floatSfx = FsNone | FsF | FsL
-
- datatype numConst =
- IntConst of intType * string * intSfx |
- FloatConst of string * floatSfx
-
- datatype token =
- Invalid |
- EOS |
- NewLine |
-
- Num of numConst |
-
- Id of string |
- CharConst of string * int |
- StringConst of string |
-
- kwBreak |
- kwCase |
- kwChar |
- kwConst |
- kwContinue |
- kwDefault |
- kwDouble |
- kwElse |
- kwEnum |
- kwExtern |
- kwFloat |
- kwFor |
- kwGoto |
- kwInt |
- kwLong |
- kwRegister |
- kwReturn |
- kwShort |
- kwSigned |
- kwSizeof |
- kwStruct |
- kwSwitch |
- kwTypedef |
- kwUnion |
- kwUnsigned |
- kwVoid |
- kwVolatile |
-
- LParen |
- RParen |
- LBracket |
- RBracket |
- LBrace |
- RBrace |
-
- QuestionMark |
- Colon |
- Coma |
- Semicolon |
-
- Arrow |
- Plus |
- DoublePlus|
- Minus |
- DoubleMinus |
- Ampersand |
- Asterisk |
- Slash |
- Tilde |
- ExclMark |
- Percent |
- DoubleGreater |
- DoubleLess |
- Greater |
- Less |
- EqualSign |
- LessEqualSign |
- GreaterEqualSign |
- DoubleEqualSign |
- ExclMarkEqualSign |
- Cap |
- VerticalBar |
- DoubleAmpersand |
- DoubleVerticalBar |
-
- AsteriskEqualSign |
- SlashEqualSign |
- PercentEqualSign |
- PlusEqualSign |
- MinusEqualSign |
- DoubleLessEqualSign |
- DoubleGreaterEqualSign |
- AmpersandEqualSign |
- CapEqualSign |
- VerticalBarEqualSign |
-
- Hash |
- DoubleHash |
-
- Dot |
- DoubleDot |
- TripleDot |
-
- CommentStart |
-
- CppInclude |
- CppDefine |
- CppUndef |
- CppIf |
- CppIfdef |
- CppIfndef |
- CppElse |
- CppElif |
- CppEndif |
- CppWarning |
- CppError |
- CppPragma
-
- val kwPrefix = #"@"
- val cppPrefix = #"$"
-
- type fullToken = Stream.pos * token
-
- datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart
-
- exception TkError of tkErrorAuxInfo * string
- exception TkErrorAug of Stream.ppos * string
-
- exception ExpectedCppDir (* handled in postprocess *)
-
- exception FsmTableIsTooSmall
-
- (* Unreachable (should be) *)
- exception TokenWithoutRepr
- exception SuffixWithoutRepr
-
- val tokenRepr =
- let
- fun & repr = str kwPrefix ^ repr
- fun % repr = str cppPrefix ^ repr
- in
- [
- (NewLine, "NewLine"),
-
- (kwBreak, &"break"),
- (kwCase, &"case"),
- (kwChar, &"char"),
- (kwConst, &"const"),
- (kwContinue, &"continue"),
- (kwDefault, &"default"),
- (kwDouble, &"double"),
- (kwElse, &"else"),
- (kwEnum, &"enum"),
- (kwExtern, &"extern"),
- (kwFloat, &"float"),
- (kwFor, &"for"),
- (kwGoto, &"goto"),
- (kwInt, &"int"),
- (kwLong, &"long"),
- (kwRegister, &"register"),
- (kwReturn, &"return"),
- (kwShort, &"short"),
- (kwSigned, &"signed"),
- (kwSizeof, &"sizeof"),
- (kwStruct, &"struct"),
- (kwSwitch, &"switch"),
- (kwTypedef, &"typedef"),
- (kwUnion, &"union"),
- (kwUnsigned, &"unsigned"),
- (kwVoid, &"void"),
- (kwVolatile, &"volatile"),
-
- (LParen, "("),
- (RParen, ")"),
- (LBracket, "["),
- (RBracket, "]"),
- (LBrace, "{"),
- (RBrace, "}"),
-
- (QuestionMark, "?"),
- (Colon, ":"),
- (Coma, ","),
- (Semicolon, ";"),
-
- (Arrow, "->"),
- (Plus, "+"),
- (DoublePlus, "++"),
- (Minus, "-"),
- (DoubleMinus, "--"),
- (Ampersand, "&"),
- (Asterisk, "*"),
- (Slash, "/"),
- (Tilde, "~"),
- (ExclMark, "!"),
- (Percent, "%"),
- (DoubleLess, "<<"),
- (DoubleGreater, ">>"),
- (Less, "<"),
- (Greater, ">"),
- (EqualSign, "="),
- (LessEqualSign, "<="),
- (GreaterEqualSign, ">="),
- (DoubleEqualSign, "=="),
- (ExclMarkEqualSign, "!="),
- (Cap, "^"),
- (VerticalBar, "|"),
- (DoubleAmpersand, "&&"),
- (DoubleVerticalBar, "||"),
-
- (AsteriskEqualSign, "*="),
- (SlashEqualSign, "/="),
- (PercentEqualSign, "%="),
- (PlusEqualSign, "+="),
- (MinusEqualSign, "-="),
- (DoubleLessEqualSign, "<<="),
- (DoubleGreaterEqualSign, ">>="),
- (AmpersandEqualSign, "&="),
- (CapEqualSign, "^="),
- (VerticalBarEqualSign, "|="),
-
- (Hash, "#"),
- (DoubleHash, "##"),
-
- (Dot, "."),
- (DoubleDot, ".."),
- (TripleDot, "..."),
-
- (CommentStart, "/*"),
-
- (CppInclude, %"include"),
- (CppDefine, %"define"),
- (CppUndef, %"undef"),
- (CppIf, %"if"),
- (CppIfdef, %"ifdef"),
- (CppIfndef, %"ifndef"),
- (CppElse, %"else"),
- (CppElif, %"elif"),
- (CppEndif, %"endif"),
- (CppWarning, %"warning"),
- (CppError, %"error"),
- (CppPragma, %"pragma")
- ]
- end
-
- val intSuffixRepr = [
- (IsNone, ""),
- (IsU, "u"),
- (IsL, "l"),
- (IsUL, "ul"),
- (IsLL, "ll"),
- (IsULL, "ull")
- ]
-
- val floatSuffixRepr = [
- (FsNone, ""),
- (FsF, "f"),
- (FsL, "l")
- ]
-
- fun getSfxRepr sfx buf onError =
- case List.find (fn (sfx', _) => sfx' = sfx) buf of
- NONE => onError ()
- | SOME (_, repr) => repr
-
- fun getSfxReprSimple sfx buf =
- getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr)
-
- val token2str = fn
- Id s => "id:" ^ s
- | Num (IntConst (it, str, sfx)) =>
- let
- val intType =
- case it of
- ItDec => ""
- | ItOct => "0"
- | ItHex => "0x"
- in
- intType ^ str ^ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`"
- end
- | Num (FloatConst (str, sfx)) =>
- str ^ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`"
- | CharConst (repr, _) => repr
- | StringConst s =>
- "\"" ^ s ^ "\""
- | v =>
- case List.find (fn (x, _) => x = v) tokenRepr of
- SOME (_, repr) => repr
- | NONE => raise TokenWithoutRepr
-
- fun printToken tk = print $ token2str tk
-
- fun isIdStart c = Char.isAlpha c orelse c = #"_"
- fun isIdBody c = Char.isAlphaNum c orelse c = #"_"
-
- fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8"
- val isDigit = Char.isDigit
- val isHexDigit = Char.isHexDigit
-
- fun isPrintable c = Char.isPrint c andalso c <> #" "
-
- (* FSM for parsing symbols *)
-
- val maxStates = 51
-
- fun fsmInsert (nextState, buf) curState tk (c :: cs) =
- let
- open Array
-
- val (_, row) = sub (buf, curState)
- val potNextState = sub (row, ord c)
- in
- if potNextState <> ~1 then
- fsmInsert (nextState, buf) potNextState tk cs
- else (
- update (row, ord c, !nextState);
- nextState := !nextState + 1;
- fsmInsert (nextState, buf) (!nextState - 1) tk cs
- )
- end
- | fsmInsert (_, buf) curState tk [] =
- let
- open Array
- val (_, row) = sub (buf, curState)
- in
- update (buf, curState, (tk, row))
- end
-
- fun isStartForFsmGen () =
- let
- open Array
-
- val lookupTable = array (128, false)
-
- fun firstChr s = ord $ String.sub (s, 0)
- val () = List.app
- (fn (_, repr) => update (lookupTable, firstChr repr, true))
- $ List.filter
- (fn (tk, repr) =>
- let
- val c = String.sub (repr, 0)
- in
- c <> kwPrefix andalso c <> cppPrefix
- andalso tk <> NewLine
- end)
- tokenRepr
-
- in
- fn c => sub (lookupTable, ord c)
- end
-
- val isStartForFsm = isStartForFsmGen ()
-
- fun fsmTableCreate () =
- let
- open Array
-
- val T as (nextState, buf) =
- (ref 1, array (maxStates, (Invalid, array (128, ~1))))
- val r = ref 1
-
- fun filterNeeded [] acc = acc
- | filterNeeded ((T as (_, repr)) :: tks) acc =
- filterNeeded tks
- (if isStartForFsm $ String.sub (repr, 0) then
- T :: acc
- else
- acc)
-
- val tokenRepr = filterNeeded tokenRepr []
-
- fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle
- Subscript => raise FsmTableIsTooSmall
- in
- while !r < length buf do (
- update (buf, !r, (Invalid, array(128, ~1)));
- r := !r + 1
- );
-
- List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr;
- if !nextState <> maxStates then
- printLn $ "note: Fsm table size can be smaller: "
- ^ Int.toString (!nextState) ^ " is enough"
- else ();
- T
- end
-
- (* Unused right now
- fun printTable (nextState, buf) =
- let
- fun printRow i row =
- if i = length row then
- print "\n"
- else
- let
- val state = sub (row, i)
- in
- if state = ~1 then
- ()
- else
- print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", ");
- printRow (i + 1) row
- end
-
- fun print' rowNum buf =
- if rowNum = !nextState then
- ()
- else
- let
- val (tk, row) = sub (buf, rowNum)
- in
- print ((token2string tk) ^ ": ");
- printRow 0 row;
- print' (rowNum + 1) buf
- end
- in
- print ("NextState: " ^ Int.toString (!nextState) ^ "\n");
- print' 0 buf;
- print "\n"
- end
- *)
-
- val fsmTable = lazy fsmTableCreate
-
- fun fsmEat stream =
- let
- open Array
- val pos = Stream.getPos stream
-
- fun get curState stream =
- let
- val (c, stream) = Stream.getchar stream
- in
- case c of
- NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream)
- | SOME c =>
- let
- val (tk, row) = sub (#2 $ fsmTable (), curState)
- val nextState = sub (row, ord c)
- in
- if nextState = ~1 then
- (tk, Stream.ungetc stream)
- else
- get nextState stream
- end
- end
- in
- (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream
- end
-
- fun tkError2aug stream (dx, msg) =
- let
- val (id, pos) = Stream.getPosAfterCharRead stream
- val pos = Stream.pos2ppos (id, pos + dx) stream
- in
- TkErrorAug (pos, msg)
- end
-
- fun parseGeneric stream parser acc =
- let
- val stream = Stream.ungetc stream
- val P as (_, startOff) = Stream.getPos stream
-
- fun parse' stream acc = let
- val (c, stream) = Stream.getchar stream
-
- val (acc, tk, stream) = parser acc (stream, startOff) c handle
- TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg)
- | TkError (TkiStart, msg) =>
- let
- val startPos = Stream.pos2ppos P stream
- in
- raise TkErrorAug (startPos, msg)
- end
- | TkError (TkiEOF, msg) =>
- let
- open Stream
- val pos = pposWithoutCol $ pos2ppos P stream
- in
- raise TkErrorAug (pos, msg)
- end
- in
- case tk of
- NONE => parse' stream acc
- | _ => (valOf tk, stream)
- end
-
- val (tk, stream) = parse' stream acc
- in
- ((P, tk): fullToken, stream)
- end
-
- fun finishSeqRead startOff stream =
- let
- val (_, endOff) = Stream.getPos stream
- val s = Stream.getSubstr startOff endOff stream
- in
- s
- end
-
- fun keywordHashtableGen () =
- let
- open Hashtable
- val table = create 128
- val () =
- List.app
- (fn (tk, repr) =>
- if String.sub (repr, 0) = kwPrefix then
- insert table (String.extract (repr, 1, NONE)) tk
- else
- ())
- tokenRepr
- in
- table
- end
-
- val keywordHashtable = lazy keywordHashtableGen
-
- fun findKeyword str =
- case Hashtable.lookup (keywordHashtable ()) str of
- NONE => Id str
- | SOME tk => tk
-
- fun idParser () (stream, startOff) c =
- let
- fun finalize stream =
- let
- val s = finishSeqRead startOff stream
- val tk = findKeyword s
- in
- ((), SOME tk, stream)
- end
- in
- case c of
- NONE => finalize stream
- | SOME c =>
- if isIdBody c then
- ((), NONE, stream)
- else
- finalize (Stream.ungetc stream)
- end
-
- datatype intMode = ImDec | ImOct | ImInvalidOct | ImHex
- datatype floatMode = FmDot | FmExp
-
- datatype npState = NpInit | IntMode of intMode | FloatMode of floatMode
-
- fun getLongestSeq acc pred stream =
- let
- val (c, stream') = Stream.getchar stream
- in
- if isSome c andalso pred $ valOf c then
- getLongestSeq (valOf c :: acc) pred stream'
- else
- (implode $ rev acc, stream)
- end
-
- fun skipDigitSeq off stream =
- let
- val (res, stream) = getLongestSeq [] isDigit stream
- in
- if res = "" then
- raise TkError (TkiDx off, "expected digit")
- else
- (String.size res, stream)
- end
-
- fun getSuffixCommon buf off stream =
- let
- val (sfx, stream) = getLongestSeq [] Char.isAlpha stream
- val sfx = String.map (fn c => Char.toLower c) sfx
- val sfx =
- case List.find (fn (_, repr) => repr = sfx) buf of
- NONE => raise TkError (TkiDx off, "unknown suffix")
- | SOME (sfx, _) => sfx
- in
- (sfx, stream)
- end
-
- val getIntSuffix = getSuffixCommon intSuffixRepr 0
- fun getFloatSuffix off = getSuffixCommon floatSuffixRepr off
-
- (*
- * It seems that there is an ambiguity in C:
- * gcc/clang/cparser consider 0xe-3 as a fp constant and reject it.
- * Right now we do not, but it may be reconsidered.
- *)
- fun numParser NpInit (stream, _) (SOME c) =
- if c = #"0" then
- let
- val (c, stream') = Stream.getchar stream
- in
- case c of
- NONE =>
- (NpInit, SOME $ Num $ IntConst (ItOct, "", IsNone), stream')
- | SOME c =>
- if Char.toLower c = #"x" then
- (IntMode ImHex, NONE, stream')
- else
- (IntMode ImOct, NONE, stream)
- end
- else
- (IntMode ImDec, NONE, stream)
- | numParser NpInit _ NONE = raise Unreachable
- | numParser (IntMode mode) (stream, startOff) c =
- let
- val (pred, res, offset) =
- case mode of
- ImDec => (isDigit, ItDec, 0)
- | ImOct => (isOctal, ItOct, 1)
- | ImInvalidOct => (isDigit, ItOct, 1)
- | ImHex => (isHexDigit, ItHex, 2)
-
- fun checkAndRaise m msg =
- if mode = m then raise TkError (TkiStart, msg) else ()
-
- fun finish () =
- let
- val () = checkAndRaise ImInvalidOct "invalid octal constant"
- val stream = Stream.ungetc stream
- val str = finishSeqRead (startOff + offset) stream
- val (sfx, stream) = getIntSuffix stream
- in
- (IntMode mode, SOME $ Num $ IntConst (res, str, sfx), stream)
- end
- in
- case c of
- NONE => finish ()
- | SOME c =>
- if pred c then
- (IntMode mode, NONE, stream)
- else if c = #"." then
- (FloatMode FmDot, NONE, stream)
- else if Char.toLower c = #"e" then
- (checkAndRaise ImHex
- "floating constant can not come with 0x prefix";
- (FloatMode FmExp, NONE, stream))
- else
- finish ()
- end
- | numParser (FloatMode FmDot) (stream, startOff) _ =
- let
- val (len, stream) = skipDigitSeq 0 $ Stream.ungetc stream
-
- val (c, stream') = Stream.getchar stream
-
- fun finish () =
- let
- val str = finishSeqRead startOff stream
- val (sfx, stream) = getFloatSuffix len stream
- in
- (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream)
- end
- in
- case c of
- NONE => finish ()
- | SOME c =>
- if Char.toLower c = #"e" then
- (FloatMode FmExp, NONE, stream')
- else
- finish ()
- end
- | numParser (FloatMode FmExp) (stream, startOff) c =
- let
- val (off, stream) =
- if c = NONE then
- raise TkError (TkiDx 0, "expected digit")
- else if valOf c <> #"+" andalso valOf c <> #"-" then
- (0, Stream.ungetc stream)
- else
- (~1, stream)
-
- val (len, stream) = skipDigitSeq off stream
- val str = finishSeqRead startOff stream
- val (sfx, stream) = getFloatSuffix len stream
- in
- (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream)
- end
-
- fun chrIntVal c =
- if isDigit c then
- ord c - ord #"0"
- else if ord c >= ord #"a" andalso ord c <= ord #"z" then
- ord c - ord #"a" + 10
- else if ord c >= ord #"A" andalso ord c <= ord #"Z" then
- ord c - ord #"A" + 10
- else
- raise Unreachable
-
- fun parseOctalSeq stream c =
- let
- fun follow stream acc count =
- if count = 3 then
- (SOME $ chr acc, stream)
- else
- let
- val (c, stream) = Stream.getchar stream
- in
- case c of
- NONE => (SOME $ chr acc, stream)
- | SOME c =>
- if isOctal c then
- follow stream (acc * 8 + chrIntVal c) (count + 1)
- else
- (SOME $ chr acc, Stream.ungetc stream)
- end
- in
- follow stream (chrIntVal c) 1
- end
-
- fun parseHexSeq stream =
- let
- fun follow stream acc count =
- let
- val (c, stream) = Stream.getchar stream
-
- val noHex = TkError (TkiDx 0, "\\x without hex digits")
- in
- case c of
- NONE =>
- if count = 0 then
- raise noHex
- else
- (SOME $ chr acc, stream)
- | SOME c =>
- if isHexDigit c then
- if count = 2 then
- raise TkError (TkiDx 2, "hex sequence out of range")
- else
- follow stream (acc * 16 + chrIntVal c) (count + 1)
- else
- if count = 0 then
- raise noHex
- else
- (SOME $ chr acc, Stream.ungetc stream)
- end
- in
- follow stream 0 0
- end
-
- fun eatEscSeq stream =
- let
- fun raiseErr0 msg = raise TkError (TkiDx 0, msg)
-
- val (c, stream) = Stream.getchar stream
- val c =
- case c of
- NONE => raiseErr0 "unfinished escape sequence"
- | SOME c => c
-
- fun & c = (SOME c, stream)
- in
- case c of
- #"'" => & #"'"
- | #"\"" => & #"\""
- | #"?" => & #"?"
- | #"\\" => & #"\\"
- | #"a" => & #"\a"
- | #"b" => & #"\b"
- | #"f" => & #"\f"
- | #"n" => & #"\n"
- | #"r" => & #"\r"
- | #"t" => & #"\t"
- | #"v" => & #"\v"
- | #"\n" => (NONE, stream)
- | #"x" => parseHexSeq stream
- | c =>
- if isOctal c then
- parseOctalSeq stream c
- else
- raiseErr0 "unknown escape sequence"
- end
-
- datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm
-
- datatype seqParseMode = SpmChr | SpmStr
-
- fun seqBound SpmChr = #"'"
- | seqBound SpmStr = #"\""
-
- fun seqExnConv mode (TkError (v, msg)) =
- let
- val bound = if mode = SpmChr then "'" else "\""
- val msg =
- String.translate (fn c => if c = #"%" then bound else str c) msg
- in
- TkError (v, msg)
- end
- | seqExnConv _ _ = raise Unreachable
-
- fun unfinishedSeq SpmChr = "unfinished character constant"
- | unfinishedSeq SpmStr = "unfinished string literal"
-
- fun seqParser mode SeqInit (stream, _) (SOME c) =
- if seqBound mode = c then
- (SeqStart, NONE, stream)
- else
- raise Unreachable
- | seqParser mode SeqStart (stream, _) (SOME c) =
- if c <> seqBound mode then
- let
- val (c, stream) =
- if c <> #"\\" then (SOME c, stream) else eatEscSeq stream
- in
- if c = NONE then
- (SeqStart, NONE, stream)
- else
- (SeqValue (ord $ Option.valOf c), NONE, stream)
- end
- else if mode = SpmStr then
- (SeqTerm, SOME $ StringConst "", stream)
- else
- raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %")
- | seqParser mode (SeqValue v) (stream, startOff) (SOME c) =
- if seqBound mode = c then
- let
- fun term s v =
- if mode = SpmChr then
- CharConst (s, v)
- else
- StringConst $ String.extract (s, 1, SOME $ String.size s - 2)
- in
- (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream)
- end
- else if mode = SpmStr then
- let
- val (_, stream) =
- if c <> #"\\" then (SOME c, stream) else eatEscSeq stream
- in
- (SeqValue v, NONE, stream)
- end
- else
- raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value")
- | seqParser _ SeqTerm _ (SOME _) =
- raise Unreachable
- | seqParser mode state (_, _) NONE =
- raise case state of
- SeqInit => Unreachable
- | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
- | SeqValue _ =>
- seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
- | SeqTerm => Unreachable
-
- val charParser = seqParser SpmChr
- val strParser = seqParser SpmStr
-
- fun formCppDir (Id s) =
- let
- open String
- in
- case List.find
- (fn (_, repr) =>
- sub (repr, 0) = cppPrefix andalso
- extract (repr, 1, NONE) = s)
- tokenRepr
- of
- SOME (tk, _) => tk
- | NONE => raise ExpectedCppDir
- end
- | formCppDir kwElse = CppElse
- | formCppDir _ = raise ExpectedCppDir
-
- fun handleCppDir tk prevPos stream =
- let
- val (pos, tk') = tk
- in
- (prevPos, formCppDir tk') handle
- ExpectedCppDir =>
- raise TkErrorAug (Stream.pos2ppos pos stream,
- "expected preprocessor directive")
- end
-
- fun unexpectedCharRaise stream c =
- let
- open Stream
- val pos = pos2ppos (getPosAfterCharRead stream) stream
- val repr =
- if isPrintable c then
- str c
- else
- "<" ^ Int.toString (ord c) ^ ">"
- in
- raise TkErrorAug (pos, "unexpected character " ^ repr)
- end
-
- fun skipComment stream pos =
- let
- fun skip prevIsAsterisk stream =
- let
- val (c, stream) =
- case Stream.getchar stream of
- (NONE, _) =>
- let
- val pos = Stream.pos2ppos pos stream
- in
- raise TkErrorAug (pos, "unfinished comment")
- end
- | (SOME c, stream) => (c, stream)
- in
- if prevIsAsterisk andalso c = #"/" then
- stream
- else
- skip (c = #"*") stream
- end
- in
- skip false stream
- end
-
- fun handleBackslash stream =
- let
- val (c, stream) = Stream.getchar stream
-
- val raiseErr = fn () =>
- let
- val pos = Stream.getPosAfterCharRead stream
- val pos = Stream.pos2ppos pos stream
- in
- raise TkErrorAug (pos, "expected \\n after backslash")
- end
- in
- case c of
- SOME c =>
- if c = #"\n" then
- stream
- else
- raiseErr ()
- | NONE => raiseErr ()
- end
-
- fun processSymbol stream =
- let
- val (T as (p, tk), stream) = fsmEat $ Stream.ungetc stream
- in
- case tk of
- CommentStart => getToken $ skipComment stream p
- | DoubleDot => (SOME (p, Dot), Stream.ungetc stream)
- | Hash =>
- if Stream.isFirstOnLine p stream then
- let
- val (tk, stream) = getToken stream
- in
- case tk of
- NONE =>
- raise TkErrorAug (Stream.pos2ppos p stream,
- "unfinished preprecessor directive")
- | SOME tk =>
- (SOME $ handleCppDir tk p stream, stream)
- end
- else
- (SOME T, stream)
- | _ => (SOME T, stream)
- end
-
- and getToken stream =
- let
- val (c, stream) = Stream.getchar stream
-
- fun @-> parser acc =
- (fn (tk, s) => (SOME tk, s)) $ parseGeneric stream parser acc
- in
- case c of
- NONE => (NONE, stream)
- | SOME c =>
- if c = #"\n" then
- (SOME (Stream.getPosAfterCharRead stream, NewLine), stream)
- else if Char.isSpace c then
- getToken stream
- else if isIdStart c then
- @-> idParser ()
- else if isDigit c then
- @-> numParser NpInit
- else if c = #"'" then
- @-> charParser SeqInit
- else if c = #"\"" then
- @-> strParser SeqInit
- else if isStartForFsm c then
- processSymbol stream
- else if c = #"\\" then
- getToken $ handleBackslash stream
- else
- unexpectedCharRaise stream c
- end
-
- fun tokenize stream =
- let
- fun aux acc stream =
- let
- val (tk, stream) = getToken stream
- in
- case tk of
- NONE => rev acc
- | SOME tk => aux (tk :: acc) stream
- end
- in
- aux [] stream
- end
-
- fun debugPrint tkl fname =
- let
- fun print' line _ ((_, NewLine) :: tks) =
- print' (line + 1) true tks
- | print' line firstOnLine ((_, tk) :: tks) = (
- if firstOnLine then (
- print "\n";
- printLn $ fname ^ ":" ^ Int.toString line;
- print "\t")
- else
- ();
- printToken tk;
- print " ";
- print' line false tks
- )
- | print' _ _ [] = ()
- in
- print' 1 true tkl;
- print "\n"
- end
-end