summaryrefslogtreecommitdiff
path: root/tokenizer.sml
diff options
context:
space:
mode:
Diffstat (limited to 'tokenizer.sml')
-rw-r--r--tokenizer.sml787
1 files changed, 787 insertions, 0 deletions
diff --git a/tokenizer.sml b/tokenizer.sml
new file mode 100644
index 0000000..53bb396
--- /dev/null
+++ b/tokenizer.sml
@@ -0,0 +1,787 @@
+structure Tokenizer:> TOKENIZER = struct
+ datatype includeArg = IARel of string | IAFromRef of string
+
+ datatype token =
+ Invalid |
+ Number of string |
+ Id of string |
+ CharConst of string * int |
+ StringConst of string |
+
+ kwBreak |
+ kwCase |
+ kwChar |
+ kwConst |
+ kwContinue |
+ kwDefault |
+ kwDouble |
+ kwElse |
+ kwEnum |
+ kwExtern |
+ kwFloat |
+ kwFor |
+ kwGoto |
+ kwInt |
+ kwLong |
+ kwRegister |
+ kwReturn |
+ kwShort |
+ kwSigned |
+ kwSizeof |
+ kwStruct |
+ kwSwitch |
+ kwTypedef |
+ kwUnion |
+ kwUnsigned |
+ kwVoid |
+ kwVolatile |
+
+ LParen |
+ RParen |
+ LBracket |
+ RBracket |
+ LBrace |
+ RBrace |
+
+ QuestionMark |
+ Colon |
+ Coma |
+ Semicolon |
+
+ Arrow |
+ Plus |
+ DoublePlus|
+ Minus |
+ DoubleMinus |
+ Ampersand |
+ Asterisk |
+ Slash |
+ Tilde |
+ ExclMark |
+ Percent |
+ DoubleGreater |
+ DoubleLess |
+ Greater |
+ Less |
+ EqualSign |
+ LessEqualSign |
+ GreaterEqualSign |
+ DoubleEqualSign |
+ ExclMarkEqualSign |
+ Cap |
+ VerticalBar |
+ DoubleAmpersand |
+ DoubleVerticalBar |
+
+ AsteriskEqualSign |
+ SlashEqualSign |
+ PercentEqualSign |
+ PlusEqualSign |
+ MinusEqualSign |
+ DoubleLessEqualSign |
+ DoubleGreaterEqualSign |
+ AmpersandEqualSign |
+ CapEqualSign |
+ VerticalBarEqualSign |
+
+ Hash |
+ DoubleHash |
+
+ Dot |
+ DoubleDot |
+ TripleDot |
+
+ CommentStart |
+
+ CppInclude of includeArg |
+ CppDefine |
+ CppUndef |
+ CppIf |
+ CppIfdef |
+ CppIfndef |
+ CppElse |
+ CppElif |
+ CppEndif |
+ CppPragma
+
+ val kwPrefix = #"@"
+ val cppPrefix = #"$"
+
+ type fullToken = Stream.pos * token
+
+ datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart
+
+ exception TkError of tkErrorAuxInfo * string
+ exception TkErrorAug of Stream.convPos * string
+
+ exception ExpectedCppDir (* handled in postprocess *)
+
+ exception FsmTableIsTooSmall
+
+ (* Unreachable (should be) *)
+ exception Unreachable
+ exception TokenWithoutRepr
+
+ val tokenRepr =
+ let
+ fun & repr = str kwPrefix ^ repr
+ fun % repr = str cppPrefix ^ repr
+ in
+ [
+ (kwBreak, &"break"),
+ (kwCase, &"case"),
+ (kwChar, &"char"),
+ (kwConst, &"const"),
+ (kwContinue, &"continue"),
+ (kwDefault, &"default"),
+ (kwDouble, &"double"),
+ (kwElse, &"else"),
+ (kwEnum, &"enum"),
+ (kwExtern, &"extern"),
+ (kwFloat, &"float"),
+ (kwFor, &"for"),
+ (kwGoto, &"goto"),
+ (kwInt, &"int"),
+ (kwLong, &"long"),
+ (kwRegister, &"register"),
+ (kwReturn, &"return"),
+ (kwShort, &"short"),
+ (kwSigned, &"signed"),
+ (kwSizeof, &"sizeof"),
+ (kwStruct, &"struct"),
+ (kwSwitch, &"switch"),
+ (kwTypedef, &"typedef"),
+ (kwUnion, &"union"),
+ (kwUnsigned, &"unsigned"),
+ (kwVoid, &"void"),
+ (kwVolatile, &"volatile"),
+
+ (LParen, "("),
+ (RParen, ")"),
+ (LBracket, "["),
+ (RBracket, "]"),
+ (LBrace, "{"),
+ (RBrace, "}"),
+
+ (QuestionMark, "?"),
+ (Colon, ":"),
+ (Coma, ","),
+ (Semicolon, ";"),
+
+ (Arrow, "->"),
+ (Plus, "+"),
+ (DoublePlus, "++"),
+ (Minus, "-"),
+ (DoubleMinus, "--"),
+ (Ampersand, "&"),
+ (Asterisk, "*"),
+ (Slash, "/"),
+ (Tilde, "~"),
+ (ExclMark, "!"),
+ (Percent, "%"),
+ (DoubleLess, "<<"),
+ (DoubleGreater, ">>"),
+ (Less, "<"),
+ (Greater, ">"),
+ (EqualSign, "="),
+ (LessEqualSign, "<="),
+ (GreaterEqualSign, ">="),
+ (DoubleEqualSign, "=="),
+ (ExclMarkEqualSign, "!="),
+ (Cap, "^"),
+ (VerticalBar, "|"),
+ (DoubleAmpersand, "&&"),
+ (DoubleVerticalBar, "||"),
+
+ (AsteriskEqualSign, "*="),
+ (SlashEqualSign, "/="),
+ (PercentEqualSign, "%="),
+ (PlusEqualSign, "+="),
+ (MinusEqualSign, "-="),
+ (DoubleLessEqualSign, "<<="),
+ (DoubleGreaterEqualSign, ">>="),
+ (AmpersandEqualSign, "&="),
+ (CapEqualSign, "^="),
+ (VerticalBarEqualSign, "|="),
+
+ (Hash, "#"),
+ (DoubleHash, "##"),
+
+ (Dot, "."),
+ (DoubleDot, ".."),
+ (TripleDot, "..."),
+
+ (CommentStart, "/*"),
+
+ (CppDefine, %"define"),
+ (CppUndef, %"undef"),
+ (CppIf, %"if"),
+ (CppIfdef, %"ifdef"),
+ (CppIfndef, %"ifndef"),
+ (CppElse, %"else"),
+ (CppElif, %"elif"),
+ (CppEndif, %"endif"),
+ (CppPragma, %"pragma")
+ ]
+ end
+
+ val printToken = fn
+ Number s => printLn $ "Num: " ^ s
+ | Id s => printLn $ "Id: " ^ s
+ | CharConst (repr, _) => printLn repr
+ | CppInclude arg =>
+ let
+ val (start, end', arg) =
+ case arg of
+ IARel v => ("\"", "\"", v)
+ | IAFromRef v => ("<", ">", v)
+ in
+ printLn $ (str cppPrefix) ^ "include " ^ start ^ arg ^ end'
+ end
+ | StringConst s =>
+ printLn $ "\"" ^ s ^ "\""
+ | v =>
+ case List.find (fn (x, _) => x = v) tokenRepr of
+ SOME (_, repr) => printLn repr
+ | NONE => raise TokenWithoutRepr
+
+ fun isIdStart c = Char.isAlpha c orelse c = #"_"
+ fun isIdBody c = Char.isAlphaNum c orelse c = #"_"
+
+ fun isPrintable c = Char.isPrint c andalso c <> #" "
+
+ (* FSM for parsing symbols *)
+
+ val maxStates = 51
+
+ fun fsmInsert (nextState, buf) curState tk (c :: cs) =
+ let
+ open Array
+
+ val (_, row) = sub (buf, curState)
+ val potNextState = sub (row, ord c)
+ in
+ if potNextState <> ~1 then
+ fsmInsert (nextState, buf) potNextState tk cs
+ else (
+ update (row, ord c, !nextState);
+ nextState := !nextState + 1;
+ fsmInsert (nextState, buf) (!nextState - 1) tk cs
+ )
+ end
+ | fsmInsert (_, buf) curState tk [] =
+ let
+ open Array
+ val (_, row) = sub (buf, curState)
+ in
+ update (buf, curState, (tk, row))
+ end
+
+ fun isStartForFsmGen () =
+ let
+ open Array
+
+ val lookupTable = array (128, false)
+
+ fun firstChr s = ord $ String.sub (s, 0)
+ val () = List.app
+ (fn (_, repr) => update (lookupTable, firstChr repr, true))
+ $ List.filter
+ (fn (_, repr) =>
+ let
+ val c = String.sub (repr, 0)
+ in
+ c <> kwPrefix andalso c <> cppPrefix
+ end)
+ tokenRepr
+
+ in
+ fn c => sub (lookupTable, ord c)
+ end
+
+ val isStartForFsm = isStartForFsmGen ()
+
+ fun fsmTableCreate () =
+ let
+ open Array
+
+ val T as (nextState, buf) =
+ (ref 1, array (maxStates, (Invalid, array (128, ~1))))
+ val r = ref 1
+
+ fun filterNeeded [] acc = acc
+ | filterNeeded ((T as (_, repr)) :: tks) acc =
+ filterNeeded tks
+ (if isStartForFsm $ String.sub (repr, 0) then
+ T :: acc
+ else
+ acc)
+
+ val tokenRepr = filterNeeded tokenRepr []
+
+ fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle
+ Subscript => raise FsmTableIsTooSmall
+ in
+ while !r < length buf do (
+ update (buf, !r, (Invalid, array(128, ~1)));
+ r := !r + 1
+ );
+
+ List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr;
+ if !nextState <> maxStates then
+ printLn $ "note: Fsm table size can be smaller: "
+ ^ Int.toString (!nextState) ^ " is enough"
+ else ();
+ T
+ end
+
+ (* Unused right now
+ fun printTable (nextState, buf) =
+ let
+ fun printRow i row =
+ if i = length row then
+ print "\n"
+ else
+ let
+ val state = sub (row, i)
+ in
+ if state = ~1 then
+ ()
+ else
+ print ((str (chr i)) ^ ": " ^ (Int.toString state) ^ ", ");
+ printRow (i + 1) row
+ end
+
+ fun print' rowNum buf =
+ if rowNum = !nextState then
+ ()
+ else
+ let
+ val (tk, row) = sub (buf, rowNum)
+ in
+ print ((token2string tk) ^ ": ");
+ printRow 0 row;
+ print' (rowNum + 1) buf
+ end
+ in
+ print ("NextState: " ^ Int.toString (!nextState) ^ "\n");
+ print' 0 buf;
+ print "\n"
+ end
+ *)
+
+ val fsmTable = lazy fsmTableCreate
+
+ fun fsmEat stream =
+ let
+ open Array
+ val stream = Stream.ungetc stream
+ val pos = Stream.getPos stream
+
+ fun get curState stream =
+ let
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream
+ handle
+ _ => (NONE, stream)
+ in
+ case c of
+ NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream)
+ | SOME c =>
+ let
+ val (tk, row) = sub (#2 $ fsmTable (), curState)
+ val nextState = sub (row, ord c)
+ in
+ if nextState = ~1 then
+ (tk, Stream.ungetc stream)
+ else
+ get nextState stream
+ end
+ end
+ in
+ (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream
+ end
+
+ fun tkError2aug stream (dx, msg) =
+ let
+ val (id, pos) = Stream.getPosAfterCharRead stream
+ val pos = Stream.getPposFromPos (id, pos + dx) stream
+ in
+ TkErrorAug (pos, msg)
+ end
+
+ fun parseGeneric stream parser acc =
+ let
+ val stream = Stream.ungetc stream
+ val P as (_, startOff) = Stream.getPos stream
+
+ fun parse' stream acc = let
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream handle
+ _ => (NONE, stream)
+
+ val (acc, tk, stream) = parser acc (stream, startOff) c handle
+ TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg)
+ | TkError (TkiStart, msg) =>
+ let
+ val startPos = Stream.getPposFromPos P stream
+ in
+ raise TkErrorAug (startPos, msg)
+ end
+ | TkError (TkiEOF, msg) =>
+ let
+ val (file, line, _) = Stream.getPposFromPos P stream
+ in
+ raise TkErrorAug ((file, line, NONE), msg)
+ end
+ in
+ case tk of
+ NONE => parse' stream acc
+ | _ => (valOf tk, stream)
+ end
+
+ val (tk, stream) = parse' stream acc
+ in
+ ((P, tk): fullToken, stream)
+ end
+
+ fun finishSeqRead startOff stream =
+ let
+ val (_, endOff) = Stream.getPos stream
+ val s = Stream.getSubstr startOff endOff stream
+ in
+ s
+ end
+
+ fun findKeyword s =
+ case List.find
+ (fn (_, repr) =>
+ String.sub (repr, 0) = kwPrefix andalso
+ String.extract (repr, 1, NONE) = s)
+ tokenRepr
+ of
+ SOME (tk, _) => tk
+ | NONE => Id s
+
+ fun idParser () (stream, startOff) c =
+ let
+ fun finalize stream =
+ let
+ val s = finishSeqRead startOff stream
+ val tk = findKeyword s
+ in
+ ((), SOME tk, stream)
+ end
+ in
+ case c of
+ NONE => finalize stream
+ | SOME c =>
+ if isIdBody c then
+ ((), NONE, stream)
+ else
+ finalize (Stream.ungetc stream)
+ end
+
+ fun formCppDir (Id s) =
+ let
+ open String
+ in
+ case List.find
+ (fn (_, repr) =>
+ sub (repr, 0) = cppPrefix andalso
+ extract (repr, 1, NONE) = s)
+ tokenRepr
+ of
+ SOME (tk, _) => tk
+ | NONE => raise ExpectedCppDir
+ end
+ | formCppDir kwElse = CppElse
+ | formCppDir _ = raise ExpectedCppDir
+
+ fun numParser () (stream, startOff) c =
+ let
+ fun finalize stream =
+ ((), SOME $ Number (finishSeqRead startOff stream), stream)
+ in
+ case c of
+ NONE => finalize stream
+ | SOME c =>
+ if Char.isDigit c then
+ ((), NONE, stream)
+ else
+ finalize (Stream.ungetc stream)
+ end
+
+ fun eatEscSeq stream =
+ let
+ val (c, stream) = Stream.getchar stream handle
+ _ => raise TkError (TkiDx 0, "unfinished escape sequence")
+ in
+ (case c of
+ #"\\" => #"\\"
+ | #"t" => #"\t"
+ | #"n" => #"\n"
+ | c => c,
+ stream)
+ end
+
+ fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2)
+
+ datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm
+
+ datatype seqParseMode = SpmChr | SpmStr
+
+ fun seqBound SpmChr = #"'"
+ | seqBound SpmStr = #"\""
+
+ fun seqExnConv mode (TkError (v, msg)) =
+ let
+ val bound = if mode = SpmChr then "'" else "\""
+ val msg =
+ String.translate (fn c => if c = #"%" then bound else str c) msg
+ in
+ TkError (v, msg)
+ end
+ | seqExnConv _ _ = raise Unreachable
+
+ fun unfinishedSeq SpmChr = "unfinished character constant"
+ | unfinishedSeq SpmStr = "unfinished string literal"
+
+ fun seqParser mode SeqInit (stream, _) (SOME c) =
+ if seqBound mode = c then
+ (SeqStart, NONE, stream)
+ else
+ raise Unreachable
+ | seqParser mode SeqStart (stream, _) (SOME c) =
+ if c <> seqBound mode then
+ let
+ val (c, stream) =
+ if c <> #"\\" then (c, stream) else eatEscSeq stream
+ in
+ (SeqValue (ord c), NONE, stream)
+ end
+ else if mode = SpmStr then
+ (SeqTerm, SOME $ StringConst "", stream)
+ else
+ raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %")
+ | seqParser mode (SeqValue v) (stream, startOff) (SOME c) =
+ if seqBound mode = c then
+ let
+ fun term s v =
+ if mode = SpmChr then
+ CharConst (s, v)
+ else
+ StringConst $ stringCut s
+ in
+ (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream)
+ end
+ else if mode = SpmStr then
+ let
+ val (_, stream) =
+ if c <> #"\\" then (c, stream) else eatEscSeq stream
+ in
+ (SeqValue v, NONE, stream)
+ end
+ else
+ raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value")
+ | seqParser _ SeqTerm _ (SOME _) =
+ raise Unreachable
+ | seqParser mode state (_, _) NONE =
+ raise case state of
+ SeqInit => Unreachable
+ | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
+ | SeqValue _ =>
+ seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode)
+ | SeqTerm => Unreachable
+
+ val charParser = seqParser SpmChr
+ val strParser = seqParser SpmStr
+
+ fun readIncludeArg stream =
+ let
+ open String
+
+ fun triml s idx =
+ if idx = size s then
+ ""
+ else if Char.isSpace $ sub (s, idx) then
+ triml s (idx + 1)
+ else
+ extract (s, idx, NONE)
+
+ fun trimr s idx =
+ if idx = 0 then
+ ""
+ else if Char.isSpace $ sub (s, idx) then
+ trimr s (idx - 1)
+ else
+ extract (s, 0, SOME $ idx + 1)
+
+ fun trim s = triml (trimr s (size s - 1)) 0
+
+ fun getLinePos () =
+ let
+ val (fname, line, _) = Stream.getPposFromPos (Stream.getPos stream) stream
+ in
+ (fname, line, NONE)
+ end
+
+ fun determineType s =
+ let
+ fun --> msg = raise TkErrorAug (getLinePos (), msg)
+ fun isLast c = sub (s, size s - 1) = c
+ in
+ if s = "" then
+ --> "#include argument is empty"
+ else
+ case sub (s, 0) of
+ #"<" =>
+ if isLast #">" then
+ IAFromRef $ stringCut s
+ else
+ --> "expected > at #include argument end"
+ | #"\"" =>
+ if isLast #"\"" then
+ IARel $ stringCut s
+ else
+ --> "expected \" at #include argument end"
+ | _ => --> "#include argument should start with \" or <"
+ end
+
+ val (arg, stream) = Stream.readline stream handle
+ Stream.LineWithoutNl =>
+ raise TkErrorAug (getLinePos (),
+ "#include line does not end with \\n")
+ in
+ (determineType $ trim arg, stream)
+ end
+
+ fun postprocessCppDir tk tkl stream =
+ let
+ val isCppDir =
+ (fn Hash => true | _ => false) (#2 $ hd tkl)
+ andalso Stream.isFirstOnLine (#1 $ hd tkl) stream
+ handle Empty => false
+ val (pos, tk') = tk
+
+ fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl)
+ in
+ if isCppDir andalso tk' = Id "include" then
+ let
+ val (arg, stream) = readIncludeArg stream
+ in
+ (conv $ CppInclude arg, stream)
+ end
+ else if isCppDir then
+ (conv $ formCppDir tk', stream) handle
+ ExpectedCppDir =>
+ raise TkErrorAug (Stream.getPposFromPos pos stream,
+ "expected preprocessor directive")
+
+ else
+ (tk :: tkl, stream)
+ end
+
+ fun unexpectedCharRaise stream c =
+ let
+ val (id, pos) = Stream.getPosAfterCharRead stream
+ val pos = Stream.getPposFromPos (id, pos) stream
+ val repr =
+ if isPrintable c then
+ str c
+ else
+ "<" ^ Int.toString (ord c) ^ ">"
+ in
+ raise TkErrorAug (pos, "unexpected character " ^ repr)
+ end
+
+ fun skipComment stream pos =
+ let
+ fun skip prevIsAsterisk stream =
+ let
+ val (c, stream) = Stream.getchar stream
+ in
+ if prevIsAsterisk andalso c = #"/" then
+ stream
+ else
+ skip (c = #"*") stream
+ end
+ in
+ skip false stream handle
+ Stream.EndOfFile =>
+ let
+ val pos = Stream.getPposFromPos pos stream
+ in
+ raise TkErrorAug (pos, "unfinished comment")
+ end
+ end
+
+ fun handleBackslash stream =
+ let
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ Stream.getchar stream handle
+ _ => (NONE, stream)
+
+ val raiseErr = fn () =>
+ let
+ val pos = Stream.getPosAfterCharRead stream
+ val pos = Stream.getPposFromPos pos stream
+ in
+ raise TkErrorAug (pos, "expected \\n after backslash")
+ end
+ in
+ case c of
+ SOME c =>
+ if c = #"\n" then
+ stream
+ else
+ raiseErr ()
+ | NONE => raiseErr ()
+ end
+
+ fun processSymbol stream tkl =
+ let
+ val (T as (p as (fid, off), tk), stream) = fsmEat stream
+ in
+ case tk of
+ CommentStart => tokenize (skipComment stream p) tkl
+ | DoubleDot => tokenize stream (((fid, off + 1), Dot) :: (p, Dot) :: tkl)
+ | _ => tokenize stream (T :: tkl)
+ end
+
+ and tokenize stream tkl =
+ let
+ fun getcharSkipEof stream = Stream.getchar stream handle
+ Stream.EndOfFile => getcharSkipEof (Stream.advanceToNewFile stream)
+
+ val (c, stream) = (fn (c, s) => (SOME c, s)) $ getcharSkipEof stream
+ handle
+ Stream.EndOfStream => (NONE, stream)
+
+ fun cont (tk, stream) = tokenize stream (tk :: tkl)
+ fun @-> parser acc = cont $ parseGeneric stream parser acc
+ in
+ case c of
+ NONE => (rev tkl, Stream.extractFilesInfo stream)
+ | SOME c =>
+ if Char.isSpace c then
+ tokenize stream tkl
+ else if isIdStart c then
+ let
+ val (tk, stream) = parseGeneric stream idParser ()
+ val (tkl, stream) = postprocessCppDir tk tkl stream
+ in
+ tokenize stream tkl
+ end
+ else if Char.isDigit c then
+ @-> numParser ()
+ else if c = #"'" then
+ @-> charParser SeqInit
+ else if c = #"\"" then
+ @-> strParser SeqInit
+ else if isStartForFsm c then
+ processSymbol stream tkl
+ else if c = #"\\" then
+ tokenize (handleBackslash stream) tkl
+ else
+ unexpectedCharRaise stream c
+ end
+
+end