diff options
Diffstat (limited to 'tokenizer.sml')
-rw-r--r-- | tokenizer.sml | 234 |
1 files changed, 167 insertions, 67 deletions
diff --git a/tokenizer.sml b/tokenizer.sml index b40e03f..8b1ace4 100644 --- a/tokenizer.sml +++ b/tokenizer.sml @@ -100,6 +100,8 @@ structure Tokenizer:> TOKENIZER = struct CppElse | CppElif | CppEndif | + CppWarning | + CppError | CppPragma val kwPrefix = #"@" @@ -219,6 +221,8 @@ structure Tokenizer:> TOKENIZER = struct (CppElse, %"else"), (CppElif, %"elif"), (CppEndif, %"endif"), + (CppWarning, %"warning"), + (CppError, %"error"), (CppPragma, %"pragma") ] end @@ -364,7 +368,6 @@ structure Tokenizer:> TOKENIZER = struct fun fsmEat stream = let open Array - val stream = Stream.ungetc stream val pos = Stream.getPos stream fun get curState stream = @@ -391,7 +394,7 @@ structure Tokenizer:> TOKENIZER = struct fun tkError2aug stream (dx, msg) = let val (id, pos) = Stream.getPosAfterCharRead stream - val pos = Stream.getPposFromPos (id, pos + dx) stream + val pos = Stream.pos2ppos (id, pos + dx) stream in TkErrorAug (pos, msg) end @@ -408,14 +411,14 @@ structure Tokenizer:> TOKENIZER = struct TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg) | TkError (TkiStart, msg) => let - val startPos = Stream.getPposFromPos P stream + val startPos = Stream.pos2ppos P stream in raise TkErrorAug (startPos, msg) end | TkError (TkiEOF, msg) => let open Stream - val pos = pposWithoutCol $ getPposFromPos P stream + val pos = pposWithoutCol $ pos2ppos P stream in raise TkErrorAug (pos, msg) end @@ -467,22 +470,6 @@ structure Tokenizer:> TOKENIZER = struct finalize (Stream.ungetc stream) end - fun formCppDir (Id s) = - let - open String - in - case List.find - (fn (_, repr) => - sub (repr, 0) = cppPrefix andalso - extract (repr, 1, NONE) = s) - tokenRepr - of - SOME (tk, _) => tk - | NONE => raise ExpectedCppDir - end - | formCppDir kwElse = CppElse - | formCppDir _ = raise ExpectedCppDir - fun numParser () (stream, startOff) c = let fun finalize stream = @@ -497,20 +484,101 @@ structure Tokenizer:> TOKENIZER = struct finalize (Stream.ungetc stream) end + fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8" + + fun chrIntVal c = + if Char.isDigit c then + ord c - ord #"0" + else if ord c >= ord #"a" andalso ord c <= ord #"z" then + ord c - ord #"a" + 10 + else if ord c >= ord #"A" andalso ord c <= ord #"Z" then + ord c - ord #"A" + 10 + else + raise Unreachable + + fun parseOctalSeq stream c = + let + + fun follow stream acc count = + if count = 3 then + (SOME $ chr acc, stream) + else + let + val (c, stream) = Stream.getchar stream + in + case c of + NONE => (SOME $ chr acc, stream) + | SOME c => + if isOctal c then + follow stream (acc * 8 + chrIntVal c) (count + 1) + else + (SOME $ chr acc, Stream.ungetc stream) + end + in + follow stream (chrIntVal c) 1 + end + + fun parseHexSeq stream = + let + fun follow stream acc count = + let + val (c, stream) = Stream.getchar stream + + val noHex = TkError (TkiDx 0, "\\x without hex digits") + in + case c of + NONE => + if count = 0 then + raise noHex + else + (SOME $ chr acc, stream) + | SOME c => + if Char.isHexDigit c then + if count = 2 then + raise TkError (TkiDx 2, "hex sequence out of range") + else + follow stream (acc * 16 + chrIntVal c) (count + 1) + else + if count = 0 then + raise noHex + else + (SOME $ chr acc, Stream.ungetc stream) + end + in + follow stream 0 0 + end + fun eatEscSeq stream = let + fun raiseErr0 msg = raise TkError (TkiDx 0, msg) + val (c, stream) = Stream.getchar stream val c = case c of - NONE => raise TkError (TkiDx 0, "unfinished escape sequence") + NONE => raiseErr0 "unfinished escape sequence" | SOME c => c + + fun & c = (SOME c, stream) in - (case c of - #"\\" => #"\\" - | #"t" => #"\t" - | #"n" => #"\n" - | c => c, - stream) + case c of + #"'" => & #"'" + | #"\"" => & #"\"" + | #"?" => & #"?" + | #"\\" => & #"\\" + | #"a" => & #"\a" + | #"b" => & #"\b" + | #"f" => & #"\f" + | #"n" => & #"\n" + | #"r" => & #"\r" + | #"t" => & #"\t" + | #"v" => & #"\v" + | #"\n" => (NONE, stream) + | #"x" => parseHexSeq stream + | c => + if isOctal c then + parseOctalSeq stream c + else + raiseErr0 "unknown escape sequence" end fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2) @@ -544,9 +612,12 @@ structure Tokenizer:> TOKENIZER = struct if c <> seqBound mode then let val (c, stream) = - if c <> #"\\" then (c, stream) else eatEscSeq stream + if c <> #"\\" then (SOME c, stream) else eatEscSeq stream in - (SeqValue (ord c), NONE, stream) + if c = NONE then + (SeqStart, NONE, stream) + else + (SeqValue (ord $ Option.valOf c), NONE, stream) end else if mode = SpmStr then (SeqTerm, SOME $ StringConst "", stream) @@ -566,7 +637,7 @@ structure Tokenizer:> TOKENIZER = struct else if mode = SpmStr then let val (_, stream) = - if c <> #"\\" then (c, stream) else eatEscSeq stream + if c <> #"\\" then (SOME c, stream) else eatEscSeq stream in (SeqValue v, NONE, stream) end @@ -585,30 +656,36 @@ structure Tokenizer:> TOKENIZER = struct val charParser = seqParser SpmChr val strParser = seqParser SpmStr - fun postprocessCppDir tk tkl stream = + fun formCppDir (Id s) = let - val isCppDir = - (fn Hash => true | _ => false) (#2 $ hd tkl) - andalso Stream.isFirstOnLine (#1 $ hd tkl) stream - handle Empty => false - val (pos, tk') = tk - - fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl) + open String in - if isCppDir then - (conv $ formCppDir tk', stream) handle - ExpectedCppDir => - raise TkErrorAug (Stream.getPposFromPos pos stream, - "expected preprocessor directive") + case List.find + (fn (_, repr) => + sub (repr, 0) = cppPrefix andalso + extract (repr, 1, NONE) = s) + tokenRepr + of + SOME (tk, _) => tk + | NONE => raise ExpectedCppDir + end + | formCppDir kwElse = CppElse + | formCppDir _ = raise ExpectedCppDir - else - (tk :: tkl, stream) + fun handleCppDir tk prevPos stream = + let + val (pos, tk') = tk + in + (prevPos, formCppDir tk') handle + ExpectedCppDir => + raise TkErrorAug (Stream.pos2ppos pos stream, + "expected preprocessor directive") end fun unexpectedCharRaise stream c = let - val (id, pos) = Stream.getPosAfterCharRead stream - val pos = Stream.getPposFromPos (id, pos) stream + open Stream + val pos = pos2ppos (getPosAfterCharRead stream) stream val repr = if isPrintable c then str c @@ -626,7 +703,7 @@ structure Tokenizer:> TOKENIZER = struct case Stream.getchar stream of (NONE, _) => let - val pos = Stream.getPposFromPos pos stream + val pos = Stream.pos2ppos pos stream in raise TkErrorAug (pos, "unfinished comment") end @@ -648,7 +725,7 @@ structure Tokenizer:> TOKENIZER = struct val raiseErr = fn () => let val pos = Stream.getPosAfterCharRead stream - val pos = Stream.getPposFromPos pos stream + val pos = Stream.pos2ppos pos stream in raise TkErrorAug (pos, "expected \\n after backslash") end @@ -662,35 +739,44 @@ structure Tokenizer:> TOKENIZER = struct | NONE => raiseErr () end - fun processSymbol stream tkl = + fun processSymbol stream = let - val (T as (p as (fid, off), tk), stream) = fsmEat stream + val (T as (p, tk), stream) = fsmEat $ Stream.ungetc stream in case tk of - CommentStart => tokenize (skipComment stream p) tkl - | DoubleDot => tokenize stream (((fid, off + 1), Dot) :: (p, Dot) :: tkl) - | _ => tokenize stream (T :: tkl) + CommentStart => getToken $ skipComment stream p + | DoubleDot => (SOME (p, Dot), Stream.ungetc stream) + | Hash => + if Stream.isFirstOnLine p stream then + let + val (tk, stream) = getToken stream + in + case tk of + NONE => + raise TkErrorAug (Stream.pos2ppos p stream, + "unfinished preprecessor directive") + | SOME tk => + (SOME $ handleCppDir tk p stream, stream) + end + else + (SOME T, stream) + | _ => (SOME T, stream) end - and tokenize stream tkl = + and getToken stream = let val (c, stream) = Stream.getchar stream - fun cont (tk, stream) = tokenize stream (tk :: tkl) - fun @-> parser acc = cont $ parseGeneric stream parser acc + fun @-> parser acc = + (fn (tk, s) => (SOME tk, s)) $ parseGeneric stream parser acc in case c of - NONE => (rev tkl, Stream.recycle stream) + NONE => (NONE, stream) | SOME c => if Char.isSpace c then - tokenize stream tkl + getToken stream else if isIdStart c then - let - val (tk, stream) = parseGeneric stream idParser () - val (tkl, stream) = postprocessCppDir tk tkl stream - in - tokenize stream tkl - end + @-> idParser () else if Char.isDigit c then @-> numParser () else if c = #"'" then @@ -698,11 +784,25 @@ structure Tokenizer:> TOKENIZER = struct else if c = #"\"" then @-> strParser SeqInit else if isStartForFsm c then - processSymbol stream tkl + processSymbol stream else if c = #"\\" then - tokenize (handleBackslash stream) tkl + getToken $ handleBackslash stream else unexpectedCharRaise stream c end + fun tokenize stream = + let + fun aux acc stream = + let + val (tk, stream) = getToken stream + in + case tk of + NONE => rev acc + | SOME tk => aux (tk :: acc) stream + end + in + aux [] stream + end + end |