diff options
author | Vladimir Azarov <avm@intermediate-node.net> | 2025-05-26 21:06:51 +0200 |
---|---|---|
committer | Vladimir Azarov <avm@intermediate-node.net> | 2025-05-26 21:06:51 +0200 |
commit | 3a32398248e5593b1b536c837478cab276f7aebf (patch) | |
tree | 55ff4004a06278c84908668c26d39d0fc3098a08 | |
parent | 6f3fa80b37ca5f8d992f5d6f66aee77ead303bf4 (diff) |
Simpler tokenizer
-rw-r--r-- | parser.fun | 6 | ||||
-rw-r--r-- | stream.sig | 3 | ||||
-rw-r--r-- | stream.sml | 8 | ||||
-rw-r--r-- | tokenizer.fun | 592 | ||||
-rw-r--r-- | tokenizer.sig | 14 |
5 files changed, 214 insertions, 409 deletions
@@ -92,7 +92,7 @@ functor Parser(P: PPC): PARSER = struct datatype expr = Enum | Eid of int | - Estrlit of string | + Estrlit of int | EmemberByV of int * exprAug | EmemberByP of int * exprAug | EfuncCall of exprAug * exprAug list | @@ -254,7 +254,7 @@ functor Parser(P: PPC): PARSER = struct case e of Eid id => Printf out P.?id P % | Enum => Printf out `"num" P % - | Estrlit s => Printf out `"\"" `s `"\"" P % + | Estrlit s => Printf out P.?s P % | EmemberByV pair => member pair "." | EmemberByP pair => member pair "->" | EfuncCall (func, args) => ( @@ -399,7 +399,7 @@ functor Parser(P: PPC): PARSER = struct in case tk of Tk (T.Id id) => wrap $ Eid id - | Tk (T.StringConst s) => wrap $ Estrlit s + | Tk (T.Strlit id) => wrap $ Estrlit id | Tk (T.CharConst _) => raise Unimplemented | Tk (T.Num _) => wrap Enum | TkParens list => @@ -10,7 +10,7 @@ signature STREAM = sig val Ppos: (pos, 'a, 'b, 'c) a1printer - val getchar: t -> char option * t + val getchar: t -> char * t (* Will throw UngetcError, if applied at the beginning of the stream. * Can be always avoided, so is not provided in sig file *) @@ -30,5 +30,6 @@ signature STREAM = sig val isFirstOnLine: t -> fileOffset -> bool val getPosRaw: fileOffset -> t -> pos * t val getPos: t -> pos * t + val getPosDisc: t -> pos val getPosAfterChar: t -> pos * t end @@ -37,8 +37,8 @@ structure Stream :> STREAM = struct end z fun getchar (S as { contents, off, ... }: t) = - (SOME $ String.sub (contents, off), updateStream S s#off (off + 1) %) - handle Subscript => (NONE, S) + (String.sub (contents, off), updateStream S s#off (off + 1) %) + handle Subscript => (chr 0, S) fun ungetc ({ off = 0, ... }: t) = raise UngetcError @@ -100,7 +100,7 @@ structure Stream :> STREAM = struct (line, col) else calcPos (curOff + 1) - (if String.sub (contents,curOff) = #"\n" then (line + 1, 1) + (if String.sub (contents, curOff) = #"\n" then (line + 1, 1) else (line, col + 1)) val (line, col) = calcPos prevOff (line, col) in @@ -111,6 +111,8 @@ structure Stream :> STREAM = struct fun getPos (S as { off, ... }: t) = getPosRaw off S + fun getPosDisc s = #1 $ getPos s + fun EOFpos (S as { contents, ... }: t) = getPosRaw (String.size contents) S diff --git a/tokenizer.fun b/tokenizer.fun index 93a7099..d6695d0 100644 --- a/tokenizer.fun +++ b/tokenizer.fun @@ -5,25 +5,17 @@ struct structure ST = ST structure S = S - datatype intType = ItDec | ItOct | ItHex - datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL - datatype floatSfx = FsNone | FsF | FsL - - datatype numConst = - IntConst of intType * string * intSfx | - FloatConst of string * floatSfx - datatype token = Invalid | EOS | NewLine | MacroEnd of int | - Num of numConst | + Num of string | Id of int | - CharConst of string * int | - StringConst of string | + CharConst of int * int | + Strlit of int | kwBreak | kwCase | @@ -127,15 +119,11 @@ struct PpcError | PpcPragma - datatype tkErrorAuxInfo = TkiEOF | TkiDx of int | TkiStart - - exception TkError of tkErrorAuxInfo * string + exception TkError of int * string fun error pos msg = (printf `"\n" S.Ppos pos `": " `msg `"\n" %; exit 1) - (* Unreachable (should be) *) exception TokenWithoutRepr - exception SuffixWithoutRepr val kwPrefix = #"`" val ppcPrefix = #"$" @@ -255,21 +243,6 @@ struct ] end - val intSuffixRepr = [ - (IsNone, ""), - (IsU, "u"), - (IsL, "l"), - (IsUL, "ul"), - (IsLL, "ll"), - (IsULL, "ull") - ] - - val floatSuffixRepr = [ - (FsNone, ""), - (FsF, "f"), - (FsL, "l") - ] - fun initSymtab () = let val symtab = ST.init () @@ -292,14 +265,6 @@ struct symtab end - fun getSfxRepr sfx buf onError = - case List.find (fn (sfx', _) => sfx' = sfx) buf of - NONE => onError () - | SOME (_, repr) => repr - - fun getSfxReprSimple sfx buf = - getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr) - fun printToken (out, symtab, tk) = let val ? = fn z => @@ -315,20 +280,9 @@ struct | NewLine => Printf out `"\\n" % | PpcInclude (dir, arg) => Printf out `"#include(" `dir `", " `arg `")" % - | Num (IntConst (it, str, sfx)) => - let - val intType = - case it of - ItDec => "" - | ItOct => "0" - | ItHex => "0x" - in - Printf out `intType `str `(getSfxReprSimple sfx intSuffixRepr) % - end - | Num (FloatConst (str, sfx)) => - Printf out `str `(getSfxReprSimple sfx floatSuffixRepr) % - | CharConst (repr, _) => Printf out `repr % - | StringConst s => Printf out `"\"" `s `"\"" % + | Num s => Printf out `s % + | CharConst (repr, _) => Printf out ?repr % + | Strlit id => Printf out ?id % | v => case List.find (fn (x, _) => x = v) tokenRepr of SOME (_, repr) => @@ -344,8 +298,7 @@ struct val Ptk = fn z => bind A2 printToken z - fun isIdStart c = Char.isAlpha c orelse c = #"_" - fun isIdBody c = Char.isAlphaNum c orelse c = #"_" + fun isNondigit c = Char.isAlpha c orelse c = #"_" fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8" val isDigit = Char.isDigit @@ -486,221 +439,25 @@ struct let val (c, stream) = S.getchar stream in - case c of - NONE => (#1 $ sub (#2 $ fsmTable (), curState), stream) - | SOME c => - let - val (tk, row) = sub (#2 $ fsmTable (), curState) - val nextState = sub (row, ord c) - in - if nextState = ~1 then - (tk, S.ungetc stream) - else - get nextState stream - end + if c = #"\000" then + (#1 $ sub (#2 $ fsmTable (), curState), stream) + else + let + val (tk, row) = sub (#2 $ fsmTable (), curState) + val nextState = sub (row, ord c) + in + if nextState = ~1 then + (tk, S.ungetc stream) + else + get nextState stream + end end val (tk, stream) = get 0 stream in (tk, pos, stream) end - fun errorDx stream dx msg = - let - val off = S.getOffset stream - 1 + dx - val (pos, _) = S.getPosRaw off stream - in - error pos msg - end - - fun parserWrapper stream parser acc = - let - val stream = S.ungetc stream - - val startOff = S.getOffset stream - val (pos, stream) = S.getPos stream - - fun parse' stream acc = - let - val (c, stream) = S.getchar stream - - val (acc, tk, stream) = parser acc (stream, startOff) c handle - TkError (TkiDx dx, msg) => errorDx stream dx msg - | TkError (TkiStart, msg) => error pos msg - | TkError (TkiEOF, msg) => - let - val (pos, _) = S.EOFpos stream - in - error pos msg - end - in - case tk of - NONE => parse' stream acc - | _ => (valOf tk, stream) - end - - val (tk, stream) = parse' stream acc - in - (tk, pos, stream) - end - - fun finishSeqRead startOff stream = - S.getSubstr startOff (S.getOffset stream) stream - - fun idParser symtab () (stream, startOff) c = - let - fun finalize stream = - let - val s = finishSeqRead startOff stream - val id = ST.getId symtab s - in - ((), SOME $ Id id, stream) - end - in - case c of - NONE => finalize stream - | SOME c => - if isIdBody c then - ((), NONE, stream) - else - finalize (S.ungetc stream) - end - - datatype intMode = ImDec | ImOct | ImInvalidOct | ImHex - datatype floatMode = FmDot | FmExp - - datatype npState = NpInit | IntMode of intMode | FloatMode of floatMode - - fun getLongestSeq acc pred stream = - let - val (c, stream') = S.getchar stream - in - if isSome c andalso pred $ valOf c then - getLongestSeq (valOf c :: acc) pred stream' - else - (implode $ rev acc, stream) - end - - fun skipDigitSeq off stream = - let - val (res, stream) = getLongestSeq [] isDigit stream - in - if res = "" then - raise TkError (TkiDx off, "expected digit") - else - (String.size res, stream) - end - - fun getSuffixCommon buf off stream = - let - val (sfx, stream) = getLongestSeq [] Char.isAlpha stream - val sfx = String.map (fn c => Char.toLower c) sfx - val sfx = - case List.find (fn (_, repr) => repr = sfx) buf of - NONE => raise TkError (TkiDx off, "unknown suffix") - | SOME (sfx, _) => sfx - in - (sfx, stream) - end - - val getIntSuffix = getSuffixCommon intSuffixRepr 0 - fun getFloatSuffix off = getSuffixCommon floatSuffixRepr off - - (* - * It seems that there is an ambiguity in C: - * gcc/clang/cparser consider 0xe-3 as a fp constant and reject it. - * Right now we do not, but it may be reconsidered. - *) - fun numParser _ NpInit (stream, _) (SOME c) = - if c = #"0" then - let - val (c, stream') = S.getchar stream - in - case c of - NONE => - (NpInit, SOME $ Num $ IntConst (ItOct, "", IsNone), stream') - | SOME c => - if Char.toLower c = #"x" then - (IntMode ImHex, NONE, stream') - else - (IntMode ImOct, NONE, stream) - end - else - (IntMode ImDec, NONE, stream) - | numParser _ NpInit _ NONE = raise Unreachable - | numParser _ (IntMode mode) (stream, startOff) c = - let - val (pred, res, offset) = - case mode of - ImDec => (isDigit, ItDec, 0) - | ImOct => (isOctal, ItOct, 1) - | ImInvalidOct => (isDigit, ItOct, 1) - | ImHex => (isHexDigit, ItHex, 2) - - fun checkAndRaise m msg = - if mode = m then raise TkError (TkiStart, msg) else () - - fun finish () = - let - val () = checkAndRaise ImInvalidOct "invalid octal constant" - val stream = S.ungetc stream - val str = finishSeqRead (startOff + offset) stream - val (sfx, stream) = getIntSuffix stream - in - (IntMode mode, SOME $ Num $ IntConst (res, str, sfx), stream) - end - in - case c of - NONE => finish () - | SOME c => - if pred c then - (IntMode mode, NONE, stream) - else if c = #"." then - (FloatMode FmDot, NONE, stream) - else if Char.toLower c = #"e" then - (checkAndRaise ImHex - "floating constant can not come with 0x prefix"; - (FloatMode FmExp, NONE, stream)) - else - finish () - end - | numParser _ (FloatMode FmDot) (stream, startOff) _ = - let - val (len, stream) = skipDigitSeq 0 $ S.ungetc stream - - val (c, stream') = S.getchar stream - - fun finish () = - let - val str = finishSeqRead startOff stream - val (sfx, stream) = getFloatSuffix len stream - in - (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) - end - in - case c of - NONE => finish () - | SOME c => - if Char.toLower c = #"e" then - (FloatMode FmExp, NONE, stream') - else - finish () - end - | numParser _ (FloatMode FmExp) (stream, startOff) c = - let - val (off, stream) = - if c = NONE then - raise TkError (TkiDx 0, "expected digit") - else if valOf c <> #"+" andalso valOf c <> #"-" then - (0, S.ungetc stream) - else - (~1, stream) - - val (len, stream) = skipDigitSeq off stream - val str = finishSeqRead startOff stream - val (sfx, stream) = getFloatSuffix len stream - in - (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) - end + datatype chr = Reg of char | EscSeqed of char | NoChar fun chrIntVal c = if isDigit c then @@ -716,18 +473,17 @@ struct let fun follow stream acc count = if count = 3 then - (SOME $ chr acc, stream) + (EscSeqed $ chr acc, stream) else let val (c, stream) = S.getchar stream in - case c of - NONE => (SOME $ chr acc, stream) - | SOME c => - if isOctal c then - follow stream (acc * 8 + chrIntVal c) (count + 1) - else - (SOME $ chr acc, S.ungetc stream) + if c = #"\000" then + (EscSeqed $ chr acc, stream) + else if isOctal c then + follow stream (acc * 8 + chrIntVal c) (count + 1) + else + (EscSeqed $ chr acc, S.ungetc stream) end in follow stream (chrIntVal c) 1 @@ -739,25 +495,24 @@ struct let val (c, stream) = S.getchar stream - val noHex = TkError (TkiDx 0, "\\x without hex digits") + val noHex = TkError (0, "\\x without hex digits") in - case c of - NONE => - if count = 0 then - raise noHex - else - (SOME $ chr acc, stream) - | SOME c => - if isHexDigit c then - if count = 2 then - raise TkError (TkiDx 2, "hex sequence out of range") - else - follow stream (acc * 16 + chrIntVal c) (count + 1) + if c = #"\000" then + if count = 0 then + raise noHex + else + (EscSeqed $ chr acc, stream) + else + if isHexDigit c then + if count = 2 then + raise TkError (2, "hex sequence out of range") + else + follow stream (acc * 16 + chrIntVal c) (count + 1) + else + if count = 0 then + raise noHex else - if count = 0 then - raise noHex - else - (SOME $ chr acc, S.ungetc stream) + (EscSeqed $ chr acc, S.ungetc stream) end in follow stream 0 0 @@ -765,15 +520,16 @@ struct fun eatEscSeq stream = let - fun raiseErr0 msg = raise TkError (TkiDx 0, msg) + fun raiseErr0 msg = raise TkError (0, msg) val (c, stream) = S.getchar stream val c = - case c of - NONE => raiseErr0 "unfinished escape sequence" - | SOME c => c + if c = #"\000" then + raiseErr0 "unfinished escape sequence" + else + c - fun & c = (SOME c, stream) + fun & c = (EscSeqed c, stream) in case c of #"'" => & #"'" @@ -787,87 +543,135 @@ struct | #"r" => & #"\r" | #"t" => & #"\t" | #"v" => & #"\v" - | #"\n" => (NONE, stream) + | #"\n" => (NoChar, stream) | #"x" => parseHexSeq stream | c => if isOctal c then parseOctalSeq stream c else raiseErr0 "unknown escape sequence" - end + end handle + TkError (dx, msg) => + let + val offset = S.getOffset stream + dx + val (pos, _) = S.getPosRaw offset stream + in + error pos msg + end + + fun getMaybeBackslashed stream = + let + fun getMaybeBackslashed' stream = + let + val (c, stream) = S.getchar stream + in + if c = #"\\" then + eatEscSeq stream + else + (Reg c, stream) + end - datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm + val (c, stream) = getMaybeBackslashed' stream + in + case c of + NoChar => getMaybeBackslashed stream + | _ => (c, stream) + end - datatype seqParseMode = SpmChr | SpmStr + fun parseCharConst symtab stream = + let + val startOff = S.getOffset stream - 1 + val (pos, stream) = S.getPosRaw startOff stream + + val (chr, stream1) = getMaybeBackslashed stream + val v = + case chr of + Reg #"\000" => error (S.getPosDisc stream) "expected character" + | Reg c | EscSeqed c => ord c + | NoChar => raise Unreachable + + val (c, stream2) = getMaybeBackslashed stream1 + val () = + case c of + Reg #"'" => () + | _ => error (S.getPosDisc stream1) "expected '" - fun seqBound SpmChr = #"'" - | seqBound SpmStr = #"\"" + val endOff = S.getOffset stream2 + val repr = S.getSubstr startOff endOff stream2 + val id = ST.getId symtab repr + in + (CharConst (id, v), pos, stream2) + end - fun seqExnConv mode (TkError (v, msg)) = + fun parseStrlit symtab stream = let - val bound = if mode = SpmChr then "'" else "\"" - val msg = - String.translate (fn c => if c = #"%" then bound else str c) msg + val startOff = S.getOffset stream - 1 + val (pos, stream) = S.getPosRaw startOff stream + + fun collect stream = + let + val (c, stream) = getMaybeBackslashed stream + in + case c of + Reg #"\000" => error pos "unfinished string literal" + | Reg #"\"" => (S.getOffset stream, stream) + | _ => collect stream + end + + val (endOff, stream) = collect stream + val s = S.getSubstr startOff endOff stream + val id = ST.getId symtab s in - TkError (v, msg) + (Strlit id, pos, stream) end - | seqExnConv _ _ = raise Unreachable - fun unfinishedSeq SpmChr = "unfinished character constant" - | unfinishedSeq SpmStr = "unfinished string literal" + fun parseId symtab _ stream = + let + val startOff = S.getOffset stream - 1 - fun seqParser mode SeqInit (stream, _) (SOME c) = - if seqBound mode = c then - (SeqStart, NONE, stream) - else - raise Unreachable - | seqParser mode SeqStart (stream, _) (SOME c) = - if c <> seqBound mode then - let - val (c, stream) = - if c <> #"\\" then (SOME c, stream) else eatEscSeq stream - in - if c = NONE then - (SeqStart, NONE, stream) - else - (SeqValue (ord $ Option.valOf c), NONE, stream) - end - else if mode = SpmStr then - (SeqTerm, SOME $ StringConst "", stream) + fun collect stream = + let + val (c, stream') = S.getchar stream + in + if isNondigit c orelse isDigit c then + collect stream' else - raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected value after %") - | seqParser mode (SeqValue v) (stream, startOff) (SOME c) = - if seqBound mode = c then - let - fun term s v = - if mode = SpmChr then - CharConst (s, v) - else - StringConst $ String.extract (s, 1, SOME $ String.size s - 2) - in - (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream) - end - else if mode = SpmStr then - let - val (_, stream) = - if c <> #"\\" then (SOME c, stream) else eatEscSeq stream - in - (SeqValue v, NONE, stream) - end + (S.getOffset stream, stream) + end + + val (endOff, stream) = collect stream + val id = S.getSubstr startOff endOff stream + val id = ST.getId symtab id + in + (Id id, stream) + end + + fun parseNumber dx stream = + let + fun collect stream = + let + val (c, stream') = S.getchar stream + val (c1, stream2) = S.getchar stream' + in + if isDigit c orelse c = #"." then + collect stream' + else if Char.toLower c = #"e" andalso (c1 = #"+" orelse c1 = #"-") + then + collect stream2 + else if isNondigit c then + collect stream' else - raise seqExnConv SpmChr $ TkError (TkiDx 0, "expected % after value") - | seqParser _ SeqTerm _ (SOME _) = - raise Unreachable - | seqParser mode state (_, _) NONE = - raise case state of - SeqInit => Unreachable - | SeqStart => seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) - | SeqValue _ => - seqExnConv mode $ TkError (TkiStart, unfinishedSeq mode) - | SeqTerm => Unreachable + (S.getOffset stream, stream) + end - fun charParser _ = seqParser SpmChr - fun strParser _ = seqParser SpmStr + val startOff = S.getOffset stream - dx + val (pos, stream) = S.getPosRaw startOff stream + val (endOff, stream) = collect stream + + val s = S.getSubstr startOff endOff stream + in + (Num s, pos, stream) + end fun getDir stream = OS.Path.getParent o S.getFname $ stream @@ -922,10 +726,8 @@ struct let fun skip prevIsAsterisk stream = let - val (c, stream) = - case S.getchar stream of - (NONE, _) => error pos "unfinished comment" - | (SOME c, stream) => (c, stream) + val (c, stream) = S.getchar stream + val () = if c = #"\000" then error pos "unfinished comment" else () in if prevIsAsterisk andalso c = #"/" then stream @@ -947,13 +749,13 @@ struct error pos "expected \\n after backslash" end in - case c of - SOME c => - if c = #"\n" then - stream - else - raiseErr () - | NONE => raiseErr () + if c = #"\000" then + raiseErr () + else + if c = #"\n" then + stream + else + raiseErr () end fun processSymbol symtab stream = @@ -985,31 +787,39 @@ struct and getToken symtab stream = let val (c, stream) = S.getchar stream - + val (c1, _) = S.getchar stream fun conv tk (pos, stream) = (tk, pos, stream) - fun @-> parser acc = parserWrapper stream (parser symtab) acc + + fun @-> parser = + let + val (pos, stream) = S.getPosAfterChar stream + val (tk, stream) = parser pos stream + in + (tk, pos, stream) + end in - case c of - NONE => conv EOS $ S.EOFpos stream - | SOME c => - if c = #"\n" then - conv NewLine $ S.getPosAfterChar stream - else if Char.isSpace c then - getToken symtab stream - else if isIdStart c then - @-> idParser () - else if isDigit c then - @-> numParser NpInit - else if c = #"'" then - @-> charParser SeqInit - else if c = #"\"" then - @-> strParser SeqInit - else if isStartForFsm c then - processSymbol symtab stream - else if c = #"\\" then - getToken symtab $ handleBackslash stream - else - unexpectedCharRaise stream c + if c = #"\000" then + conv EOS $ S.EOFpos stream + else if c = #"\n" then + conv NewLine $ S.getPosAfterChar stream + else if Char.isSpace c then + getToken symtab stream + else if isNondigit c then + @-> $ parseId symtab + else if isDigit c then + parseNumber 1 stream + else if c = #"." andalso isDigit c1 then + parseNumber 2 stream + else if c = #"'" then + parseCharConst symtab stream + else if c = #"\"" then + parseStrlit symtab stream + else if isStartForFsm c then + processSymbol symtab stream + else if c = #"\\" then + getToken symtab $ handleBackslash stream + else + unexpectedCharRaise stream c end fun debugPrint fname = diff --git a/tokenizer.sig b/tokenizer.sig index 67666a4..f221ab3 100644 --- a/tokenizer.sig +++ b/tokenizer.sig @@ -2,25 +2,17 @@ signature TOKENIZER = sig structure ST: SYMTAB structure S: STREAM - datatype intType = ItDec | ItOct | ItHex - datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL - datatype floatSfx = FsNone | FsF | FsL - - datatype numConst = - IntConst of intType * string * intSfx | - FloatConst of string * floatSfx - datatype token = Invalid | EOS | NewLine | MacroEnd of int | - Num of numConst | + Num of string | Id of int | - CharConst of string * int | - StringConst of string | + CharConst of int * int | + Strlit of int | kwBreak | kwCase | |