diff options
author | Vladimir Azarov <avm@intermediate-node.net> | 2025-04-04 18:24:46 +0200 |
---|---|---|
committer | Vladimir Azarov <avm@intermediate-node.net> | 2025-04-04 18:24:46 +0200 |
commit | 7b29b31648fd737e7bbc007f480b799add91bc6b (patch) | |
tree | e724c15c959d98ece73c186b82a61100f4e8d06a | |
parent | d7d4830443f1e385af862462f976553c8a9033e1 (diff) |
Beginning of the preprocessor
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | ccross.mlb (renamed from cpp.mlb) | 5 | ||||
-rw-r--r-- | ccross.sml | 4 | ||||
-rw-r--r-- | cpp.sig | 25 | ||||
-rw-r--r-- | cpp.sml | 104 | ||||
-rw-r--r-- | exn_handler.sml | 3 | ||||
-rw-r--r-- | general.sml | 4 | ||||
-rw-r--r-- | hashtable.sml | 4 | ||||
-rw-r--r-- | stream.sig | 7 | ||||
-rw-r--r-- | stream.sml | 20 | ||||
-rw-r--r-- | tokenizer.sig | 123 | ||||
-rw-r--r-- | tokenizer.sml | 38 |
13 files changed, 292 insertions, 49 deletions
@@ -1,5 +1,5 @@ test* -cpp +ccross todo* mlmon.out* *dot @@ -1,4 +1,4 @@ history := -const "Exn.keepHistory true" def: - mlton $(history) cpp.mlb + mlton $(history) ccross.mlb @@ -15,6 +15,9 @@ in tokenizer.sig tokenizer.sml - exn_handler.sml + cpp.sig cpp.sml + + exn_handler.sml + ccross.sml end diff --git a/ccross.sml b/ccross.sml new file mode 100644 index 0000000..58f9c32 --- /dev/null +++ b/ccross.sml @@ -0,0 +1,4 @@ +fun main [fname] = Cpp.debugPrint fname + | main _ = printLn "Expected a single argument: file name" + +val () = main $ CommandLine.arguments () @@ -0,0 +1,25 @@ +signature CPP = sig + type t + type tkPos + type tkExpectedValue + + exception TkExpected of tkExpectedValue + + datatype tkExp = + Tk of Tokenizer.token | + Id | + NumConst | + StrLiteral | + UnOp | + BinOp | + Op + + val create: string -> t + val getToken: t -> Tokenizer.token * t + val getLastPos: t -> tkPos + + val prepAndRaise: t -> tkPos -> tkExp list -> 'a + val tkExpectedPrint: tkExpectedValue -> unit + + val debugPrint: string -> unit +end @@ -1,11 +1,95 @@ -fun main [fname] = -let - val stream = Stream.streamInit fname - val tkl = Tokenizer.tokenize stream - val fileInfo = Stream.convert stream -in - Tokenizer.printTokens tkl (#2 fileInfo) -end - | main _ = printLn "Expected a signle argument: file name" +structure Cpp:> CPP = struct + type tkPos = Stream.pos + + type t = + { streams: Stream.t list, fileInfo: Stream.fileInfo list, + lastPos: tkPos option, firstId: Stream.fileId }; + + datatype tkExp = + Tk of Tokenizer.token | + Id | + NumConst | + StrLiteral | + UnOp | + BinOp | + Op + + type tkExpectedValue = string * tkExp list + + exception StreamTooOld + exception TkExpected of tkExpectedValue + + fun create fname = + let + val stream = Stream.create fname + val info = Stream.convert stream + in + { streams = [stream] , fileInfo = [info], lastPos = NONE, firstId = #1 info } + end -val () = main $ CommandLine.arguments () + fun getToken + ({ streams = stream :: tail, fileInfo, lastPos, firstId }: t) = + let + val (tk, stream) = Tokenizer.getToken stream + in + case tk of + NONE => getToken { streams = tail, fileInfo, lastPos, firstId } + | SOME (pos, tk) => + ( tk, { streams = stream :: tail, fileInfo, + lastPos = SOME pos, firstId }) + end + | getToken + { streams = [], fileInfo, lastPos = SOME lastPos, firstId } = + let + val pos = SOME (#1 lastPos, ~1) (* EOF *) + in + (Tokenizer.EOS, {streams = [], fileInfo, lastPos = pos, firstId }) + end + | getToken { streams = [], fileInfo, lastPos = NONE, firstId } = + (Tokenizer.EOS, { streams = [], fileInfo, + lastPos = SOME (firstId, ~1), firstId }) + + fun getLastPos ({ lastPos = NONE, ... }: t) = raise Unreachable + | getLastPos { lastPos = SOME p, ... } = p + + val tkExp2str = fn + (Tk tk) => Tokenizer.token2str tk + | Id: tkExp => "identifier" + | NumConst => "numeric constant" + | StrLiteral => "string literal" + | UnOp => "unary operator" + | BinOp => "binary operator" + | Op => "operator" + + fun prepAndRaise (stream: t) (id, pos) expList = + let + val fileInfo = + case List.find (fn (id', _, _) => id' = id) $ #fileInfo stream of + NONE => raise StreamTooOld + | SOME fileInfo => fileInfo + val pos = Stream.ppos2str $ Stream.pos2pposWithFI (id, pos) fileInfo + in + raise TkExpected (pos, expList) + end + + fun tkExpectedPrint (pos, expList) = + let + fun tkExps2str [e] [] = tkExp2str e + | tkExps2str [e] acc = + (String.concatWith ", " acc ^ " or ") ^ tkExp2str e + | tkExps2str (e :: ec) acc = + tkExps2str ec (tkExp2str e :: acc) + | tkExps2str [] _ = raise Unreachable + in + print pos; + print ":expected "; + printLn $ tkExps2str expList [] + end + + fun debugPrint fname = + let + val stream = create fname + in + () + end +end diff --git a/exn_handler.sml b/exn_handler.sml index 52a2c5c..50fb8dc 100644 --- a/exn_handler.sml +++ b/exn_handler.sml @@ -30,13 +30,14 @@ structure GlobalExnHandler: sig val handler: exn -> unit end = struct fun handler e = let - open Tokenizer + open Tokenizer Cpp in (case e of FsmTableIsTooSmall => eprint "fsm table is too small. Increate 'maxState' value" | IO.Io _ => ioExn e | TkErrorAug (pos, msg) => eprint $ Stream.ppos2str pos ^ ": " ^ msg + | TkExpected v => tkExpectedPrint v | _ => otherExn e; exit 255) end diff --git a/general.sml b/general.sml index 1434d5b..0cf2ebd 100644 --- a/general.sml +++ b/general.sml @@ -5,6 +5,10 @@ infixr 0 $ fun printLn s = (print s; print "\n") +(* All global values which computations may raise an exception must be + * wrapped in lazy, so that no exception is thrown before custom + * top-level handler is set. + *) fun lazy thunk = let datatype 'a value = diff --git a/hashtable.sml b/hashtable.sml index 24e5bba..5b89ebd 100644 --- a/hashtable.sml +++ b/hashtable.sml @@ -12,11 +12,7 @@ structure Hashtable :> HASHTABLE = struct val ` = Word32.fromInt fun hashFunc key = - let - open Word32 - in List.foldl (fn (x, acc) => acc * `31 + `(ord x)) (`0) $ explode key - end fun insert (buf, (taken, total)) key value = if Real.fromInt (taken + 1) / Real.fromInt total > fillLimit then @@ -20,7 +20,12 @@ signature STREAM = sig val getPos: t -> pos val getPosAfterCharRead: t -> pos + + (* pos must come from t, see pos2pposWithFI *) val pos2ppos: pos -> t -> ppos + (* #id pos must be equal to fileId of fileInfo, + * otherwise InvalidFileInfo is thrown *) + val pos2pposWithFI: pos -> fileInfo -> ppos (* Assumed to be called once for given pos, so will throw Unreachable on * second call *) @@ -30,5 +35,5 @@ signature STREAM = sig val isFirstOnLine: pos -> t -> bool (* throws IO.Io *) - val streamInit: string -> t + val create: string -> t end @@ -6,6 +6,7 @@ structure Stream :> STREAM = struct type fileInfo = fileId * string * string exception UngetcError + exception InvalidFileInfo fun ppos2str (pos, line, col) = let @@ -56,12 +57,17 @@ structure Stream :> STREAM = struct fun getPosAfterCharRead (fid, _, off, _) = (fid, off - 1) - fun pos2ppos (_, pos) (_, fname, _, contents) = - let - val (line, col) = calcFilePos contents pos - in - (fname, line, SOME col) - end + fun pos2pposWithFI (id, pos) (id', fname, contents) = + if id <> id' then + raise InvalidFileInfo + else + let + val (line, col) = calcFilePos contents pos + in + (fname, line, SOME col) + end + + fun pos2ppos pos stream = pos2pposWithFI pos (convert stream) fun pposWithoutCol (fname, line, SOME _) = (fname, line, NONE) | pposWithoutCol (_, _, NONE) = raise Unreachable @@ -71,7 +77,7 @@ structure Stream :> STREAM = struct fun getSubstr startOff endOff (_, _, _, contents) = String.substring (contents, startOff, endOff - startOff) - fun streamInit fname = + fun create fname = let open TextIO diff --git a/tokenizer.sig b/tokenizer.sig index ffbec52..245cfbe 100644 --- a/tokenizer.sig +++ b/tokenizer.sig @@ -1,13 +1,132 @@ signature TOKENIZER = sig - type token + datatype intType = ItDec | ItOct | ItHex + datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL + datatype floatSfx = FsNone | FsF | FsL + + datatype numConst = + IntConst of intType * string * intSfx | + FloatConst of string * floatSfx + + datatype token = + Invalid | + EOS | + NewLine | + + Num of numConst | + + Id of string | + CharConst of string * int | + StringConst of string | + + kwBreak | + kwCase | + kwChar | + kwConst | + kwContinue | + kwDefault | + kwDouble | + kwElse | + kwEnum | + kwExtern | + kwFloat | + kwFor | + kwGoto | + kwInt | + kwLong | + kwRegister | + kwReturn | + kwShort | + kwSigned | + kwSizeof | + kwStruct | + kwSwitch | + kwTypedef | + kwUnion | + kwUnsigned | + kwVoid | + kwVolatile | + + LParen | + RParen | + LBracket | + RBracket | + LBrace | + RBrace | + + QuestionMark | + Colon | + Coma | + Semicolon | + + Arrow | + Plus | + DoublePlus| + Minus | + DoubleMinus | + Ampersand | + Asterisk | + Slash | + Tilde | + ExclMark | + Percent | + DoubleGreater | + DoubleLess | + Greater | + Less | + EqualSign | + LessEqualSign | + GreaterEqualSign | + DoubleEqualSign | + ExclMarkEqualSign | + Cap | + VerticalBar | + DoubleAmpersand | + DoubleVerticalBar | + + AsteriskEqualSign | + SlashEqualSign | + PercentEqualSign | + PlusEqualSign | + MinusEqualSign | + DoubleLessEqualSign | + DoubleGreaterEqualSign | + AmpersandEqualSign | + CapEqualSign | + VerticalBarEqualSign | + + Hash | + DoubleHash | + + Dot | + DoubleDot | + TripleDot | + + CommentStart | + + CppInclude | + CppDefine | + CppUndef | + CppIf | + CppIfdef | + CppIfndef | + CppElse | + CppElif | + CppEndif | + CppWarning | + CppError | + CppPragma + type fullToken = Stream.pos * token (* Fatal. both may be thrown by tokenize *) exception FsmTableIsTooSmall exception TkErrorAug of Stream.ppos * string + val getToken: Stream.t -> fullToken option * Stream.t + val tokenize: Stream.t -> fullToken list + val token2str: token -> string val printToken: token -> unit - val printTokens: fullToken list -> string -> unit + val debugPrint: fullToken list -> string -> unit end diff --git a/tokenizer.sml b/tokenizer.sml index 104ae61..10c2e77 100644 --- a/tokenizer.sml +++ b/tokenizer.sml @@ -10,6 +10,7 @@ structure Tokenizer:> TOKENIZER = struct datatype token = Invalid | + EOS | NewLine | Num of numConst | @@ -265,8 +266,8 @@ structure Tokenizer:> TOKENIZER = struct fun getSfxReprSimple sfx buf = getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr) - val printToken = fn - Id s => print $ "id:" ^ s + val token2str = fn + Id s => "id:" ^ s | Num (IntConst (it, str, sfx)) => let val intType = @@ -275,22 +276,20 @@ structure Tokenizer:> TOKENIZER = struct | ItOct => "0" | ItHex => "0x" in - print intType; - print str; - print $ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`" + intType ^ str ^ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`" end - | Num (FloatConst (str, sfx)) => ( - print str; - print $ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`" - ) - | CharConst (repr, _) => print repr + | Num (FloatConst (str, sfx)) => + str ^ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`" + | CharConst (repr, _) => repr | StringConst s => - print $ "\"" ^ s ^ "\"" + "\"" ^ s ^ "\"" | v => case List.find (fn (x, _) => x = v) tokenRepr of - SOME (_, repr) => print repr + SOME (_, repr) => repr | NONE => raise TokenWithoutRepr + fun printToken tk = print $ token2str tk + fun isIdStart c = Char.isAlpha c orelse c = #"_" fun isIdBody c = Char.isAlphaNum c orelse c = #"_" @@ -766,14 +765,12 @@ structure Tokenizer:> TOKENIZER = struct | #"\n" => (NONE, stream) | #"x" => parseHexSeq stream | c => - if isOctal c then - parseOctalSeq stream c - else - raiseErr0 "unknown escape sequence" + if isOctal c then + parseOctalSeq stream c + else + raiseErr0 "unknown escape sequence" end - fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2) - datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm datatype seqParseMode = SpmChr | SpmStr @@ -821,7 +818,7 @@ structure Tokenizer:> TOKENIZER = struct if mode = SpmChr then CharConst (s, v) else - StringConst $ stringCut s + StringConst $ String.extract (s, 1, SOME $ String.size s - 2) in (SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream) end @@ -998,7 +995,7 @@ structure Tokenizer:> TOKENIZER = struct aux [] stream end - fun printTokens tkl fname = + fun debugPrint tkl fname = let fun print' line _ ((_, NewLine) :: tks) = print' (line + 1) true tks @@ -1018,5 +1015,4 @@ structure Tokenizer:> TOKENIZER = struct print' 1 true tkl; print "\n" end - end |