summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVladimir Azarov <avm@intermediate-node.net>2025-04-04 18:24:46 +0200
committerVladimir Azarov <avm@intermediate-node.net>2025-04-04 18:24:46 +0200
commit7b29b31648fd737e7bbc007f480b799add91bc6b (patch)
treee724c15c959d98ece73c186b82a61100f4e8d06a
parentd7d4830443f1e385af862462f976553c8a9033e1 (diff)
Beginning of the preprocessor
-rw-r--r--.gitignore2
-rw-r--r--Makefile2
-rw-r--r--ccross.mlb (renamed from cpp.mlb)5
-rw-r--r--ccross.sml4
-rw-r--r--cpp.sig25
-rw-r--r--cpp.sml104
-rw-r--r--exn_handler.sml3
-rw-r--r--general.sml4
-rw-r--r--hashtable.sml4
-rw-r--r--stream.sig7
-rw-r--r--stream.sml20
-rw-r--r--tokenizer.sig123
-rw-r--r--tokenizer.sml38
13 files changed, 292 insertions, 49 deletions
diff --git a/.gitignore b/.gitignore
index 58c478a..28bb688 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
test*
-cpp
+ccross
todo*
mlmon.out*
*dot
diff --git a/Makefile b/Makefile
index e18503d..92e0285 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
history := -const "Exn.keepHistory true"
def:
- mlton $(history) cpp.mlb
+ mlton $(history) ccross.mlb
diff --git a/cpp.mlb b/ccross.mlb
index 0cc9341..76eeb47 100644
--- a/cpp.mlb
+++ b/ccross.mlb
@@ -15,6 +15,9 @@ in
tokenizer.sig
tokenizer.sml
- exn_handler.sml
+ cpp.sig
cpp.sml
+
+ exn_handler.sml
+ ccross.sml
end
diff --git a/ccross.sml b/ccross.sml
new file mode 100644
index 0000000..58f9c32
--- /dev/null
+++ b/ccross.sml
@@ -0,0 +1,4 @@
+fun main [fname] = Cpp.debugPrint fname
+ | main _ = printLn "Expected a single argument: file name"
+
+val () = main $ CommandLine.arguments ()
diff --git a/cpp.sig b/cpp.sig
new file mode 100644
index 0000000..378a5d4
--- /dev/null
+++ b/cpp.sig
@@ -0,0 +1,25 @@
+signature CPP = sig
+ type t
+ type tkPos
+ type tkExpectedValue
+
+ exception TkExpected of tkExpectedValue
+
+ datatype tkExp =
+ Tk of Tokenizer.token |
+ Id |
+ NumConst |
+ StrLiteral |
+ UnOp |
+ BinOp |
+ Op
+
+ val create: string -> t
+ val getToken: t -> Tokenizer.token * t
+ val getLastPos: t -> tkPos
+
+ val prepAndRaise: t -> tkPos -> tkExp list -> 'a
+ val tkExpectedPrint: tkExpectedValue -> unit
+
+ val debugPrint: string -> unit
+end
diff --git a/cpp.sml b/cpp.sml
index e2ffc2c..91f7f82 100644
--- a/cpp.sml
+++ b/cpp.sml
@@ -1,11 +1,95 @@
-fun main [fname] =
-let
- val stream = Stream.streamInit fname
- val tkl = Tokenizer.tokenize stream
- val fileInfo = Stream.convert stream
-in
- Tokenizer.printTokens tkl (#2 fileInfo)
-end
- | main _ = printLn "Expected a signle argument: file name"
+structure Cpp:> CPP = struct
+ type tkPos = Stream.pos
+
+ type t =
+ { streams: Stream.t list, fileInfo: Stream.fileInfo list,
+ lastPos: tkPos option, firstId: Stream.fileId };
+
+ datatype tkExp =
+ Tk of Tokenizer.token |
+ Id |
+ NumConst |
+ StrLiteral |
+ UnOp |
+ BinOp |
+ Op
+
+ type tkExpectedValue = string * tkExp list
+
+ exception StreamTooOld
+ exception TkExpected of tkExpectedValue
+
+ fun create fname =
+ let
+ val stream = Stream.create fname
+ val info = Stream.convert stream
+ in
+ { streams = [stream] , fileInfo = [info], lastPos = NONE, firstId = #1 info }
+ end
-val () = main $ CommandLine.arguments ()
+ fun getToken
+ ({ streams = stream :: tail, fileInfo, lastPos, firstId }: t) =
+ let
+ val (tk, stream) = Tokenizer.getToken stream
+ in
+ case tk of
+ NONE => getToken { streams = tail, fileInfo, lastPos, firstId }
+ | SOME (pos, tk) =>
+ ( tk, { streams = stream :: tail, fileInfo,
+ lastPos = SOME pos, firstId })
+ end
+ | getToken
+ { streams = [], fileInfo, lastPos = SOME lastPos, firstId } =
+ let
+ val pos = SOME (#1 lastPos, ~1) (* EOF *)
+ in
+ (Tokenizer.EOS, {streams = [], fileInfo, lastPos = pos, firstId })
+ end
+ | getToken { streams = [], fileInfo, lastPos = NONE, firstId } =
+ (Tokenizer.EOS, { streams = [], fileInfo,
+ lastPos = SOME (firstId, ~1), firstId })
+
+ fun getLastPos ({ lastPos = NONE, ... }: t) = raise Unreachable
+ | getLastPos { lastPos = SOME p, ... } = p
+
+ val tkExp2str = fn
+ (Tk tk) => Tokenizer.token2str tk
+ | Id: tkExp => "identifier"
+ | NumConst => "numeric constant"
+ | StrLiteral => "string literal"
+ | UnOp => "unary operator"
+ | BinOp => "binary operator"
+ | Op => "operator"
+
+ fun prepAndRaise (stream: t) (id, pos) expList =
+ let
+ val fileInfo =
+ case List.find (fn (id', _, _) => id' = id) $ #fileInfo stream of
+ NONE => raise StreamTooOld
+ | SOME fileInfo => fileInfo
+ val pos = Stream.ppos2str $ Stream.pos2pposWithFI (id, pos) fileInfo
+ in
+ raise TkExpected (pos, expList)
+ end
+
+ fun tkExpectedPrint (pos, expList) =
+ let
+ fun tkExps2str [e] [] = tkExp2str e
+ | tkExps2str [e] acc =
+ (String.concatWith ", " acc ^ " or ") ^ tkExp2str e
+ | tkExps2str (e :: ec) acc =
+ tkExps2str ec (tkExp2str e :: acc)
+ | tkExps2str [] _ = raise Unreachable
+ in
+ print pos;
+ print ":expected ";
+ printLn $ tkExps2str expList []
+ end
+
+ fun debugPrint fname =
+ let
+ val stream = create fname
+ in
+ ()
+ end
+end
diff --git a/exn_handler.sml b/exn_handler.sml
index 52a2c5c..50fb8dc 100644
--- a/exn_handler.sml
+++ b/exn_handler.sml
@@ -30,13 +30,14 @@ structure GlobalExnHandler: sig val handler: exn -> unit end = struct
fun handler e =
let
- open Tokenizer
+ open Tokenizer Cpp
in
(case e of
FsmTableIsTooSmall =>
eprint "fsm table is too small. Increate 'maxState' value"
| IO.Io _ => ioExn e
| TkErrorAug (pos, msg) => eprint $ Stream.ppos2str pos ^ ": " ^ msg
+ | TkExpected v => tkExpectedPrint v
| _ => otherExn e;
exit 255)
end
diff --git a/general.sml b/general.sml
index 1434d5b..0cf2ebd 100644
--- a/general.sml
+++ b/general.sml
@@ -5,6 +5,10 @@ infixr 0 $
fun printLn s = (print s; print "\n")
+(* All global values which computations may raise an exception must be
+ * wrapped in lazy, so that no exception is thrown before custom
+ * top-level handler is set.
+ *)
fun lazy thunk =
let
datatype 'a value =
diff --git a/hashtable.sml b/hashtable.sml
index 24e5bba..5b89ebd 100644
--- a/hashtable.sml
+++ b/hashtable.sml
@@ -12,11 +12,7 @@ structure Hashtable :> HASHTABLE = struct
val ` = Word32.fromInt
fun hashFunc key =
- let
- open Word32
- in
List.foldl (fn (x, acc) => acc * `31 + `(ord x)) (`0) $ explode key
- end
fun insert (buf, (taken, total)) key value =
if Real.fromInt (taken + 1) / Real.fromInt total > fillLimit then
diff --git a/stream.sig b/stream.sig
index 795d193..2bdb155 100644
--- a/stream.sig
+++ b/stream.sig
@@ -20,7 +20,12 @@ signature STREAM = sig
val getPos: t -> pos
val getPosAfterCharRead: t -> pos
+
+ (* pos must come from t, see pos2pposWithFI *)
val pos2ppos: pos -> t -> ppos
+ (* #id pos must be equal to fileId of fileInfo,
+ * otherwise InvalidFileInfo is thrown *)
+ val pos2pposWithFI: pos -> fileInfo -> ppos
(* Assumed to be called once for given pos, so will throw Unreachable on
* second call *)
@@ -30,5 +35,5 @@ signature STREAM = sig
val isFirstOnLine: pos -> t -> bool
(* throws IO.Io *)
- val streamInit: string -> t
+ val create: string -> t
end
diff --git a/stream.sml b/stream.sml
index 1755817..bc9048b 100644
--- a/stream.sml
+++ b/stream.sml
@@ -6,6 +6,7 @@ structure Stream :> STREAM = struct
type fileInfo = fileId * string * string
exception UngetcError
+ exception InvalidFileInfo
fun ppos2str (pos, line, col) =
let
@@ -56,12 +57,17 @@ structure Stream :> STREAM = struct
fun getPosAfterCharRead (fid, _, off, _) = (fid, off - 1)
- fun pos2ppos (_, pos) (_, fname, _, contents) =
- let
- val (line, col) = calcFilePos contents pos
- in
- (fname, line, SOME col)
- end
+ fun pos2pposWithFI (id, pos) (id', fname, contents) =
+ if id <> id' then
+ raise InvalidFileInfo
+ else
+ let
+ val (line, col) = calcFilePos contents pos
+ in
+ (fname, line, SOME col)
+ end
+
+ fun pos2ppos pos stream = pos2pposWithFI pos (convert stream)
fun pposWithoutCol (fname, line, SOME _) = (fname, line, NONE)
| pposWithoutCol (_, _, NONE) = raise Unreachable
@@ -71,7 +77,7 @@ structure Stream :> STREAM = struct
fun getSubstr startOff endOff (_, _, _, contents) =
String.substring (contents, startOff, endOff - startOff)
- fun streamInit fname =
+ fun create fname =
let
open TextIO
diff --git a/tokenizer.sig b/tokenizer.sig
index ffbec52..245cfbe 100644
--- a/tokenizer.sig
+++ b/tokenizer.sig
@@ -1,13 +1,132 @@
signature TOKENIZER = sig
- type token
+ datatype intType = ItDec | ItOct | ItHex
+ datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL
+ datatype floatSfx = FsNone | FsF | FsL
+
+ datatype numConst =
+ IntConst of intType * string * intSfx |
+ FloatConst of string * floatSfx
+
+ datatype token =
+ Invalid |
+ EOS |
+ NewLine |
+
+ Num of numConst |
+
+ Id of string |
+ CharConst of string * int |
+ StringConst of string |
+
+ kwBreak |
+ kwCase |
+ kwChar |
+ kwConst |
+ kwContinue |
+ kwDefault |
+ kwDouble |
+ kwElse |
+ kwEnum |
+ kwExtern |
+ kwFloat |
+ kwFor |
+ kwGoto |
+ kwInt |
+ kwLong |
+ kwRegister |
+ kwReturn |
+ kwShort |
+ kwSigned |
+ kwSizeof |
+ kwStruct |
+ kwSwitch |
+ kwTypedef |
+ kwUnion |
+ kwUnsigned |
+ kwVoid |
+ kwVolatile |
+
+ LParen |
+ RParen |
+ LBracket |
+ RBracket |
+ LBrace |
+ RBrace |
+
+ QuestionMark |
+ Colon |
+ Coma |
+ Semicolon |
+
+ Arrow |
+ Plus |
+ DoublePlus|
+ Minus |
+ DoubleMinus |
+ Ampersand |
+ Asterisk |
+ Slash |
+ Tilde |
+ ExclMark |
+ Percent |
+ DoubleGreater |
+ DoubleLess |
+ Greater |
+ Less |
+ EqualSign |
+ LessEqualSign |
+ GreaterEqualSign |
+ DoubleEqualSign |
+ ExclMarkEqualSign |
+ Cap |
+ VerticalBar |
+ DoubleAmpersand |
+ DoubleVerticalBar |
+
+ AsteriskEqualSign |
+ SlashEqualSign |
+ PercentEqualSign |
+ PlusEqualSign |
+ MinusEqualSign |
+ DoubleLessEqualSign |
+ DoubleGreaterEqualSign |
+ AmpersandEqualSign |
+ CapEqualSign |
+ VerticalBarEqualSign |
+
+ Hash |
+ DoubleHash |
+
+ Dot |
+ DoubleDot |
+ TripleDot |
+
+ CommentStart |
+
+ CppInclude |
+ CppDefine |
+ CppUndef |
+ CppIf |
+ CppIfdef |
+ CppIfndef |
+ CppElse |
+ CppElif |
+ CppEndif |
+ CppWarning |
+ CppError |
+ CppPragma
+
type fullToken = Stream.pos * token
(* Fatal. both may be thrown by tokenize *)
exception FsmTableIsTooSmall
exception TkErrorAug of Stream.ppos * string
+ val getToken: Stream.t -> fullToken option * Stream.t
+
val tokenize: Stream.t -> fullToken list
+ val token2str: token -> string
val printToken: token -> unit
- val printTokens: fullToken list -> string -> unit
+ val debugPrint: fullToken list -> string -> unit
end
diff --git a/tokenizer.sml b/tokenizer.sml
index 104ae61..10c2e77 100644
--- a/tokenizer.sml
+++ b/tokenizer.sml
@@ -10,6 +10,7 @@ structure Tokenizer:> TOKENIZER = struct
datatype token =
Invalid |
+ EOS |
NewLine |
Num of numConst |
@@ -265,8 +266,8 @@ structure Tokenizer:> TOKENIZER = struct
fun getSfxReprSimple sfx buf =
getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr)
- val printToken = fn
- Id s => print $ "id:" ^ s
+ val token2str = fn
+ Id s => "id:" ^ s
| Num (IntConst (it, str, sfx)) =>
let
val intType =
@@ -275,22 +276,20 @@ structure Tokenizer:> TOKENIZER = struct
| ItOct => "0"
| ItHex => "0x"
in
- print intType;
- print str;
- print $ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`"
+ intType ^ str ^ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`"
end
- | Num (FloatConst (str, sfx)) => (
- print str;
- print $ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`"
- )
- | CharConst (repr, _) => print repr
+ | Num (FloatConst (str, sfx)) =>
+ str ^ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`"
+ | CharConst (repr, _) => repr
| StringConst s =>
- print $ "\"" ^ s ^ "\""
+ "\"" ^ s ^ "\""
| v =>
case List.find (fn (x, _) => x = v) tokenRepr of
- SOME (_, repr) => print repr
+ SOME (_, repr) => repr
| NONE => raise TokenWithoutRepr
+ fun printToken tk = print $ token2str tk
+
fun isIdStart c = Char.isAlpha c orelse c = #"_"
fun isIdBody c = Char.isAlphaNum c orelse c = #"_"
@@ -766,14 +765,12 @@ structure Tokenizer:> TOKENIZER = struct
| #"\n" => (NONE, stream)
| #"x" => parseHexSeq stream
| c =>
- if isOctal c then
- parseOctalSeq stream c
- else
- raiseErr0 "unknown escape sequence"
+ if isOctal c then
+ parseOctalSeq stream c
+ else
+ raiseErr0 "unknown escape sequence"
end
- fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2)
-
datatype SeqParseState = SeqInit | SeqStart | SeqValue of int | SeqTerm
datatype seqParseMode = SpmChr | SpmStr
@@ -821,7 +818,7 @@ structure Tokenizer:> TOKENIZER = struct
if mode = SpmChr then
CharConst (s, v)
else
- StringConst $ stringCut s
+ StringConst $ String.extract (s, 1, SOME $ String.size s - 2)
in
(SeqTerm, SOME $ term (finishSeqRead startOff stream) v, stream)
end
@@ -998,7 +995,7 @@ structure Tokenizer:> TOKENIZER = struct
aux [] stream
end
- fun printTokens tkl fname =
+ fun debugPrint tkl fname =
let
fun print' line _ ((_, NewLine) :: tks) =
print' (line + 1) true tks
@@ -1018,5 +1015,4 @@ structure Tokenizer:> TOKENIZER = struct
print' 1 true tkl;
print "\n"
end
-
end