summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVladimir Azarov <avm@intermediate-node.net>2025-03-25 00:24:52 +0100
committerVladimir Azarov <avm@intermediate-node.net>2025-03-25 00:24:52 +0100
commita0be166ef23e1b34b09060b99c03cdadb6ac2132 (patch)
tree3a54e586b504c6ad4fcb5b620c0ddcead5219ae6
parent226f58656b4b7f92f6de9a817ab9106937e061e9 (diff)
Escape sequences
-rw-r--r--cpp.sml5
-rw-r--r--stream.sig5
-rw-r--r--stream.sml6
-rw-r--r--tokenizer.sig2
-rw-r--r--tokenizer.sml234
5 files changed, 175 insertions, 77 deletions
diff --git a/cpp.sml b/cpp.sml
index 32fcd3c..b2762b5 100644
--- a/cpp.sml
+++ b/cpp.sml
@@ -1,10 +1,11 @@
fun main [fname] =
let
val stream = Stream.streamInit fname
- val (tkl, fileList) = Tokenizer.tokenize stream []
+ val tkl = Tokenizer.tokenize stream
+ val fileInfo = Stream.convert stream
in
List.app
- (fn (p, x) => (Stream.printPos fileList p; Tokenizer.printToken x)) tkl
+ (fn (p, x) => (Stream.printPos fileInfo p; Tokenizer.printToken x)) tkl
end
| main _ = printLn "Expected a signle argument: file name"
diff --git a/stream.sig b/stream.sig
index febd4b7..c09b7c1 100644
--- a/stream.sig
+++ b/stream.sig
@@ -7,7 +7,7 @@ signature STREAM = sig
type t
type fileInfo
- val recycle: t -> fileInfo
+ val convert: t -> fileInfo
val ppos2str: ppos -> string
val printPos: fileInfo -> pos -> unit
@@ -18,10 +18,9 @@ signature STREAM = sig
* Can be always avoided, so is not provided in sig file *)
val ungetc: t -> t
- val getOffset: t -> fileOffset
val getPos: t -> pos
val getPosAfterCharRead: t -> pos
- val getPposFromPos: pos -> t -> ppos
+ val pos2ppos: pos -> t -> ppos
(* Assumed to be called once for given pos, so will throw Unreachable on
* second call *)
diff --git a/stream.sml b/stream.sml
index 92deb61..eb2cea1 100644
--- a/stream.sml
+++ b/stream.sml
@@ -18,7 +18,7 @@ structure Stream :> STREAM = struct
type t = fileId * string * fileOffset * string
- fun recycle (fid, fname, _, contents) = (fid, fname, contents)
+ fun convert (fid, fname, _, contents) = (fid, fname, contents)
fun calcFilePos s offset =
let
@@ -64,11 +64,9 @@ structure Stream :> STREAM = struct
else
(fid, fname, off - 1, contents)
- fun getOffset (_, _, off, _) = off
-
fun getPosAfterCharRead (fid, _, off, _) = (fid, off - 1)
- fun getPposFromPos (_, pos) (_, fname, _, contents) =
+ fun pos2ppos (_, pos) (_, fname, _, contents) =
let
val (line, col) = calcFilePos contents pos
in
diff --git a/tokenizer.sig b/tokenizer.sig
index 5956273..dd0eab4 100644
--- a/tokenizer.sig
+++ b/tokenizer.sig
@@ -6,6 +6,6 @@ signature TOKENIZER = sig
exception FsmTableIsTooSmall
exception TkErrorAug of Stream.ppos * string
- val tokenize: Stream.t -> fullToken list -> fullToken list * Stream.fileInfo
+ val tokenize: Stream.t -> fullToken list
val printToken: token -> unit
end
diff --git a/tokenizer.sml b/tokenizer.sml
index b40e03f..8b1ace4 100644
--- a/tokenizer.sml
+++ b/tokenizer.sml
@@ -100,6 +100,8 @@ structure Tokenizer:> TOKENIZER = struct
CppElse |
CppElif |
CppEndif |
+ CppWarning |
+ CppError |
CppPragma
val kwPrefix = #"@"
@@ -219,6 +221,8 @@ structure Tokenizer:> TOKENIZER = struct
(CppElse, %"else"),
(CppElif, %"elif"),
(CppEndif, %"endif"),
+ (CppWarning, %"warning"),
+ (CppError, %"error"),
(CppPragma, %"pragma")
]
end
@@ -364,7 +368,6 @@ structure Tokenizer:> TOKENIZER = struct
fun fsmEat stream =
let
open Array
- val stream = Stream.ungetc stream
val pos = Stream.getPos stream
fun get curState stream =
@@ -391,7 +394,7 @@ structure Tokenizer:> TOKENIZER = struct
fun tkError2aug stream (dx, msg) =
let
val (id, pos) = Stream.getPosAfterCharRead stream
- val pos = Stream.getPposFromPos (id, pos + dx) stream
+ val pos = Stream.pos2ppos (id, pos + dx) stream
in
TkErrorAug (pos, msg)
end
@@ -408,14 +411,14 @@ structure Tokenizer:> TOKENIZER = struct
TkError (TkiDx dx, msg) => raise tkError2aug stream (dx, msg)
| TkError (TkiStart, msg) =>
let
- val startPos = Stream.getPposFromPos P stream
+ val startPos = Stream.pos2ppos P stream
in
raise TkErrorAug (startPos, msg)
end
| TkError (TkiEOF, msg) =>
let
open Stream
- val pos = pposWithoutCol $ getPposFromPos P stream
+ val pos = pposWithoutCol $ pos2ppos P stream
in
raise TkErrorAug (pos, msg)
end
@@ -467,22 +470,6 @@ structure Tokenizer:> TOKENIZER = struct
finalize (Stream.ungetc stream)
end
- fun formCppDir (Id s) =
- let
- open String
- in
- case List.find
- (fn (_, repr) =>
- sub (repr, 0) = cppPrefix andalso
- extract (repr, 1, NONE) = s)
- tokenRepr
- of
- SOME (tk, _) => tk
- | NONE => raise ExpectedCppDir
- end
- | formCppDir kwElse = CppElse
- | formCppDir _ = raise ExpectedCppDir
-
fun numParser () (stream, startOff) c =
let
fun finalize stream =
@@ -497,20 +484,101 @@ structure Tokenizer:> TOKENIZER = struct
finalize (Stream.ungetc stream)
end
+ fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8"
+
+ fun chrIntVal c =
+ if Char.isDigit c then
+ ord c - ord #"0"
+ else if ord c >= ord #"a" andalso ord c <= ord #"z" then
+ ord c - ord #"a" + 10
+ else if ord c >= ord #"A" andalso ord c <= ord #"Z" then
+ ord c - ord #"A" + 10
+ else
+ raise Unreachable
+
+ fun parseOctalSeq stream c =
+ let
+
+ fun follow stream acc count =
+ if count = 3 then
+ (SOME $ chr acc, stream)
+ else
+ let
+ val (c, stream) = Stream.getchar stream
+ in
+ case c of
+ NONE => (SOME $ chr acc, stream)
+ | SOME c =>
+ if isOctal c then
+ follow stream (acc * 8 + chrIntVal c) (count + 1)
+ else
+ (SOME $ chr acc, Stream.ungetc stream)
+ end
+ in
+ follow stream (chrIntVal c) 1
+ end
+
+ fun parseHexSeq stream =
+ let
+ fun follow stream acc count =
+ let
+ val (c, stream) = Stream.getchar stream
+
+ val noHex = TkError (TkiDx 0, "\\x without hex digits")
+ in
+ case c of
+ NONE =>
+ if count = 0 then
+ raise noHex
+ else
+ (SOME $ chr acc, stream)
+ | SOME c =>
+ if Char.isHexDigit c then
+ if count = 2 then
+ raise TkError (TkiDx 2, "hex sequence out of range")
+ else
+ follow stream (acc * 16 + chrIntVal c) (count + 1)
+ else
+ if count = 0 then
+ raise noHex
+ else
+ (SOME $ chr acc, Stream.ungetc stream)
+ end
+ in
+ follow stream 0 0
+ end
+
fun eatEscSeq stream =
let
+ fun raiseErr0 msg = raise TkError (TkiDx 0, msg)
+
val (c, stream) = Stream.getchar stream
val c =
case c of
- NONE => raise TkError (TkiDx 0, "unfinished escape sequence")
+ NONE => raiseErr0 "unfinished escape sequence"
| SOME c => c
+
+ fun & c = (SOME c, stream)
in
- (case c of
- #"\\" => #"\\"
- | #"t" => #"\t"
- | #"n" => #"\n"
- | c => c,
- stream)
+ case c of
+ #"'" => & #"'"
+ | #"\"" => & #"\""
+ | #"?" => & #"?"
+ | #"\\" => & #"\\"
+ | #"a" => & #"\a"
+ | #"b" => & #"\b"
+ | #"f" => & #"\f"
+ | #"n" => & #"\n"
+ | #"r" => & #"\r"
+ | #"t" => & #"\t"
+ | #"v" => & #"\v"
+ | #"\n" => (NONE, stream)
+ | #"x" => parseHexSeq stream
+ | c =>
+ if isOctal c then
+ parseOctalSeq stream c
+ else
+ raiseErr0 "unknown escape sequence"
end
fun stringCut s = String.extract (s, 1, SOME $ String.size s - 2)
@@ -544,9 +612,12 @@ structure Tokenizer:> TOKENIZER = struct
if c <> seqBound mode then
let
val (c, stream) =
- if c <> #"\\" then (c, stream) else eatEscSeq stream
+ if c <> #"\\" then (SOME c, stream) else eatEscSeq stream
in
- (SeqValue (ord c), NONE, stream)
+ if c = NONE then
+ (SeqStart, NONE, stream)
+ else
+ (SeqValue (ord $ Option.valOf c), NONE, stream)
end
else if mode = SpmStr then
(SeqTerm, SOME $ StringConst "", stream)
@@ -566,7 +637,7 @@ structure Tokenizer:> TOKENIZER = struct
else if mode = SpmStr then
let
val (_, stream) =
- if c <> #"\\" then (c, stream) else eatEscSeq stream
+ if c <> #"\\" then (SOME c, stream) else eatEscSeq stream
in
(SeqValue v, NONE, stream)
end
@@ -585,30 +656,36 @@ structure Tokenizer:> TOKENIZER = struct
val charParser = seqParser SpmChr
val strParser = seqParser SpmStr
- fun postprocessCppDir tk tkl stream =
+ fun formCppDir (Id s) =
let
- val isCppDir =
- (fn Hash => true | _ => false) (#2 $ hd tkl)
- andalso Stream.isFirstOnLine (#1 $ hd tkl) stream
- handle Empty => false
- val (pos, tk') = tk
-
- fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl)
+ open String
in
- if isCppDir then
- (conv $ formCppDir tk', stream) handle
- ExpectedCppDir =>
- raise TkErrorAug (Stream.getPposFromPos pos stream,
- "expected preprocessor directive")
+ case List.find
+ (fn (_, repr) =>
+ sub (repr, 0) = cppPrefix andalso
+ extract (repr, 1, NONE) = s)
+ tokenRepr
+ of
+ SOME (tk, _) => tk
+ | NONE => raise ExpectedCppDir
+ end
+ | formCppDir kwElse = CppElse
+ | formCppDir _ = raise ExpectedCppDir
- else
- (tk :: tkl, stream)
+ fun handleCppDir tk prevPos stream =
+ let
+ val (pos, tk') = tk
+ in
+ (prevPos, formCppDir tk') handle
+ ExpectedCppDir =>
+ raise TkErrorAug (Stream.pos2ppos pos stream,
+ "expected preprocessor directive")
end
fun unexpectedCharRaise stream c =
let
- val (id, pos) = Stream.getPosAfterCharRead stream
- val pos = Stream.getPposFromPos (id, pos) stream
+ open Stream
+ val pos = pos2ppos (getPosAfterCharRead stream) stream
val repr =
if isPrintable c then
str c
@@ -626,7 +703,7 @@ structure Tokenizer:> TOKENIZER = struct
case Stream.getchar stream of
(NONE, _) =>
let
- val pos = Stream.getPposFromPos pos stream
+ val pos = Stream.pos2ppos pos stream
in
raise TkErrorAug (pos, "unfinished comment")
end
@@ -648,7 +725,7 @@ structure Tokenizer:> TOKENIZER = struct
val raiseErr = fn () =>
let
val pos = Stream.getPosAfterCharRead stream
- val pos = Stream.getPposFromPos pos stream
+ val pos = Stream.pos2ppos pos stream
in
raise TkErrorAug (pos, "expected \\n after backslash")
end
@@ -662,35 +739,44 @@ structure Tokenizer:> TOKENIZER = struct
| NONE => raiseErr ()
end
- fun processSymbol stream tkl =
+ fun processSymbol stream =
let
- val (T as (p as (fid, off), tk), stream) = fsmEat stream
+ val (T as (p, tk), stream) = fsmEat $ Stream.ungetc stream
in
case tk of
- CommentStart => tokenize (skipComment stream p) tkl
- | DoubleDot => tokenize stream (((fid, off + 1), Dot) :: (p, Dot) :: tkl)
- | _ => tokenize stream (T :: tkl)
+ CommentStart => getToken $ skipComment stream p
+ | DoubleDot => (SOME (p, Dot), Stream.ungetc stream)
+ | Hash =>
+ if Stream.isFirstOnLine p stream then
+ let
+ val (tk, stream) = getToken stream
+ in
+ case tk of
+ NONE =>
+ raise TkErrorAug (Stream.pos2ppos p stream,
+ "unfinished preprecessor directive")
+ | SOME tk =>
+ (SOME $ handleCppDir tk p stream, stream)
+ end
+ else
+ (SOME T, stream)
+ | _ => (SOME T, stream)
end
- and tokenize stream tkl =
+ and getToken stream =
let
val (c, stream) = Stream.getchar stream
- fun cont (tk, stream) = tokenize stream (tk :: tkl)
- fun @-> parser acc = cont $ parseGeneric stream parser acc
+ fun @-> parser acc =
+ (fn (tk, s) => (SOME tk, s)) $ parseGeneric stream parser acc
in
case c of
- NONE => (rev tkl, Stream.recycle stream)
+ NONE => (NONE, stream)
| SOME c =>
if Char.isSpace c then
- tokenize stream tkl
+ getToken stream
else if isIdStart c then
- let
- val (tk, stream) = parseGeneric stream idParser ()
- val (tkl, stream) = postprocessCppDir tk tkl stream
- in
- tokenize stream tkl
- end
+ @-> idParser ()
else if Char.isDigit c then
@-> numParser ()
else if c = #"'" then
@@ -698,11 +784,25 @@ structure Tokenizer:> TOKENIZER = struct
else if c = #"\"" then
@-> strParser SeqInit
else if isStartForFsm c then
- processSymbol stream tkl
+ processSymbol stream
else if c = #"\\" then
- tokenize (handleBackslash stream) tkl
+ getToken $ handleBackslash stream
else
unexpectedCharRaise stream c
end
+ fun tokenize stream =
+ let
+ fun aux acc stream =
+ let
+ val (tk, stream) = getToken stream
+ in
+ case tk of
+ NONE => rev acc
+ | SOME tk => aux (tk :: acc) stream
+ end
+ in
+ aux [] stream
+ end
+
end