summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVladimir Azarov <avm@intermediate-node.net>2025-03-22 20:08:04 +0100
committerVladimir Azarov <avm@intermediate-node.net>2025-03-22 20:08:04 +0100
commit8b4d40a1bb5e62788c9e2d8da774596ce8f4dd57 (patch)
tree4a8e705b6fd09a75910def2d4f2f72fdbafe4d39
parent40f6375daa736ca549a1d0cbd6df87dd83df4781 (diff)
Character literal parser
-rw-r--r--.gitignore1
-rw-r--r--cpp.sml108
2 files changed, 97 insertions, 12 deletions
diff --git a/.gitignore b/.gitignore
index d088a4b..57e20f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
test
cpp
+todo*
diff --git a/cpp.sml b/cpp.sml
index 99f8b36..168df7f 100644
--- a/cpp.sml
+++ b/cpp.sml
@@ -7,6 +7,10 @@ datatype token =
Invalid |
Number of string |
Id of string |
+ CharConst of string * int |
+
+ OParen |
+ CParen |
Plus |
DoublePlus|
@@ -16,6 +20,11 @@ datatype token =
Semicolon |
EqualSign |
DoubleEqualSign |
+ ExclMark |
+ ExclMarkEqualSign |
+
+ QuestionMark |
+ Colon |
Hash |
DoubleHash |
@@ -24,6 +33,9 @@ datatype token =
CppDefine
val tokenRepr = [
+ (OParen, "("),
+ (CParen, ")"),
+
(Plus, "+"),
(DoublePlus, "++"),
(Minus, "-"),
@@ -32,6 +44,11 @@ val tokenRepr = [
(Semicolon, ";"),
(EqualSign, "="),
(DoubleEqualSign, "=="),
+ (ExclMark, "!"),
+ (ExclMarkEqualSign, "!="),
+
+ (QuestionMark, "?"),
+ (Colon, ":"),
(Hash, "#"),
(DoubleHash, "##"),
@@ -45,9 +62,11 @@ fun isCppDir CppInclude = true
exception TokenWithoutRepr
-fun printToken (Number s) = printLn $ "Num: " ^ s
- | printToken (Id s) = printLn $ "Id: " ^ s
- | printToken v =
+val printToken = fn
+ (Number s) => printLn $ "Num: " ^ s
+ | (Id s) => printLn $ "Id: " ^ s
+ | (CharConst (repr, _)) => printLn $ "chr: " ^ repr
+ | v =>
case List.find (fn (x, _) => x = v) tokenRepr of
SOME (_, repr) => printLn repr
| NONE => raise TokenWithoutRepr
@@ -158,7 +177,10 @@ end
(* FSM for parsing symbols *)
-val maxStates = 40
+val maxStates = 21
+
+exception FsmTableIsTooSmall
+exception FsmTableIsTooLarge of int
fun fsmInsert (nextState, buf) curState tk (c :: cs) =
let
@@ -187,7 +209,7 @@ fun fsmTableCreate () =
let
open Array
- val T as (_, buf) =
+ val T as (nextState, buf) =
(ref 1, array (maxStates, (Invalid, array (128, ~1))))
val r = ref 1
@@ -196,13 +218,19 @@ let
filterNeeded tks (if isCppDir tk then acc else T :: acc)
val tokenRepr = filterNeeded tokenRepr []
+
+ fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle
+ Subscript => raise FsmTableIsTooSmall
in
while !r < length buf do (
update (buf, !r, (Invalid, array(128, ~1)));
r := !r + 1
);
- List.app (fn (v, p) => fsmInsert T 0 v (explode p)) tokenRepr;
+ List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr;
+ if !nextState <> maxStates then
+ raise FsmTableIsTooLarge $ !nextState
+ else ();
T
end
@@ -223,6 +251,8 @@ val isStartForFsm = isStartForFsmGen ()
val fsmTable = fsmTableCreate ()
+val () = printLn $ "states: " ^ Int.toString (!(#1 fsmTable))
+
fun fsmEat stream =
let
open Array
@@ -353,7 +383,60 @@ in
finalize (ungetc stream)
end
-fun tokenize stream (tkl: fullToken list) =
+exception IncompleteEsqSeq
+
+fun eatEscSeq stream =
+let
+ val (c, stream) = getchar stream handle
+ _ => raise IncompleteEsqSeq
+in
+ (case c of
+ #"\\" => #"\\"
+ | #"t" => #"\t"
+ | #"n" => #"\n"
+ | c => c,
+ stream)
+end
+
+datatype CharParseState = ChInit | ChStart | ChValue of int | ChTerm
+exception CharConstExn of string
+
+val chExnExpValue = CharConstExn "expected value after '"
+val chExnExpTerm = CharConstExn "expected ' after value"
+val chExnStart = CharConstExn "can not start parsing (unreachable)"
+val chExnAfterTerm = CharConstExn "can not finish parsing (unreachable)"
+
+fun charParser ChInit (stream, _) (SOME c) =
+ if c = #"'" then
+ (ChStart, NONE, stream)
+ else
+ raise chExnStart
+ | charParser ChStart (stream, _) (SOME c) =
+ if c <> #"'" then
+ let
+ val (c, stream) =
+ if c <> #"\\" then (c, stream) else eatEscSeq stream
+ in
+ (ChValue $ ord c, NONE, stream)
+ end
+ else
+ raise chExnExpValue
+ | charParser (ChValue v) (stream, startOff) (SOME c) =
+ if c = #"'" then
+ (ChTerm,
+ SOME $ CharConst (finishSeqRead startOff stream, v), stream)
+ else
+ raise chExnExpTerm
+ | charParser chTerm _ (SOME _) =
+ raise chExnAfterTerm
+ | charParser state (stream, _) NONE =
+ raise case state of
+ ChInit => chExnStart
+ | ChStart => chExnExpValue
+ | ChValue _ => chExnExpTerm
+ | ChTerm => chExnAfterTerm
+
+fun tokenize stream tkl =
let
fun getcharSkipEof stream = getchar stream handle
EndOfFile => getcharSkipEof (advanceToNewFile stream)
@@ -362,9 +445,8 @@ let
handle
EndOfStream => (NONE, stream)
- fun % tk = (getPosAfterCharRead stream, tk)
-
- fun @-> (tk, stream) = tokenize stream (tk :: tkl)
+ fun cont (tk, stream) = tokenize stream (tk :: tkl)
+ fun @-> parser acc = cont $ parseGeneric stream parser acc
in
case c of
NONE => (rev tkl, #allFiles stream)
@@ -381,9 +463,11 @@ in
(if isCppDir then (formCppDir (#2 tk) tkl) else (tk :: tkl))
end
else if isDigit c then
- @-> $ parseGeneric stream numParser ()
+ @-> numParser ()
+ else if c = #"'" then
+ @-> charParser ChInit
else if isStartForFsm c then
- @-> $ fsmEat stream
+ cont $ fsmEat stream
else
raise UnexpectedCharacter c
end