diff options
author | Vladimir Azarov <avm@intermediate-node.net> | 2025-03-22 20:08:04 +0100 |
---|---|---|
committer | Vladimir Azarov <avm@intermediate-node.net> | 2025-03-22 20:08:04 +0100 |
commit | 8b4d40a1bb5e62788c9e2d8da774596ce8f4dd57 (patch) | |
tree | 4a8e705b6fd09a75910def2d4f2f72fdbafe4d39 | |
parent | 40f6375daa736ca549a1d0cbd6df87dd83df4781 (diff) |
Character literal parser
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | cpp.sml | 108 |
2 files changed, 97 insertions, 12 deletions
@@ -1,2 +1,3 @@ test cpp +todo* @@ -7,6 +7,10 @@ datatype token = Invalid | Number of string | Id of string | + CharConst of string * int | + + OParen | + CParen | Plus | DoublePlus| @@ -16,6 +20,11 @@ datatype token = Semicolon | EqualSign | DoubleEqualSign | + ExclMark | + ExclMarkEqualSign | + + QuestionMark | + Colon | Hash | DoubleHash | @@ -24,6 +33,9 @@ datatype token = CppDefine val tokenRepr = [ + (OParen, "("), + (CParen, ")"), + (Plus, "+"), (DoublePlus, "++"), (Minus, "-"), @@ -32,6 +44,11 @@ val tokenRepr = [ (Semicolon, ";"), (EqualSign, "="), (DoubleEqualSign, "=="), + (ExclMark, "!"), + (ExclMarkEqualSign, "!="), + + (QuestionMark, "?"), + (Colon, ":"), (Hash, "#"), (DoubleHash, "##"), @@ -45,9 +62,11 @@ fun isCppDir CppInclude = true exception TokenWithoutRepr -fun printToken (Number s) = printLn $ "Num: " ^ s - | printToken (Id s) = printLn $ "Id: " ^ s - | printToken v = +val printToken = fn + (Number s) => printLn $ "Num: " ^ s + | (Id s) => printLn $ "Id: " ^ s + | (CharConst (repr, _)) => printLn $ "chr: " ^ repr + | v => case List.find (fn (x, _) => x = v) tokenRepr of SOME (_, repr) => printLn repr | NONE => raise TokenWithoutRepr @@ -158,7 +177,10 @@ end (* FSM for parsing symbols *) -val maxStates = 40 +val maxStates = 21 + +exception FsmTableIsTooSmall +exception FsmTableIsTooLarge of int fun fsmInsert (nextState, buf) curState tk (c :: cs) = let @@ -187,7 +209,7 @@ fun fsmTableCreate () = let open Array - val T as (_, buf) = + val T as (nextState, buf) = (ref 1, array (maxStates, (Invalid, array (128, ~1)))) val r = ref 1 @@ -196,13 +218,19 @@ let filterNeeded tks (if isCppDir tk then acc else T :: acc) val tokenRepr = filterNeeded tokenRepr [] + + fun fsmInsert' T curState tk repr = fsmInsert T curState tk repr handle + Subscript => raise FsmTableIsTooSmall in while !r < length buf do ( update (buf, !r, (Invalid, array(128, ~1))); r := !r + 1 ); - List.app (fn (v, p) => fsmInsert T 0 v (explode p)) tokenRepr; + List.app (fn (v, p) => fsmInsert' T 0 v $ explode p) tokenRepr; + if !nextState <> maxStates then + raise FsmTableIsTooLarge $ !nextState + else (); T end @@ -223,6 +251,8 @@ val isStartForFsm = isStartForFsmGen () val fsmTable = fsmTableCreate () +val () = printLn $ "states: " ^ Int.toString (!(#1 fsmTable)) + fun fsmEat stream = let open Array @@ -353,7 +383,60 @@ in finalize (ungetc stream) end -fun tokenize stream (tkl: fullToken list) = +exception IncompleteEsqSeq + +fun eatEscSeq stream = +let + val (c, stream) = getchar stream handle + _ => raise IncompleteEsqSeq +in + (case c of + #"\\" => #"\\" + | #"t" => #"\t" + | #"n" => #"\n" + | c => c, + stream) +end + +datatype CharParseState = ChInit | ChStart | ChValue of int | ChTerm +exception CharConstExn of string + +val chExnExpValue = CharConstExn "expected value after '" +val chExnExpTerm = CharConstExn "expected ' after value" +val chExnStart = CharConstExn "can not start parsing (unreachable)" +val chExnAfterTerm = CharConstExn "can not finish parsing (unreachable)" + +fun charParser ChInit (stream, _) (SOME c) = + if c = #"'" then + (ChStart, NONE, stream) + else + raise chExnStart + | charParser ChStart (stream, _) (SOME c) = + if c <> #"'" then + let + val (c, stream) = + if c <> #"\\" then (c, stream) else eatEscSeq stream + in + (ChValue $ ord c, NONE, stream) + end + else + raise chExnExpValue + | charParser (ChValue v) (stream, startOff) (SOME c) = + if c = #"'" then + (ChTerm, + SOME $ CharConst (finishSeqRead startOff stream, v), stream) + else + raise chExnExpTerm + | charParser chTerm _ (SOME _) = + raise chExnAfterTerm + | charParser state (stream, _) NONE = + raise case state of + ChInit => chExnStart + | ChStart => chExnExpValue + | ChValue _ => chExnExpTerm + | ChTerm => chExnAfterTerm + +fun tokenize stream tkl = let fun getcharSkipEof stream = getchar stream handle EndOfFile => getcharSkipEof (advanceToNewFile stream) @@ -362,9 +445,8 @@ let handle EndOfStream => (NONE, stream) - fun % tk = (getPosAfterCharRead stream, tk) - - fun @-> (tk, stream) = tokenize stream (tk :: tkl) + fun cont (tk, stream) = tokenize stream (tk :: tkl) + fun @-> parser acc = cont $ parseGeneric stream parser acc in case c of NONE => (rev tkl, #allFiles stream) @@ -381,9 +463,11 @@ in (if isCppDir then (formCppDir (#2 tk) tkl) else (tk :: tkl)) end else if isDigit c then - @-> $ parseGeneric stream numParser () + @-> numParser () + else if c = #"'" then + @-> charParser ChInit else if isStartForFsm c then - @-> $ fsmEat stream + cont $ fsmEat stream else raise UnexpectedCharacter c end |