From 40f6375daa736ca549a1d0cbd6df87dd83df4781 Mon Sep 17 00:00:00 2001 From: Vladimir Azarov Date: Sat, 22 Mar 2025 17:58:30 +0100 Subject: Symbol parser --- cpp.sml | 280 ++++++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 227 insertions(+), 53 deletions(-) (limited to 'cpp.sml') diff --git a/cpp.sml b/cpp.sml index ea3b35c..99f8b36 100644 --- a/cpp.sml +++ b/cpp.sml @@ -4,19 +4,53 @@ infixr 0 $ fun printLn s = (print s; print "\n") datatype token = - Number of substring | - Id of substring | + Invalid | + Number of string | + Id of string | + Plus | + DoublePlus| + Minus | + DoubleMinus | + Semicolon | + EqualSign | + DoubleEqualSign | + Hash | - CppInclude + DoubleHash | + + CppInclude | + CppDefine + +val tokenRepr = [ + (Plus, "+"), + (DoublePlus, "++"), + (Minus, "-"), + (DoubleMinus, "--"), + + (Semicolon, ";"), + (EqualSign, "="), + (DoubleEqualSign, "=="), + + (Hash, "#"), + (DoubleHash, "##"), -fun printToken (Number s) = printLn $ "Number: " ^ Substring.string s - | printToken (Id s) = printLn $ "Id: " ^ Substring.string s - | printToken Plus = printLn "+" - | printToken Semicolon = printLn ";" - | printToken Hash = printLn "#" - | printToken CppInclude = printLn "$include" + (CppInclude, "$include"), + (CppDefine, "$define") +] + +fun isCppDir CppInclude = true + | isCppDir _ = false + +exception TokenWithoutRepr + +fun printToken (Number s) = printLn $ "Num: " ^ s + | printToken (Id s) = printLn $ "Id: " ^ s + | printToken v = + case List.find (fn (x, _) => x = v) tokenRepr of + SOME (_, repr) => printLn repr + | NONE => raise TokenWithoutRepr type fileId = int type fileOffset = int @@ -41,10 +75,9 @@ let if cur = offset then (line, col) else - if String.sub (s, cur) = #"\n" then - calc s (cur + 1) offset (line + 1, 1) - else - calc s (cur + 1) offset (line, col + 1) + calc s (cur + 1) offset + (if String.sub (s, cur) = #"\n" then (line + 1, 1) + else (line, col + 1)) in calc s 0 offset (1, 1) end @@ -97,8 +130,7 @@ fun getOffset ({ stack = (id, off, contents) :: _, ... }: stream) = off | getOffset _ = raise InvalidStream fun getPosAfterCharRead - ({ stack = (id, off, contents) :: _, ... }: stream) = - (id, off - 1) + ({ stack = (id, off, contents) :: _, ... }: stream) = (id, off - 1) | getPosAfterCharRead _ = raise InvalidStream fun getPos ({ stack = (id, off, contents) :: _, ... }: stream) = (id, off) @@ -106,7 +138,7 @@ fun getPos ({ stack = (id, off, contents) :: _, ... }: stream) = (id, off) fun getSubstr startOff endOff ({ stack = (_, _, contents) :: _, ... }: stream) = - Substring.substring (contents, startOff, endOff - startOff) + String.substring (contents, startOff, endOff - startOff) | getSubstr _ _ _ = raise InvalidStream fun advanceToNewFile @@ -124,6 +156,101 @@ in { allFiles = [(0, fname, contents)], stack = [(0, 0, contents)] } end +(* FSM for parsing symbols *) + +val maxStates = 40 + +fun fsmInsert (nextState, buf) curState tk (c :: cs) = +let + open Array + + val (_, row) = sub (buf, curState) + val potNextState = sub (row, ord c) +in + if potNextState <> ~1 then + fsmInsert (nextState, buf) potNextState tk cs + else ( + update (row, ord c, !nextState); + nextState := !nextState + 1; + fsmInsert (nextState, buf) (!nextState - 1) tk cs + ) +end + | fsmInsert (nextState, buf) curState tk [] = +let + open Array + val (_, row) = sub (buf, curState) +in + update (buf, curState, (tk, row)) +end + +fun fsmTableCreate () = +let + open Array + + val T as (_, buf) = + (ref 1, array (maxStates, (Invalid, array (128, ~1)))) + val r = ref 1 + + fun filterNeeded [] acc = acc + | filterNeeded ((T as (tk, _)) :: tks) acc = + filterNeeded tks (if isCppDir tk then acc else T :: acc) + + val tokenRepr = filterNeeded tokenRepr [] +in + while !r < length buf do ( + update (buf, !r, (Invalid, array(128, ~1))); + r := !r + 1 + ); + + List.app (fn (v, p) => fsmInsert T 0 v (explode p)) tokenRepr; + T +end + +fun isStartForFsmGen () = +let + open Array + + val lookupTable = array (128, false) + + fun firstChr s = ord $ String.sub (s, 0) + val () = List.app + (fn (_, repr) => update (lookupTable, firstChr repr, true)) tokenRepr +in + fn c => sub (lookupTable, ord c) +end + +val isStartForFsm = isStartForFsmGen () + +val fsmTable = fsmTableCreate () + +fun fsmEat stream = +let + open Array + val stream = ungetc stream + val pos = getPos stream + + fun get curState stream = + let + val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle + _ => (NONE, stream) + in + case c of + NONE => (#1 $ sub (#2 fsmTable, curState), stream) + | SOME c => + let + val (tk, row) = sub (#2 fsmTable, curState) + val nextState = sub (row, ord c) + in + if nextState = ~1 then + (tk, ungetc stream) + else + get nextState stream + end + end +in + (fn (tk, stream) => ((pos, tk), stream)) $ get 0 stream +end + exception UnexpectedCharacter of char fun isSpace c = c = #" " orelse c = #"\t" orelse c = #"\n" @@ -159,7 +286,74 @@ in end end -fun tokenize stream tkl = +fun parseGeneric stream parser acc = +let + val stream = ungetc stream + val P as (fid, startOff) = getPos stream + + fun parse' stream acc = let + val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle + _ => (NONE, stream) + + val (acc, tk, stream) = parser acc (stream, startOff) c + in + case tk of + NONE => parse' stream acc + | _ => (valOf tk, stream) + end + + val (tk, stream) = parse' stream acc +in + ((P, tk): fullToken, stream) +end + +fun finishSeqRead startOff stream = +let + val (_, endOff) = getPos stream + val s = getSubstr startOff endOff stream +in + s +end + +fun idParser () (stream, startOff) c = +let + fun finalize stream = + ((), SOME $ Id (finishSeqRead startOff stream), stream) +in + case c of + NONE => finalize stream + | SOME c => + if isIdBody c then + ((), NONE, stream) + else + finalize (ungetc stream) +end + +fun formCppDir tk tkl = +let + fun conv tk = ((#1 $ hd tkl, tk) :: tl tkl) +in + case tk of + Id "include" => conv CppInclude + | Id "define" => conv CppDefine + | _ => raise Fail $ "Expected cpp direcive\n" +end + +fun numParser () (stream, startOff) c = +let + fun finalize stream = + ((), SOME $ Number (finishSeqRead startOff stream), stream) +in + case c of + NONE => finalize stream + | SOME c => + if isDigit c then + ((), NONE, stream) + else + finalize (ungetc stream) +end + +fun tokenize stream (tkl: fullToken list) = let fun getcharSkipEof stream = getchar stream handle EndOfFile => getcharSkipEof (advanceToNewFile stream) @@ -169,6 +363,8 @@ let EndOfStream => (NONE, stream) fun % tk = (getPosAfterCharRead stream, tk) + + fun @-> (tk, stream) = tokenize stream (tk :: tkl) in case c of NONE => (rev tkl, #allFiles stream) @@ -176,42 +372,20 @@ in if isSpace c then tokenize stream tkl else if isIdStart c then - parseId c stream tkl - else case c of - #"+" => tokenize stream (% Plus :: tkl) - | #";" => tokenize stream (% Semicolon :: tkl) - | #"#" => tokenize stream (% Hash :: tkl) - | _ => raise UnexpectedCharacter c -end and -parseId c stream (tkl: fullToken list) = -let - val P as (_, pos) = getPosAfterCharRead stream - - fun eatId curId stream = - let - val (c, stream) = (fn (c, s) => (SOME c, s)) $ getchar stream handle - _ => (NONE, stream) - in - case c of - NONE => (curId, stream) - | SOME c => - if isIdBody c then - eatId (curId + 1) stream - else - (curId, ungetc stream) - end - - val (endOffset, stream) = eatId (pos + 1) stream - val id = getSubstr pos endOffset stream - -in - if ((fn Hash => true | _ => false) (#2 $ hd tkl) handle Empty => false) - andalso isFirstOnLine (hd tkl) stream - andalso Substring.string id = "include" - then - tokenize stream (((#1 $ hd tkl), CppInclude) :: tl tkl) - else - tokenize stream ((P, Id id) :: tkl) + let + val isCppDir = (fn Hash => true | _ => false) (#2 $ hd tkl) + handle Empty => false + val (tk, stream) = parseGeneric stream idParser () + in + tokenize stream + (if isCppDir then (formCppDir (#2 tk) tkl) else (tk :: tkl)) + end + else if isDigit c then + @-> $ parseGeneric stream numParser () + else if isStartForFsm c then + @-> $ fsmEat stream + else + raise UnexpectedCharacter c end fun main [fname] = -- cgit v1.2.3