From 11e14dd4b93154964c87fc97cfcee62c52edf97a Mon Sep 17 00:00:00 2001 From: Vladimir Azarov Date: Thu, 27 Mar 2025 14:46:47 +0100 Subject: Number constant parser --- exn_handler.sml | 3 +- stream.sml | 16 ++--- tokenizer.sml | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 199 insertions(+), 32 deletions(-) diff --git a/exn_handler.sml b/exn_handler.sml index 32ad20b..52a2c5c 100644 --- a/exn_handler.sml +++ b/exn_handler.sml @@ -18,11 +18,10 @@ structure GlobalExnHandler: sig val handler: exn -> unit end = struct fun ioExn (IO.Io { name, function = _, cause }) = let - open OS val prefix = name ^ ": " val reason = case cause of - SysErr (str, _) => str + OS.SysErr (str, _) => str | _ => exnMessage cause in printLn $ prefix ^ reason diff --git a/stream.sml b/stream.sml index eb2cea1..1755817 100644 --- a/stream.sml +++ b/stream.sml @@ -42,16 +42,6 @@ structure Stream :> STREAM = struct print $ fname ^ ":" ^ line ^ ":" ^ col ^ ": " end - fun readFile fname = - let - open TextIO - val h = openIn fname - val s = inputAll h - val () = closeIn h - in - s - end - fun getchar (S as (fid, fname, off, contents)) = if off < String.size contents then (SOME $ String.sub (contents, off), (fid, fname, off + 1, contents)) @@ -83,7 +73,11 @@ structure Stream :> STREAM = struct fun streamInit fname = let - val contents = readFile fname + open TextIO + + val h = openIn fname + val contents = inputAll h + val () = closeIn h in (0, fname, 0, contents) end diff --git a/tokenizer.sml b/tokenizer.sml index ba9d0df..41f8d38 100644 --- a/tokenizer.sml +++ b/tokenizer.sml @@ -1,9 +1,20 @@ structure Tokenizer:> TOKENIZER = struct + + datatype intType = ItDec | ItOct | ItHex + datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL + datatype floatSfx = FsNone | FsF | FsL + + datatype numConst = + IntConst of intType * string * intSfx | + FloatConst of string * floatSfx + + datatype token = Invalid | NewLine | - Number of string | + Num of numConst | + Id of string | CharConst of string * int | StringConst of string | @@ -122,6 +133,7 @@ structure Tokenizer:> TOKENIZER = struct (* Unreachable (should be) *) exception TokenWithoutRepr + exception SuffixWithoutRepr val tokenRepr = let @@ -231,9 +243,47 @@ structure Tokenizer:> TOKENIZER = struct ] end + val intSuffixRepr = [ + (IsNone, ""), + (IsU, "u"), + (IsL, "l"), + (IsUL, "ul"), + (IsLL, "ll"), + (IsULL, "ull") + ] + + val floatSuffixRepr = [ + (FsNone, ""), + (FsF, "f"), + (FsL, "l") + ] + + fun getSfxRepr sfx buf onError = + case List.find (fn (sfx', _) => sfx' = sfx) buf of + NONE => onError () + | SOME (_, repr) => repr + + fun getSfxReprSimple sfx buf = + getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr) + val printToken = fn - Number s => print $ "`" ^ s ^ "`" - | Id s => print $ "id:" ^ s + Id s => print $ "id:" ^ s + | Num (IntConst (it, str, sfx)) => + let + val intType = + case it of + ItDec => "" + | ItOct => "0" + | ItHex => "0x" + in + print intType; + print str; + print $ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`" + end + | Num (FloatConst (str, sfx)) => ( + print str; + print $ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`" + ) | CharConst (repr, _) => print repr | StringConst s => print $ "\"" ^ s ^ "\"" @@ -245,6 +295,10 @@ structure Tokenizer:> TOKENIZER = struct fun isIdStart c = Char.isAlpha c orelse c = #"_" fun isIdBody c = Char.isAlphaNum c orelse c = #"_" + fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8" + val isDigit = Char.isDigit + val isHexDigit = Char.isHexDigit + fun isPrintable c = Char.isPrint c andalso c <> #" " (* FSM for parsing symbols *) @@ -475,24 +529,145 @@ structure Tokenizer:> TOKENIZER = struct finalize (Stream.ungetc stream) end - fun numParser () (stream, startOff) c = + datatype intMode = ImDec | ImOct | ImInvalidOct | ImHex + datatype floatMode = FmDot | FmExp + + datatype npState = NpInit | IntMode of intMode | FloatMode of floatMode + + fun getLongestSeq acc pred stream = let - fun finalize stream = - ((), SOME $ Number (finishSeqRead startOff stream), stream) + val (c, stream') = Stream.getchar stream in - case c of - NONE => finalize stream - | SOME c => - if Char.isDigit c then - ((), NONE, stream) - else - finalize (Stream.ungetc stream) + if isSome c andalso pred $ valOf c then + getLongestSeq (valOf c :: acc) pred stream' + else + (implode $ rev acc, stream) end - fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8" + fun skipDigitSeq off stream = + let + val (res, stream) = getLongestSeq [] isDigit stream + in + if res = "" then + raise TkError (TkiDx off, "expected digit") + else + (String.size res, stream) + end + + fun getSuffixCommon buf off stream = + let + val (sfx, stream) = getLongestSeq [] Char.isAlpha stream + val sfx = String.map (fn c => Char.toLower c) sfx + val sfx = + case List.find (fn (_, repr) => repr = sfx) buf of + NONE => raise TkError (TkiDx off, "unknown suffix") + | SOME (sfx, _) => sfx + in + (sfx, stream) + end + + val getIntSuffix = getSuffixCommon intSuffixRepr 0 + fun getFloatSuffix off = getSuffixCommon floatSuffixRepr off + + (* + * It seems that there is an ambiguity in C: + * gcc/clang/cparser consider 0xe-3 as a fp constant and reject it. + * Right now we do not, but it may be reconsidered. + *) + fun numParser NpInit (stream, _) (SOME c) = + if c = #"0" then + let + val (c, stream') = Stream.getchar stream + in + case c of + NONE => + (NpInit, SOME $ Num $ IntConst (ItOct, "", IsNone), stream') + | SOME c => + if Char.toLower c = #"x" then + (IntMode ImHex, NONE, stream') + else + (IntMode ImOct, NONE, stream) + end + else + (IntMode ImDec, NONE, stream) + | numParser NpInit _ NONE = raise Unreachable + | numParser (IntMode mode) (stream, startOff) c = + let + val (pred, res, offset) = + case mode of + ImDec => (isDigit, ItDec, 0) + | ImOct => (isOctal, ItOct, 1) + | ImInvalidOct => (isDigit, ItOct, 1) + | ImHex => (isHexDigit, ItHex, 2) + + fun checkAndRaise m msg = + if mode = m then raise TkError (TkiStart, msg) else () + + fun finish () = + let + val () = checkAndRaise ImInvalidOct "invalid octal constant" + val stream = Stream.ungetc stream + val str = finishSeqRead (startOff + offset) stream + val (sfx, stream) = getIntSuffix stream + in + (IntMode mode, SOME $ Num $ IntConst (res, str, sfx), stream) + end + in + case c of + NONE => finish () + | SOME c => + if pred c then + (IntMode mode, NONE, stream) + else if c = #"." then + (FloatMode FmDot, NONE, stream) + else if Char.toLower c = #"e" then + (checkAndRaise ImHex + "floating constant can not come with 0x prefix"; + (FloatMode FmExp, NONE, stream)) + else + finish () + end + | numParser (FloatMode FmDot) (stream, startOff) _ = + let + val (len, stream) = skipDigitSeq 0 $ Stream.ungetc stream + + val (c, stream') = Stream.getchar stream + + fun finish () = + let + val str = finishSeqRead startOff stream + val (sfx, stream) = getFloatSuffix len stream + in + (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) + end + in + case c of + NONE => finish () + | SOME c => + if Char.toLower c = #"e" then + (FloatMode FmExp, NONE, stream') + else + finish () + end + | numParser (FloatMode FmExp) (stream, startOff) c = + let + val (off, stream) = + if c = NONE then + raise TkError (TkiDx 0, "expected digit") + else if valOf c <> #"+" andalso valOf c <> #"-" then + (0, Stream.ungetc stream) + else + (~1, stream) + + val (len, stream) = skipDigitSeq off stream + val str = finishSeqRead startOff stream + val (sfx, stream) = getFloatSuffix len stream + in + (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream) + end fun chrIntVal c = - if Char.isDigit c then + if isDigit c then ord c - ord #"0" else if ord c >= ord #"a" andalso ord c <= ord #"z" then ord c - ord #"a" + 10 @@ -503,7 +678,6 @@ structure Tokenizer:> TOKENIZER = struct fun parseOctalSeq stream c = let - fun follow stream acc count = if count = 3 then (SOME $ chr acc, stream) @@ -538,7 +712,7 @@ structure Tokenizer:> TOKENIZER = struct else (SOME $ chr acc, stream) | SOME c => - if Char.isHexDigit c then + if isHexDigit c then if count = 2 then raise TkError (TkiDx 2, "hex sequence out of range") else @@ -784,8 +958,8 @@ structure Tokenizer:> TOKENIZER = struct getToken stream else if isIdStart c then @-> idParser () - else if Char.isDigit c then - @-> numParser () + else if isDigit c then + @-> numParser NpInit else if c = #"'" then @-> charParser SeqInit else if c = #"\"" then -- cgit v1.2.3