summaryrefslogtreecommitdiff
path: root/tokenizer.sml
diff options
context:
space:
mode:
Diffstat (limited to 'tokenizer.sml')
-rw-r--r--tokenizer.sml212
1 files changed, 193 insertions, 19 deletions
diff --git a/tokenizer.sml b/tokenizer.sml
index ba9d0df..41f8d38 100644
--- a/tokenizer.sml
+++ b/tokenizer.sml
@@ -1,9 +1,20 @@
structure Tokenizer:> TOKENIZER = struct
+
+ datatype intType = ItDec | ItOct | ItHex
+ datatype intSfx = IsNone | IsU | IsL | IsUL | IsLL | IsULL
+ datatype floatSfx = FsNone | FsF | FsL
+
+ datatype numConst =
+ IntConst of intType * string * intSfx |
+ FloatConst of string * floatSfx
+
+
datatype token =
Invalid |
NewLine |
- Number of string |
+ Num of numConst |
+
Id of string |
CharConst of string * int |
StringConst of string |
@@ -122,6 +133,7 @@ structure Tokenizer:> TOKENIZER = struct
(* Unreachable (should be) *)
exception TokenWithoutRepr
+ exception SuffixWithoutRepr
val tokenRepr =
let
@@ -231,9 +243,47 @@ structure Tokenizer:> TOKENIZER = struct
]
end
+ val intSuffixRepr = [
+ (IsNone, ""),
+ (IsU, "u"),
+ (IsL, "l"),
+ (IsUL, "ul"),
+ (IsLL, "ll"),
+ (IsULL, "ull")
+ ]
+
+ val floatSuffixRepr = [
+ (FsNone, ""),
+ (FsF, "f"),
+ (FsL, "l")
+ ]
+
+ fun getSfxRepr sfx buf onError =
+ case List.find (fn (sfx', _) => sfx' = sfx) buf of
+ NONE => onError ()
+ | SOME (_, repr) => repr
+
+ fun getSfxReprSimple sfx buf =
+ getSfxRepr sfx buf (fn () => raise SuffixWithoutRepr)
+
val printToken = fn
- Number s => print $ "`" ^ s ^ "`"
- | Id s => print $ "id:" ^ s
+ Id s => print $ "id:" ^ s
+ | Num (IntConst (it, str, sfx)) =>
+ let
+ val intType =
+ case it of
+ ItDec => ""
+ | ItOct => "0"
+ | ItHex => "0x"
+ in
+ print intType;
+ print str;
+ print $ "`" ^ getSfxReprSimple sfx intSuffixRepr ^ "`"
+ end
+ | Num (FloatConst (str, sfx)) => (
+ print str;
+ print $ "`" ^ getSfxReprSimple sfx floatSuffixRepr ^ "`"
+ )
| CharConst (repr, _) => print repr
| StringConst s =>
print $ "\"" ^ s ^ "\""
@@ -245,6 +295,10 @@ structure Tokenizer:> TOKENIZER = struct
fun isIdStart c = Char.isAlpha c orelse c = #"_"
fun isIdBody c = Char.isAlphaNum c orelse c = #"_"
+ fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8"
+ val isDigit = Char.isDigit
+ val isHexDigit = Char.isHexDigit
+
fun isPrintable c = Char.isPrint c andalso c <> #" "
(* FSM for parsing symbols *)
@@ -475,24 +529,145 @@ structure Tokenizer:> TOKENIZER = struct
finalize (Stream.ungetc stream)
end
- fun numParser () (stream, startOff) c =
+ datatype intMode = ImDec | ImOct | ImInvalidOct | ImHex
+ datatype floatMode = FmDot | FmExp
+
+ datatype npState = NpInit | IntMode of intMode | FloatMode of floatMode
+
+ fun getLongestSeq acc pred stream =
let
- fun finalize stream =
- ((), SOME $ Number (finishSeqRead startOff stream), stream)
+ val (c, stream') = Stream.getchar stream
in
- case c of
- NONE => finalize stream
- | SOME c =>
- if Char.isDigit c then
- ((), NONE, stream)
- else
- finalize (Stream.ungetc stream)
+ if isSome c andalso pred $ valOf c then
+ getLongestSeq (valOf c :: acc) pred stream'
+ else
+ (implode $ rev acc, stream)
end
- fun isOctal c = ord c >= ord #"0" andalso ord c < ord #"8"
+ fun skipDigitSeq off stream =
+ let
+ val (res, stream) = getLongestSeq [] isDigit stream
+ in
+ if res = "" then
+ raise TkError (TkiDx off, "expected digit")
+ else
+ (String.size res, stream)
+ end
+
+ fun getSuffixCommon buf off stream =
+ let
+ val (sfx, stream) = getLongestSeq [] Char.isAlpha stream
+ val sfx = String.map (fn c => Char.toLower c) sfx
+ val sfx =
+ case List.find (fn (_, repr) => repr = sfx) buf of
+ NONE => raise TkError (TkiDx off, "unknown suffix")
+ | SOME (sfx, _) => sfx
+ in
+ (sfx, stream)
+ end
+
+ val getIntSuffix = getSuffixCommon intSuffixRepr 0
+ fun getFloatSuffix off = getSuffixCommon floatSuffixRepr off
+
+ (*
+ * It seems that there is an ambiguity in C:
+ * gcc/clang/cparser consider 0xe-3 as a fp constant and reject it.
+ * Right now we do not, but it may be reconsidered.
+ *)
+ fun numParser NpInit (stream, _) (SOME c) =
+ if c = #"0" then
+ let
+ val (c, stream') = Stream.getchar stream
+ in
+ case c of
+ NONE =>
+ (NpInit, SOME $ Num $ IntConst (ItOct, "", IsNone), stream')
+ | SOME c =>
+ if Char.toLower c = #"x" then
+ (IntMode ImHex, NONE, stream')
+ else
+ (IntMode ImOct, NONE, stream)
+ end
+ else
+ (IntMode ImDec, NONE, stream)
+ | numParser NpInit _ NONE = raise Unreachable
+ | numParser (IntMode mode) (stream, startOff) c =
+ let
+ val (pred, res, offset) =
+ case mode of
+ ImDec => (isDigit, ItDec, 0)
+ | ImOct => (isOctal, ItOct, 1)
+ | ImInvalidOct => (isDigit, ItOct, 1)
+ | ImHex => (isHexDigit, ItHex, 2)
+
+ fun checkAndRaise m msg =
+ if mode = m then raise TkError (TkiStart, msg) else ()
+
+ fun finish () =
+ let
+ val () = checkAndRaise ImInvalidOct "invalid octal constant"
+ val stream = Stream.ungetc stream
+ val str = finishSeqRead (startOff + offset) stream
+ val (sfx, stream) = getIntSuffix stream
+ in
+ (IntMode mode, SOME $ Num $ IntConst (res, str, sfx), stream)
+ end
+ in
+ case c of
+ NONE => finish ()
+ | SOME c =>
+ if pred c then
+ (IntMode mode, NONE, stream)
+ else if c = #"." then
+ (FloatMode FmDot, NONE, stream)
+ else if Char.toLower c = #"e" then
+ (checkAndRaise ImHex
+ "floating constant can not come with 0x prefix";
+ (FloatMode FmExp, NONE, stream))
+ else
+ finish ()
+ end
+ | numParser (FloatMode FmDot) (stream, startOff) _ =
+ let
+ val (len, stream) = skipDigitSeq 0 $ Stream.ungetc stream
+
+ val (c, stream') = Stream.getchar stream
+
+ fun finish () =
+ let
+ val str = finishSeqRead startOff stream
+ val (sfx, stream) = getFloatSuffix len stream
+ in
+ (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream)
+ end
+ in
+ case c of
+ NONE => finish ()
+ | SOME c =>
+ if Char.toLower c = #"e" then
+ (FloatMode FmExp, NONE, stream')
+ else
+ finish ()
+ end
+ | numParser (FloatMode FmExp) (stream, startOff) c =
+ let
+ val (off, stream) =
+ if c = NONE then
+ raise TkError (TkiDx 0, "expected digit")
+ else if valOf c <> #"+" andalso valOf c <> #"-" then
+ (0, Stream.ungetc stream)
+ else
+ (~1, stream)
+
+ val (len, stream) = skipDigitSeq off stream
+ val str = finishSeqRead startOff stream
+ val (sfx, stream) = getFloatSuffix len stream
+ in
+ (FloatMode FmDot, SOME $ Num $ FloatConst (str, sfx), stream)
+ end
fun chrIntVal c =
- if Char.isDigit c then
+ if isDigit c then
ord c - ord #"0"
else if ord c >= ord #"a" andalso ord c <= ord #"z" then
ord c - ord #"a" + 10
@@ -503,7 +678,6 @@ structure Tokenizer:> TOKENIZER = struct
fun parseOctalSeq stream c =
let
-
fun follow stream acc count =
if count = 3 then
(SOME $ chr acc, stream)
@@ -538,7 +712,7 @@ structure Tokenizer:> TOKENIZER = struct
else
(SOME $ chr acc, stream)
| SOME c =>
- if Char.isHexDigit c then
+ if isHexDigit c then
if count = 2 then
raise TkError (TkiDx 2, "hex sequence out of range")
else
@@ -784,8 +958,8 @@ structure Tokenizer:> TOKENIZER = struct
getToken stream
else if isIdStart c then
@-> idParser ()
- else if Char.isDigit c then
- @-> numParser ()
+ else if isDigit c then
+ @-> numParser NpInit
else if c = #"'" then
@-> charParser SeqInit
else if c = #"\"" then