(* diderot.lex
 *
 * COPYRIGHT (c) 2010 The Diderot Project (http://diderot.cs.uchicago.edu)
 * All rights reserved.
 *
 * ml-ulex specification for the Diderot lexer.  The lexer has four start
 * states:
 *
 *   INITIAL  -- ordinary program text (keywords, operators, literals)
 *   STRING   -- inside a double-quoted string literal
 *   COM1     -- inside a "//" comment, which runs to the end of the line
 *   COM2     -- inside a slash-star comment, which runs to the closing
 *               star-slash
 *
 * Every rule below is tagged with the state in which it applies.
 *)

%name DiderotLex;

(* Error-reporting callback.  The rules call it as
 * lexErr(position, list-of-message-fragments).
 *)
%arg (lexErr);

%defs(

    structure T = DiderotTokens

  (* some type lex_result is necessitated by ml-ulex *)
    type lex_result = T.token

  (* the depth int ref will be used for keeping track of comment depth.
   * NOTE: none of the rules in this specification read or update it yet.
   *)
    val depth = ref 0

  (* list of string fragments to concatenate; the fragments are kept in
   * reverse order (most recently added fragment first).
   *)
    val buf : string list ref = ref []

  (* add a string fragment to the buffer *)
    fun addStr s = (buf := s :: !buf)

  (* make a STRING token from the fragments accumulated in buf (restoring
   * their original order) and reset buf for the next literal.
   *)
    fun mkString () = let
	  val s = String.concat(List.rev(!buf))
	  in
	    buf := [];
	    T.STRING s
	  end

  (* make a FLOAT token from a substring of the form
   *
   *	[+-]? digits "." digits ( [eE] signed-int )?
   *
   * The text is split into sign, whole part, fractional part, and exponent;
   * the exponent is 0 when no exponent part is present.
   *)
    fun mkFloat ss = let
	  val (isNeg, rest) = (case Substring.getc ss
		 of SOME(#"-", r) => (true, r)
		  | SOME(#"+", r) => (false, r)
		  | _ => (false, ss)
		(* end case *))
	  val (whole, rest) = Substring.splitl Char.isDigit rest
	  val rest = Substring.triml 1 rest (* remove "." *)
	  val (frac, rest) = Substring.splitl Char.isDigit rest
	  val exp = if Substring.isEmpty rest
		then 0
		else let
		  val rest = Substring.triml 1 rest (* remove "e" or "E" *)
		  in
		    #1(valOf(Int.scan StringCvt.DEC Substring.getc rest))
		  end
	  in
	    T.FLOAT(FloatLit.float{
		isNeg = isNeg,
		whole = Substring.string whole,
		frac = Substring.string frac,
		exp = exp
	      })
	  end

  (* scan a number from a hexadecimal string *)
    val fromHexString = valOf o (StringCvt.scanString (IntInf.scan StringCvt.HEX))
  (* FIXME: the above code doesn't work in SML/NJ; here is a work around.
   * This definition shadows the one above.  It drops the first two
   * characters of s (the "0x" prefix of a {hexnum}) before scanning, and
   * raises Bind if no hexadecimal digits can be scanned.
   *)
    fun fromHexString s = let
	  val SOME(n, _) = IntInf.scan StringCvt.HEX Substring.getc
		(Substring.triml 2 (Substring.full s))
	  in
	    n
	  end

  (* eof : unit -> lex_result *)
  (* ml-ulex requires this as well *)
    fun eof () = T.EOF
);

%states INITIAL STRING COM1 COM2;

%let letter = [a-zA-Z];
%let dig = [0-9];
%let num = {dig}+;
%let hexdigit = [0-9a-fA-F];
%let hexnum = "0x"{hexdigit}+;
%let idchar = {letter}|{dig}|"_"|"'";
%let id = {letter}{idchar}*;
%let ws = " "|[\t\n\v\f\r];
%let esc = "\\"[abfnrtv\\\"]|"\\"{dig}{dig}{dig};
%let sgood = [\032-\126]&[^\"\\]; (* sgood means "characters good inside strings" *)
%let eol = "\n";

(***** Keywords and operators *****)

<INITIAL> "||"		=> (T.OP_orelse);
<INITIAL> "&&"		=> (T.OP_andalso);
<INITIAL> "<"		=> (T.OP_lt);
<INITIAL> "<="		=> (T.OP_lte);
<INITIAL> "=="		=> (T.OP_eqeq);
<INITIAL> "!="		=> (T.OP_neq);
<INITIAL> ">="		=> (T.OP_gte);
<INITIAL> ">"		=> (T.OP_gt);
<INITIAL> "+"		=> (T.OP_plus);
<INITIAL> "-"		=> (T.OP_minus);
<INITIAL> "*"		=> (T.OP_star);
<INITIAL> "/"		=> (T.OP_slash);
<INITIAL> "@"		=> (T.OP_at);
<INITIAL> "("		=> (T.LP);
<INITIAL> ")"		=> (T.RP);
<INITIAL> "["		=> (T.LB);
<INITIAL> "]"		=> (T.RB);
<INITIAL> "{"		=> (T.LCB);
<INITIAL> "}"		=> (T.RCB);
<INITIAL> ","		=> (T.COMMA);
<INITIAL> ";"		=> (T.SEMI);
<INITIAL> "#"		=> (T.HASH);
<INITIAL> "!"		=> (T.BANG);
<INITIAL> "="		=> (T.OP_eq);
<INITIAL> "|"		=> (T.BAR);
<INITIAL> ".."		=> (T.DOTDOT);

(* identifiers and keywords are distinguished by Keywords.idToken *)
<INITIAL> {id}		=> (Keywords.idToken yytext);

<INITIAL> {num}		=> (T.INT(valOf (IntInf.fromString yytext)));
<INITIAL> {num}"."{num}([eE][+-]?{num})?
			=> (mkFloat yysubstr);
<INITIAL> {ws}		=> (skip ());

(* opening quote: the literal's contents are collected by the STRING rules *)
<INITIAL> "\""		=> (YYBEGIN STRING; continue());

(* any other character is reported and then skipped *)
<INITIAL> .		=> (lexErr(yypos, ["bad character `", String.toString yytext, "'"]);
			    continue());

(***** Strings *****)

(* escape sequences are converted to the character they denote *)
<STRING> {esc}		=> (addStr(valOf(String.fromString yytext)); continue());
<STRING> {sgood}+	=> (addStr yytext; continue());
(* closing quote: emit the accumulated literal *)
<STRING> "\""		=> (YYBEGIN INITIAL; mkString());
<STRING> .		=> (lexErr(yypos, [
				"bad character `", String.toString yytext,
				"' in string literal"
			      ]);
			    continue());

(***** Comments *****)

(* line comments: everything up to and including the end of line is skipped *)
<INITIAL> "//"		=> (YYBEGIN COM1; skip());
<COM1> {eol}		=> (YYBEGIN INITIAL; skip());
<COM1> .		=> (skip());

(* block comments: the first star-slash ends the comment (no nesting) *)
<INITIAL> "/*"		=> (YYBEGIN COM2; skip());
<COM2> "*/"		=> (YYBEGIN INITIAL; skip());
<COM2> .		=> (skip());
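(* NOTE: the text below is residue from the web page this file was extracted
 * from; it is not part of the lexer specification.
 *
 *   Click to toggle
 *   does not end with </html> tag
 *   does not end with </body> tag
 *   The output has ended thus: > "/*" => (YYBEGIN COM2; skip()); <COM2> "*/" => (YYBEGIN INITIAL; skip()); <COM2> . => (skip());
 *)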