SCM Repository
Annotation of /sml/trunk/src/ml-lex/lexgen.sml
Parent Directory
|
Revision Log
Revision 250 - (view) (download)
1 : | monnier | 249 | (* Lexical analyzer generator for Standard ML. |
2 : | Version 1.7.0, June 1998 | ||
3 : | |||
4 : | Copyright (c) 1989-1992 by Andrew W. Appel, | ||
5 : | David R. Tarditi, James S. Mattson | ||
6 : | |||
7 : | This software comes with ABSOLUTELY NO WARRANTY. | ||
8 : | This software is subject only to the PRINCETON STANDARD ML SOFTWARE LIBRARY | ||
9 : | COPYRIGHT NOTICE, LICENSE AND DISCLAIMER, (in the file "COPYRIGHT", | ||
10 : | distributed with this software). You may copy and distribute this software; | ||
11 : | see the COPYRIGHT NOTICE for details and restrictions. | ||
12 : | |||
13 : | Changes: | ||
14 : | 07/25/89 (drt): added %header declaration, code to place | ||
15 : | user declarations at same level as makeLexer, etc. | ||
16 : | This is needed for the parser generator. | ||
17 : | /10/89 (appel): added %arg declaration (see lexgen.doc). | ||
18 : | /04/90 (drt): fixed following bug: couldn't use the lexer after an | ||
19 : | error occurred -- NextTok and inquote weren't being reset | ||
20 : | 10/22/91 (drt): disabled use of lookahead | ||
21 : | 10/23/92 (drt): disabled use of $ operator (which involves lookahead), | ||
22 : | added handlers for dictionary lookup routine | ||
23 : | 11/02/92 (drt): changed handler for exception Reject in generated lexer | ||
24 : | to Internal.Reject | ||
25 : | 02/01/94 (appel): Moved the exception handler for Reject in such | ||
26 : | a way as to allow tail-recursion (improves performance | ||
27 : | wonderfully!). | ||
28 : | 02/01/94 (appel): Fixed a bug in parsing of state names. | ||
29 : | 05/19/94 (Mikael Pettersson, mpe@ida.liu.se): | ||
30 : | Transition tables are usually represented as strings, but | ||
31 : | when the range is too large, int vectors constructed by | ||
32 : | code like "Vector.vector[1,2,3,...]" are used instead. | ||
33 : | The problem with this isn't that the vector itself takes | ||
34 : | a lot of space, but that the code generated by SML/NJ to | ||
35 : | construct the intermediate list at run-time is *HUGE*. My | ||
36 : | fix is to encode an int vector as a string literal (using | ||
37 : | two bytes per int) and emit code to decode the string to | ||
38 : | a vector at run-time. SML/NJ compiles string literals into | ||
39 : | substrings in the code, so this uses much less space. | ||
40 : | 06/02/94 (jhr): Modified export-lex.sml to conform to new installation | ||
41 : | scheme. Also removed tab characters from string literals. | ||
42 : | 10/05/94 (jhr): Changed generator to produce code that uses the new | ||
43 : | basis style strings and characters. | ||
44 : | 10/06/94 (jhr) Modified code to compile under new basis style strings | ||
45 : | and characters. | ||
46 : | 02/08/95 (jhr) Modified to use new List module interface. | ||
47 : | 05/18/95 (jhr) changed Vector.vector to Vector.fromList | ||
48 : | * | ||
49 : | * $Log: lexgen.sml,v $ | ||
50 : | * Revision 1.1.1.1 1998/04/08 18:40:10 george | ||
51 : | * Version 110.5 | ||
52 : | * | ||
53 : | * Revision 1.9 1998/01/06 19:23:53 appel | ||
54 : | * added %posarg feature to permit position-within-file to be passed | ||
55 : | * as a parameter to makeLexer | ||
56 : | * | ||
57 : | # Revision 1.8 1998/01/06 19:01:48 appel | ||
58 : | # repaired error messages like "cannot have both %structure and %header" | ||
59 : | # | ||
60 : | # Revision 1.7 1998/01/06 18:55:49 appel | ||
61 : | # permit %% to be unescaped within regular expressions | ||
62 : | # | ||
63 : | # Revision 1.6 1998/01/06 18:46:13 appel | ||
64 : | # removed undocumented feature that permitted extra %% at end of rules | ||
65 : | # | ||
66 : | # Revision 1.5 1998/01/06 18:29:23 appel | ||
67 : | # put yylineno variable inside makeLexer function | ||
68 : | # | ||
69 : | # Revision 1.4 1998/01/06 18:19:59 appel | ||
70 : | # check for newline inside quoted string | ||
71 : | # | ||
72 : | # Revision 1.3 1997/10/04 03:52:13 dbm | ||
73 : | # Fix to remove output file if ml-lex fails. | ||
74 : | # | ||
75 : | # Revision 1.2 1997/05/06 01:12:38 george | ||
76 : | # *** empty log message *** | ||
77 : | # | ||
78 : | * Revision 1.2 1996/02/26 15:02:27 george | ||
79 : | * print no longer overloaded. | ||
80 : | * use of makestring has been removed and replaced with Int.toString .. | ||
81 : | * use of IO replaced with TextIO | ||
82 : | * | ||
83 : | * Revision 1.1.1.1 1996/01/31 16:01:15 george | ||
84 : | * Version 109 | ||
85 : | * | ||
86 : | *) | ||
87 : | |||
88 : | (* Subject: lookahead in sml-lex | ||
89 : | Reply-to: david.tarditi@CS.CMU.EDU | ||
90 : | Date: Mon, 21 Oct 91 14:13:26 -0400 | ||
91 : | |||
92 : | There is a serious bug in the implementation of lookahead, | ||
93 : | as done in sml-lex, and described in Aho, Sethi, and Ullman, | ||
94 : | p. 134 "Implementing the Lookahead Operator" | ||
95 : | |||
96 : | We have disallowed the use of lookahead for now because | ||
97 : | of this bug. | ||
98 : | |||
99 : | As a counter-example to the implementation described in | ||
100 : | ASU, consider the following specification with the | ||
101 : | input string "aba" (this example is taken from | ||
102 : | a comp.compilers message from Dec. 1989, I think): | ||
103 : | |||
104 : | type lexresult=unit | ||
105 : | val linenum = ref 1 | ||
106 : | fun error x = TextIO.output(TextIO.stdErr, x ^ "\n") | ||
107 : | val eof = fn () => () | ||
108 : | %% | ||
109 : | %structure Lex | ||
110 : | %% | ||
111 : | (a|ab)/ba => (print yytext; print "\n"; ()); | ||
112 : | |||
113 : | The ASU proposal works as follows. Suppose that we are | ||
114 : | using NFA's to represent our regular expressions. Then to | ||
115 : | build an NFA for e1 / e2, we build an NFA n1 for e1 | ||
116 : | and an NFA n2 for e2, and add an epsilon transition | ||
117 : | from e1 to e2. | ||
118 : | |||
119 : | When lexing, when we encounter the end state of e1e2, | ||
120 : | we take as the end of the string the position in | ||
121 : | the string that was the last occurrence of the state of | ||
122 : | the NFA having a transition on the epsilon introduced | ||
123 : | for /. | ||
124 : | |||
125 : | Using the example we have above, we'll have an NFA | ||
126 : | with the following states: | ||
127 : | |||
128 : | |||
129 : | 1 -- a --> 2 -- b --> 3 | ||
130 : | | | | ||
131 : | | epsilon | epsilon | ||
132 : | | | | ||
133 : | |------------> 4 -- b --> 5 -- a --> 6 | ||
134 : | |||
135 : | On our example, we get the following list of transitions: | ||
136 : | |||
137 : | a : 2, 4 (make an epsilon transition from 2 to 4) | ||
138 : | ab : 3, 4, 5 (make an epsilon transition from 3 to 4) | ||
139 : | aba : 6 | ||
140 : | |||
141 : | If we chose the last state in which we made an epsilon transition, | ||
142 : | we'll chose the transition from 3 to 4, and end up with "ab" | ||
143 : | as our token, when we should have "a" as our token. | ||
144 : | |||
145 : | *) | ||
146 : | |||
147 : | functor RedBlack(B : sig type key | ||
148 : | val > : key*key->bool | ||
149 : | end): | ||
150 : | sig type tree | ||
151 : | type key | ||
152 : | val empty : tree | ||
153 : | val insert : key * tree -> tree | ||
154 : | val lookup : key * tree -> key | ||
155 : | exception notfound of key | ||
156 : | end = | ||
157 : | struct | ||
158 : | open B | ||
159 : | datatype color = RED | BLACK | ||
160 : | datatype tree = empty | tree of key * color * tree * tree | ||
161 : | exception notfound of key | ||
162 : | |||
163 : | fun insert (key,t) = | ||
164 : | let fun f empty = tree(key,RED,empty,empty) | ||
165 : | | f (tree(k,BLACK,l,r)) = | ||
166 : | if key>k | ||
167 : | then case f r | ||
168 : | of r as tree(rk,RED, rl as tree(rlk,RED,rll,rlr),rr) => | ||
169 : | (case l | ||
170 : | of tree(lk,RED,ll,lr) => | ||
171 : | tree(k,RED,tree(lk,BLACK,ll,lr), | ||
172 : | tree(rk,BLACK,rl,rr)) | ||
173 : | | _ => tree(rlk,BLACK,tree(k,RED,l,rll), | ||
174 : | tree(rk,RED,rlr,rr))) | ||
175 : | | r as tree(rk,RED,rl, rr as tree(rrk,RED,rrl,rrr)) => | ||
176 : | (case l | ||
177 : | of tree(lk,RED,ll,lr) => | ||
178 : | tree(k,RED,tree(lk,BLACK,ll,lr), | ||
179 : | tree(rk,BLACK,rl,rr)) | ||
180 : | | _ => tree(rk,BLACK,tree(k,RED,l,rl),rr)) | ||
181 : | | r => tree(k,BLACK,l,r) | ||
182 : | else if k>key | ||
183 : | then case f l | ||
184 : | of l as tree(lk,RED,ll, lr as tree(lrk,RED,lrl,lrr)) => | ||
185 : | (case r | ||
186 : | of tree(rk,RED,rl,rr) => | ||
187 : | tree(k,RED,tree(lk,BLACK,ll,lr), | ||
188 : | tree(rk,BLACK,rl,rr)) | ||
189 : | | _ => tree(lrk,BLACK,tree(lk,RED,ll,lrl), | ||
190 : | tree(k,RED,lrr,r))) | ||
191 : | | l as tree(lk,RED, ll as tree(llk,RED,lll,llr), lr) => | ||
192 : | (case r | ||
193 : | of tree(rk,RED,rl,rr) => | ||
194 : | tree(k,RED,tree(lk,BLACK,ll,lr), | ||
195 : | tree(rk,BLACK,rl,rr)) | ||
196 : | | _ => tree(lk,BLACK,ll,tree(k,RED,lr,r))) | ||
197 : | | l => tree(k,BLACK,l,r) | ||
198 : | else tree(key,BLACK,l,r) | ||
199 : | | f (tree(k,RED,l,r)) = | ||
200 : | if key>k then tree(k,RED,l, f r) | ||
201 : | else if k>key then tree(k,RED, f l, r) | ||
202 : | else tree(key,RED,l,r) | ||
203 : | in case f t | ||
204 : | of tree(k,RED, l as tree(_,RED,_,_), r) => tree(k,BLACK,l,r) | ||
205 : | | tree(k,RED, l, r as tree(_,RED,_,_)) => tree(k,BLACK,l,r) | ||
206 : | | t => t | ||
207 : | end | ||
208 : | |||
209 : | |||
210 : | fun lookup (key,t) = | ||
211 : | let fun look empty = raise (notfound key) | ||
212 : | | look (tree(k,_,l,r)) = | ||
213 : | if k>key then look l | ||
214 : | else if key>k then look r | ||
215 : | else k | ||
216 : | in look t | ||
217 : | end | ||
218 : | |||
219 : | end | ||
220 : | |||
221 : | signature LEXGEN = | ||
222 : | sig | ||
223 : | val lexGen: string -> unit | ||
224 : | end | ||
225 : | |||
226 : | structure LexGen: LEXGEN = | ||
227 : | struct | ||
228 : | open Array List | ||
229 : | infix 9 sub | ||
230 : | |||
231 : | datatype token = CHARS of bool array | QMARK | STAR | PLUS | BAR | ||
232 : | | LP | RP | CARAT | DOLLAR | SLASH | STATE of string list | ||
233 : | | REPS of int * int | ID of string | ACTION of string | ||
234 : | | BOF | EOF | ASSIGN | SEMI | ARROW | LEXMARK | LEXSTATES | ||
235 : | | COUNT | REJECT | FULLCHARSET | STRUCT | HEADER | ARG | POSARG | ||
236 : | |||
237 : | datatype exp = EPS | CLASS of bool array * int | CLOSURE of exp | ||
238 : | | ALT of exp * exp | CAT of exp * exp | TRAIL of int | ||
239 : | | END of int | ||
240 : | |||
241 : | (* flags describing input Lex spec. - unnecessary code is omitted *) | ||
242 : | (* if possible *) | ||
243 : | |||
244 : | val CharFormat = ref false; | ||
245 : | val UsesTrailingContext = ref false; | ||
246 : | val UsesPrevNewLine = ref false; | ||
247 : | |||
248 : | (* flags for various bells & whistles that Lex has. These slow the | ||
249 : | lexer down and should be omitted from production lexers (if you | ||
250 : | really want speed) *) | ||
251 : | |||
252 : | val CountNewLines = ref false; | ||
253 : | val PosArg = ref false; | ||
254 : | val HaveReject = ref false; | ||
255 : | |||
256 : | (* Can increase size of character set *) | ||
257 : | |||
258 : | val CharSetSize = ref 129; | ||
259 : | |||
260 : | (* Can name structure or declare header code *) | ||
261 : | |||
262 : | val StrName = ref "Mlex" | ||
263 : | val HeaderCode = ref "" | ||
264 : | val HeaderDecl = ref false | ||
265 : | val ArgCode = ref (NONE: string option) | ||
266 : | val StrDecl = ref false | ||
267 : | |||
268 : | val ResetFlags = fn () => (CountNewLines := false; HaveReject := false; | ||
269 : | PosArg := false; | ||
270 : | UsesTrailingContext := false; | ||
271 : | CharSetSize := 129; StrName := "Mlex"; | ||
272 : | HeaderCode := ""; HeaderDecl:= false; | ||
273 : | ArgCode := NONE; | ||
274 : | StrDecl := false) | ||
275 : | |||
276 : | val LexOut = ref(TextIO.stdOut) | ||
277 : | fun say x = TextIO.output(!LexOut, x) | ||
278 : | |||
279 : | (* Union: merge two sorted lists of integers *) | ||
280 : | |||
281 : | fun union(a,b) = let val rec merge = fn | ||
282 : | (nil,nil,z) => z | ||
283 : | | (nil,el::more,z) => merge(nil,more,el::z) | ||
284 : | | (el::more,nil,z) => merge(more,nil,el::z) | ||
285 : | | (x::morex,y::morey,z) => if (x:int)=(y:int) | ||
286 : | then merge(morex,morey,x::z) | ||
287 : | else if x>y then merge(morex,y::morey,x::z) | ||
288 : | else merge(x::morex,morey,y::z) | ||
289 : | in merge(rev a,rev b,nil) | ||
290 : | end | ||
291 : | |||
292 : | (* Nullable: compute if a important expression parse tree node is nullable *) | ||
293 : | |||
294 : | val rec nullable = fn | ||
295 : | EPS => true | ||
296 : | | CLASS(_) => false | ||
297 : | | CLOSURE(_) => true | ||
298 : | | ALT(n1,n2) => nullable(n1) orelse nullable(n2) | ||
299 : | | CAT(n1,n2) => nullable(n1) andalso nullable(n2) | ||
300 : | | TRAIL(_) => true | ||
301 : | | END(_) => false | ||
302 : | |||
303 : | (* FIRSTPOS: firstpos function for parse tree expressions *) | ||
304 : | |||
305 : | and firstpos = fn | ||
306 : | EPS => nil | ||
307 : | | CLASS(_,i) => [i] | ||
308 : | | CLOSURE(n) => firstpos(n) | ||
309 : | | ALT(n1,n2) => union(firstpos(n1),firstpos(n2)) | ||
310 : | | CAT(n1,n2) => if nullable(n1) then union(firstpos(n1),firstpos(n2)) | ||
311 : | else firstpos(n1) | ||
312 : | | TRAIL(i) => [i] | ||
313 : | | END(i) => [i] | ||
314 : | |||
315 : | (* LASTPOS: Lastpos function for parse tree expressions *) | ||
316 : | |||
317 : | and lastpos = fn | ||
318 : | EPS => nil | ||
319 : | | CLASS(_,i) => [i] | ||
320 : | | CLOSURE(n) => lastpos(n) | ||
321 : | | ALT(n1,n2) => union(lastpos(n1),lastpos(n2)) | ||
322 : | | CAT(n1,n2) => if nullable(n2) then union(lastpos(n1),lastpos(n2)) | ||
323 : | else lastpos(n2) | ||
324 : | | TRAIL(i) => [i] | ||
325 : | | END(i) => [i] | ||
326 : | ; | ||
327 : | |||
328 : | (* ++: Increment an integer reference *) | ||
329 : | |||
330 : | fun ++(x) : int = (x := !x + 1; !x); | ||
331 : | |||
332 : | structure dict = | ||
333 : | struct | ||
334 : | type 'a relation = 'a * 'a -> bool | ||
335 : | abstype ('b,'a) dictionary = DATA of { Table : ('b * 'a) list, | ||
336 : | Leq : 'b * 'b -> bool } | ||
337 : | with | ||
338 : | exception LOOKUP | ||
339 : | fun create Leqfunc = DATA { Table = nil, Leq = Leqfunc } | ||
340 : | fun lookup (DATA { Table = entrylist, Leq = leq }) key = | ||
341 : | let fun search [] = raise LOOKUP | ||
342 : | | search((k,item)::entries) = | ||
343 : | if leq(key,k) | ||
344 : | then if leq(k,key) then item else raise LOOKUP | ||
345 : | else search entries | ||
346 : | in search entrylist | ||
347 : | end | ||
348 : | fun enter (DATA { Table = entrylist, Leq = leq }) | ||
349 : | (newentry as (key : 'b,item :'a)) : ('b,'a) dictionary = | ||
350 : | let val gt = fn a => fn b => not (leq(a,b)) | ||
351 : | val eq = fn k => fn k' => (leq(k,k')) andalso (leq(k',k)) | ||
352 : | fun update nil = [ newentry ] | ||
353 : | | update ((entry as (k,_))::entries) = | ||
354 : | if (eq key k) then newentry::entries | ||
355 : | else if gt k key then newentry::(entry::entries) | ||
356 : | else entry::(update entries) | ||
357 : | in DATA { Table = update entrylist, Leq = leq } | ||
358 : | end | ||
359 : | fun listofdict (DATA { Table = entrylist,Leq = leq}) = | ||
360 : | let fun f (nil,r) = rev r | ||
361 : | | f (a::b,r) = f (b,a::r) | ||
362 : | in f(entrylist,nil) | ||
363 : | end | ||
364 : | end | ||
365 : | end | ||
366 : | |||
367 : | open dict; | ||
368 : | |||
369 : | (* INPUT.ML : Input w/ one character push back capability *) | ||
370 : | |||
371 : | val LineNum = ref 1; | ||
372 : | |||
373 : | abstype ibuf = | ||
374 : | BUF of TextIO.instream * {b : string ref, p : int ref} | ||
375 : | with | ||
376 : | fun make_ibuf(s) = BUF (s, {b=ref"", p = ref 0}) | ||
377 : | fun close_ibuf (BUF (s,_)) = TextIO.closeIn(s) | ||
378 : | exception eof | ||
379 : | fun getch (a as (BUF(s,{b,p}))) = | ||
380 : | if (!p = (size (!b))) | ||
381 : | then (b := TextIO.inputN(s, 1024); | ||
382 : | p := 0; | ||
383 : | if (size (!b))=0 | ||
384 : | then raise eof | ||
385 : | else getch a) | ||
386 : | else (let val ch = String.sub(!b,!p) | ||
387 : | in (if ch = #"\n" | ||
388 : | then LineNum := !LineNum + 1 | ||
389 : | else (); | ||
390 : | p := !p + 1; | ||
391 : | ch) | ||
392 : | end) | ||
393 : | fun ungetch(BUF(s,{b,p})) = ( | ||
394 : | p := !p - 1; | ||
395 : | if String.sub(!b,!p) = #"\n" | ||
396 : | then LineNum := !LineNum - 1 | ||
397 : | else ()) | ||
398 : | end; | ||
399 : | |||
400 : | exception Error | ||
401 : | |||
402 : | fun prErr x = ( | ||
403 : | TextIO.output (TextIO.stdErr, String.concat [ | ||
404 : | "ml-lex: error, line ", (Int.toString (!LineNum)), ": ", x, "\n" | ||
405 : | ]); | ||
406 : | raise Error) | ||
407 : | fun prSynErr x = ( | ||
408 : | TextIO.output (TextIO.stdErr, String.concat [ | ||
409 : | "ml-lex: syntax error, line ", (Int.toString (!LineNum)), ": ", x, "\n" | ||
410 : | ]); | ||
411 : | raise Error) | ||
412 : | |||
413 : | exception SyntaxError; (* error in user's input file *) | ||
414 : | |||
415 : | exception LexError; (* unexpected error in lexer *) | ||
416 : | |||
417 : | val LexBuf = ref(make_ibuf(TextIO.stdIn)); | ||
418 : | val LexState = ref 0; | ||
419 : | val NextTok = ref BOF; | ||
420 : | val inquote = ref false; | ||
421 : | |||
422 : | fun AdvanceTok () : unit = let | ||
423 : | fun isLetter c = | ||
424 : | ((c >= #"a") andalso (c <= #"z")) orelse | ||
425 : | ((c >= #"A") andalso (c <= #"Z")) | ||
426 : | fun isDigit c = (c >= #"0") andalso (c <= #"9") | ||
427 : | (* check for valid (non-leading) identifier character (added by JHR) *) | ||
428 : | fun isIdentChr c = | ||
429 : | ((isLetter c) orelse (isDigit c) orelse (c = #"_") orelse (c = #"'")) | ||
430 : | fun atoi s = let | ||
431 : | fun num (c::r, n) = if isDigit c | ||
432 : | then num (r, 10*n + (Char.ord c - Char.ord #"0")) | ||
433 : | else n | ||
434 : | | num ([], n) = n | ||
435 : | in | ||
436 : | num (explode s, 0) | ||
437 : | end | ||
438 : | |||
439 : | fun skipws () = (case nextch() | ||
440 : | of #" " => skipws() | ||
441 : | | #"\t" => skipws() | ||
442 : | | #"\n" => skipws() | ||
443 : | | x => x | ||
444 : | (* end case *)) | ||
445 : | |||
446 : | and nextch () = getch(!LexBuf) | ||
447 : | |||
448 : | and escaped () = (case nextch() | ||
449 : | of #"b" => #"\008" | ||
450 : | | #"n" => #"\n" | ||
451 : | | #"t" => #"\t" | ||
452 : | | #"h" => #"\128" | ||
453 : | | x => let | ||
454 : | fun err t = prErr("illegal ascii escape '"^(implode(rev t))^"'") | ||
455 : | fun cvt c = (Char.ord c - Char.ord #"0") | ||
456 : | fun f (n, c, t) = if c=3 | ||
457 : | then if n >= (!CharSetSize) | ||
458 : | then err t | ||
459 : | else Char.chr n | ||
460 : | else let val ch=nextch() | ||
461 : | in | ||
462 : | if isDigit ch | ||
463 : | then f(n*10+(cvt ch), c+1, ch::t) | ||
464 : | else err t | ||
465 : | end | ||
466 : | in | ||
467 : | if isDigit x then f(cvt x, 1, [x]) else x | ||
468 : | end | ||
469 : | (* end case *)) | ||
470 : | |||
471 : | and onechar x = let val c = array(!CharSetSize, false) | ||
472 : | in | ||
473 : | update(c, Char.ord(x), true); CHARS(c) | ||
474 : | end | ||
475 : | |||
476 : | in case !LexState of 0 => let val makeTok = fn () => | ||
477 : | case skipws() | ||
478 : | (* Lex % operators *) | ||
479 : | of #"%" => (case nextch() of | ||
480 : | #"%" => LEXMARK | ||
481 : | | a => let fun f s = | ||
482 : | let val a = nextch() | ||
483 : | in if isLetter a then f(a::s) | ||
484 : | else (ungetch(!LexBuf); | ||
485 : | implode(rev s)) | ||
486 : | end | ||
487 : | in case f [a] | ||
488 : | of "reject" => REJECT | ||
489 : | | "count" => COUNT | ||
490 : | | "full" => FULLCHARSET | ||
491 : | | "s" => LEXSTATES | ||
492 : | | "S" => LEXSTATES | ||
493 : | | "structure" => STRUCT | ||
494 : | | "header" => HEADER | ||
495 : | | "arg" => ARG | ||
496 : | | "posarg" => POSARG | ||
497 : | | _ => prErr "unknown % operator " | ||
498 : | end | ||
499 : | ) | ||
500 : | (* semicolon (for end of LEXSTATES) *) | ||
501 : | | #";" => SEMI | ||
502 : | (* anything else *) | ||
503 : | | ch => if isLetter(ch) then | ||
504 : | let fun getID matched = | ||
505 : | let val x = nextch() | ||
506 : | (**** fix by JHR | ||
507 : | in if isLetter(x) orelse isDigit(x) orelse | ||
508 : | x = "_" orelse x = "'" | ||
509 : | ****) | ||
510 : | in if (isIdentChr x) | ||
511 : | then getID (x::matched) | ||
512 : | else (ungetch(!LexBuf); implode(rev matched)) | ||
513 : | end | ||
514 : | in ID(getID [ch]) | ||
515 : | end | ||
516 : | else (prSynErr ("bad character: " ^ String.str ch)) | ||
517 : | in NextTok := makeTok() | ||
518 : | end | ||
519 : | | 1 => let val rec makeTok = fn () => | ||
520 : | if !inquote then case nextch() of | ||
521 : | (* inside quoted string *) | ||
522 : | #"\\" => onechar(escaped()) | ||
523 : | | #"\"" => (inquote := false; makeTok()) | ||
524 : | | #"\n" => (prSynErr "end-of-line inside quoted string"; | ||
525 : | inquote := false; makeTok()) | ||
526 : | | x => onechar(x) | ||
527 : | else case skipws() of | ||
528 : | (* single character operators *) | ||
529 : | #"?" => QMARK | ||
530 : | | #"*" => STAR | ||
531 : | | #"+" => PLUS | ||
532 : | | #"|" => BAR | ||
533 : | | #"(" => LP | ||
534 : | | #")" => RP | ||
535 : | | #"^" => CARAT | ||
536 : | | #"$" => DOLLAR | ||
537 : | | #"/" => SLASH | ||
538 : | | #";" => SEMI | ||
539 : | | #"." => let val c = array(!CharSetSize,true) in | ||
540 : | update(c,10,false); CHARS(c) | ||
541 : | end | ||
542 : | (* assign and arrow *) | ||
543 : | | #"=" => let val c = nextch() in | ||
544 : | if c = #">" then ARROW else (ungetch(!LexBuf); ASSIGN) | ||
545 : | end | ||
546 : | (* character set *) | ||
547 : | | #"[" => let val rec classch = fn () => let val x = skipws() | ||
548 : | in if x = #"\\" then escaped() else x | ||
549 : | end; | ||
550 : | val first = classch(); | ||
551 : | val flag = (first <> #"^"); | ||
552 : | val c = array(!CharSetSize,not flag); | ||
553 : | fun add NONE = () | ||
554 : | | add (SOME x) = update(c, Char.ord(x), flag) | ||
555 : | and range (x, y) = if x>y | ||
556 : | then (prErr "bad char. range") | ||
557 : | else let | ||
558 : | val i = ref(Char.ord(x)) and j = Char.ord(y) | ||
559 : | in while !i<=j do ( | ||
560 : | add (SOME(Char.chr(!i))); | ||
561 : | i := !i + 1) | ||
562 : | end | ||
563 : | and getClass last = (case classch() | ||
564 : | of #"]" => (add(last); c) | ||
565 : | | #"-" => (case last | ||
566 : | of NONE => getClass(SOME #"-") | ||
567 : | | (SOME last') => let val x = classch() | ||
568 : | in | ||
569 : | if x = #"]" | ||
570 : | then (add(last); add(SOME #"-"); c) | ||
571 : | else (range(last',x); getClass(NONE)) | ||
572 : | end | ||
573 : | (* end case *)) | ||
574 : | | x => (add(last); getClass(SOME x)) | ||
575 : | (* end case *)) | ||
576 : | in CHARS(getClass(if first = #"^" then NONE else SOME first)) | ||
577 : | end | ||
578 : | (* Start States specification *) | ||
579 : | | #"<" => let val rec get_state = fn (prev,matched) => | ||
580 : | case nextch() of | ||
581 : | #">" => matched::prev | ||
582 : | | #"," => get_state(matched::prev,"") | ||
583 : | | x => if isIdentChr(x) | ||
584 : | then get_state(prev,matched ^ String.str x) | ||
585 : | else (prSynErr "bad start state list") | ||
586 : | in STATE(get_state(nil,"")) | ||
587 : | end | ||
588 : | (* {id} or repititions *) | ||
589 : | | #"{" => let val ch = nextch() in if isLetter(ch) then | ||
590 : | let fun getID matched = (case nextch() | ||
591 : | of #"}" => matched | ||
592 : | | x => if (isIdentChr x) then | ||
593 : | getID(matched ^ String.str x) | ||
594 : | else (prErr "invalid char. class name") | ||
595 : | (* end case *)) | ||
596 : | in ID(getID(String.str ch)) | ||
597 : | end | ||
598 : | else if isDigit(ch) then | ||
599 : | let fun get_r (matched, r1) = (case nextch() | ||
600 : | of #"}" => let val n = atoi(matched) in | ||
601 : | if r1 = ~1 then (n,n) else (r1,n) | ||
602 : | end | ||
603 : | | #"," => if r1 = ~1 then get_r("",atoi(matched)) | ||
604 : | else (prErr "invalid repetitions spec.") | ||
605 : | | x => if isDigit(x) | ||
606 : | then get_r(matched ^ String.str x,r1) | ||
607 : | else (prErr "invalid char in repetitions spec") | ||
608 : | (* end case *)) | ||
609 : | in REPS(get_r(String.str ch,~1)) | ||
610 : | end | ||
611 : | else (prErr "bad repetitions spec") | ||
612 : | end | ||
613 : | (* Lex % operators *) | ||
614 : | | #"\\" => onechar(escaped()) | ||
615 : | (* start quoted string *) | ||
616 : | | #"\"" => (inquote := true; makeTok()) | ||
617 : | (* anything else *) | ||
618 : | | ch => onechar(ch) | ||
619 : | in NextTok := makeTok() | ||
620 : | end | ||
621 : | | 2 => NextTok := | ||
622 : | (case skipws() | ||
623 : | of #"(" => let | ||
624 : | fun GetAct (lpct,x) = (case getch(!LexBuf) | ||
625 : | of #"(" => GetAct (lpct+1, #"("::x) | ||
626 : | | #")" => if lpct = 0 then (implode (rev x)) | ||
627 : | else GetAct(lpct-1, #")"::x) | ||
628 : | | y => GetAct(lpct,y::x) | ||
629 : | (* end case *)) | ||
630 : | in ACTION (GetAct (0,nil)) | ||
631 : | end | ||
632 : | | #";" => SEMI | ||
633 : | | c => (prSynErr ("invalid character " ^ String.str c))) | ||
634 : | | _ => raise LexError | ||
635 : | end | ||
636 : | handle eof => NextTok := EOF ; | ||
637 : | |||
638 : | fun GetTok (_:unit) : token = | ||
639 : | let val t = !NextTok in AdvanceTok(); t | ||
640 : | end; | ||
641 : | val SymTab = ref (create String.<=) : (string,exp) dictionary ref | ||
642 : | |||
643 : | fun GetExp () : exp = | ||
644 : | |||
645 : | let val rec optional = fn e => ALT(EPS,e) | ||
646 : | |||
647 : | and lookup' = fn name => | ||
648 : | lookup(!SymTab) name | ||
649 : | handle LOOKUP => prErr ("bad regular expression name: "^ | ||
650 : | name) | ||
651 : | |||
652 : | and newline = fn () => let val c = array(!CharSetSize,false) in | ||
653 : | update(c,10,true); c | ||
654 : | end | ||
655 : | |||
656 : | and endline = fn e => trail(e,CLASS(newline(),0)) | ||
657 : | |||
658 : | and trail = fn (e1,e2) => CAT(CAT(e1,TRAIL(0)),e2) | ||
659 : | |||
660 : | and closure1 = fn e => CAT(e,CLOSURE(e)) | ||
661 : | |||
662 : | and repeat = fn (min,max,e) => let val rec rep = fn | ||
663 : | (0,0) => EPS | ||
664 : | | (0,1) => ALT(e,EPS) | ||
665 : | | (0,i) => CAT(rep(0,1),rep(0,i-1)) | ||
666 : | | (i,j) => CAT(e,rep(i-1,j-1)) | ||
667 : | in rep(min,max) | ||
668 : | end | ||
669 : | |||
670 : | and exp0 = fn () => case GetTok() of | ||
671 : | CHARS(c) => exp1(CLASS(c,0)) | ||
672 : | | LP => let val e = exp0() in | ||
673 : | if !NextTok = RP then | ||
674 : | (AdvanceTok(); exp1(e)) | ||
675 : | else (prSynErr "missing '('") end | ||
676 : | | ID(name) => exp1(lookup' name) | ||
677 : | | _ => raise SyntaxError | ||
678 : | |||
679 : | and exp1 = fn (e) => case !NextTok of | ||
680 : | SEMI => e | ||
681 : | | ARROW => e | ||
682 : | | EOF => e | ||
683 : | | LP => exp2(e,exp0()) | ||
684 : | | RP => e | ||
685 : | | t => (AdvanceTok(); case t of | ||
686 : | QMARK => exp1(optional(e)) | ||
687 : | | STAR => exp1(CLOSURE(e)) | ||
688 : | | PLUS => exp1(closure1(e)) | ||
689 : | | CHARS(c) => exp2(e,CLASS(c,0)) | ||
690 : | | BAR => ALT(e,exp0()) | ||
691 : | | DOLLAR => (UsesTrailingContext := true; endline(e)) | ||
692 : | | SLASH => (UsesTrailingContext := true; | ||
693 : | trail(e,exp0())) | ||
694 : | | REPS(i,j) => exp1(repeat(i,j,e)) | ||
695 : | | ID(name) => exp2(e,lookup' name) | ||
696 : | | _ => raise SyntaxError) | ||
697 : | |||
698 : | and exp2 = fn (e1,e2) => case !NextTok of | ||
699 : | SEMI => CAT(e1,e2) | ||
700 : | | ARROW => CAT(e1,e2) | ||
701 : | | EOF => CAT(e1,e2) | ||
702 : | | LP => exp2(CAT(e1,e2),exp0()) | ||
703 : | | RP => CAT(e1,e2) | ||
704 : | | t => (AdvanceTok(); case t of | ||
705 : | QMARK => exp1(CAT(e1,optional(e2))) | ||
706 : | | STAR => exp1(CAT(e1,CLOSURE(e2))) | ||
707 : | | PLUS => exp1(CAT(e1,closure1(e2))) | ||
708 : | | CHARS(c) => exp2(CAT(e1,e2),CLASS(c,0)) | ||
709 : | | BAR => ALT(CAT(e1,e2),exp0()) | ||
710 : | | DOLLAR => (UsesTrailingContext := true; | ||
711 : | endline(CAT(e1,e2))) | ||
712 : | | SLASH => (UsesTrailingContext := true; | ||
713 : | trail(CAT(e1,e2),exp0())) | ||
714 : | | REPS(i,j) => exp1(CAT(e1,repeat(i,j,e2))) | ||
715 : | | ID(name) => exp2(CAT(e1,e2),lookup' name) | ||
716 : | | _ => raise SyntaxError) | ||
717 : | in exp0() | ||
718 : | end; | ||
719 : | val StateTab = ref(create(String.<=)) : (string,int) dictionary ref | ||
720 : | |||
721 : | val StateNum = ref 0; | ||
722 : | |||
723 : | fun GetStates () : int list = | ||
724 : | |||
725 : | let fun add nil sl = sl | ||
726 : | | add (x::y) sl = add y (union ([lookup (!StateTab)(x) | ||
727 : | handle LOOKUP => | ||
728 : | prErr ("bad state name: "^x) | ||
729 : | ],sl)) | ||
730 : | |||
731 : | fun addall i sl = | ||
732 : | if i <= !StateNum then addall (i+2) (union ([i],sl)) | ||
733 : | else sl | ||
734 : | |||
735 : | fun incall (x::y) = (x+1)::incall y | ||
736 : | | incall nil = nil | ||
737 : | |||
738 : | fun addincs nil = nil | ||
739 : | | addincs (x::y) = x::(x+1)::addincs y | ||
740 : | |||
741 : | val state_list = | ||
742 : | case !NextTok of | ||
743 : | STATE s => (AdvanceTok(); LexState := 1; add s nil) | ||
744 : | | _ => addall 1 nil | ||
745 : | |||
746 : | in case !NextTok | ||
747 : | of CARAT => (LexState := 1; AdvanceTok(); UsesPrevNewLine := true; | ||
748 : | incall state_list) | ||
749 : | | _ => addincs state_list | ||
750 : | end | ||
751 : | |||
752 : | val LeafNum = ref ~1; | ||
753 : | |||
754 : | fun renum(e : exp) : exp = | ||
755 : | let val rec label = fn | ||
756 : | EPS => EPS | ||
757 : | | CLASS(x,_) => CLASS(x,++LeafNum) | ||
758 : | | CLOSURE(e) => CLOSURE(label(e)) | ||
759 : | | ALT(e1,e2) => ALT(label(e1),label(e2)) | ||
760 : | | CAT(e1,e2) => CAT(label(e1),label(e2)) | ||
761 : | | TRAIL(i) => TRAIL(++LeafNum) | ||
762 : | | END(i) => END(++LeafNum) | ||
763 : | in label(e) | ||
764 : | end; | ||
765 : | |||
766 : | exception ParseError; | ||
767 : | |||
768 : | fun parse() : (string * (int list * exp) list * ((string,string) dictionary)) = | ||
769 : | let val Accept = ref (create String.<=) : (string,string) dictionary ref | ||
770 : | val rec ParseRtns = fn l => case getch(!LexBuf) of | ||
771 : | #"%" => let val c = getch(!LexBuf) in | ||
772 : | if c = #"%" then (implode (rev l)) | ||
773 : | else ParseRtns(c :: #"%" :: l) | ||
774 : | end | ||
775 : | | c => ParseRtns(c::l) | ||
776 : | and ParseDefs = fn () => | ||
777 : | (LexState:=0; AdvanceTok(); case !NextTok of | ||
778 : | LEXMARK => () | ||
779 : | | LEXSTATES => | ||
780 : | let fun f () = (case !NextTok of (ID i) => | ||
781 : | (StateTab := enter(!StateTab)(i,++StateNum); | ||
782 : | ++StateNum; AdvanceTok(); f()) | ||
783 : | | _ => ()) | ||
784 : | in AdvanceTok(); f (); | ||
785 : | if !NextTok=SEMI then ParseDefs() else | ||
786 : | (prSynErr "expected ';'") | ||
787 : | end | ||
788 : | | ID x => (LexState:=1; AdvanceTok(); if GetTok() = ASSIGN | ||
789 : | then (SymTab := enter(!SymTab)(x,GetExp()); | ||
790 : | if !NextTok = SEMI then ParseDefs() | ||
791 : | else (prSynErr "expected ';'")) | ||
792 : | else raise SyntaxError) | ||
793 : | | REJECT => (HaveReject := true; ParseDefs()) | ||
794 : | | COUNT => (CountNewLines := true; ParseDefs()) | ||
795 : | | FULLCHARSET => (CharSetSize := 256; ParseDefs()) | ||
796 : | | HEADER => (LexState := 2; AdvanceTok(); | ||
797 : | case GetTok() | ||
798 : | of ACTION s => | ||
799 : | if (!StrDecl) then | ||
800 : | (prErr "cannot have both %structure and %header \ | ||
801 : | \declarations") | ||
802 : | else if (!HeaderDecl) then | ||
803 : | (prErr "duplicate %header declarations") | ||
804 : | else | ||
805 : | (HeaderCode := s; LexState := 0; | ||
806 : | HeaderDecl := true; ParseDefs()) | ||
807 : | | _ => raise SyntaxError) | ||
808 : | | POSARG => (PosArg := true; ParseDefs()) | ||
809 : | | ARG => (LexState := 2; AdvanceTok(); | ||
810 : | case GetTok() | ||
811 : | of ACTION s => | ||
812 : | (case !ArgCode | ||
813 : | of SOME _ => prErr "duplicate %arg declarations" | ||
814 : | | NONE => ArgCode := SOME s; | ||
815 : | LexState := 0; | ||
816 : | ParseDefs()) | ||
817 : | | _ => raise SyntaxError) | ||
818 : | | STRUCT => (AdvanceTok(); | ||
819 : | case !NextTok of | ||
820 : | (ID i) => | ||
821 : | if (!HeaderDecl) then | ||
822 : | (prErr "cannot have both %structure and %header \ | ||
823 : | \declarations") | ||
824 : | else if (!StrDecl) then | ||
825 : | (prErr "duplicate %structure declarations") | ||
826 : | else (StrName := i; StrDecl := true) | ||
827 : | | _ => (prErr "expected ID"); | ||
828 : | ParseDefs()) | ||
829 : | | _ => raise SyntaxError) | ||
830 : | and ParseRules = | ||
831 : | fn rules => (LexState:=1; AdvanceTok(); case !NextTok of | ||
832 : | EOF => rules | ||
833 : | | _ => | ||
834 : | let val s = GetStates() | ||
835 : | val e = renum(CAT(GetExp(),END(0))) | ||
836 : | in | ||
837 : | if !NextTok = ARROW then | ||
838 : | (LexState:=2; AdvanceTok(); | ||
839 : | case GetTok() of ACTION(act) => | ||
840 : | if !NextTok=SEMI then | ||
841 : | (Accept:=enter(!Accept) (Int.toString (!LeafNum),act); | ||
842 : | ParseRules((s,e)::rules)) | ||
843 : | else (prSynErr "expected ';'") | ||
844 : | | _ => raise SyntaxError) | ||
845 : | else (prSynErr "expected '=>'") | ||
846 : | end) | ||
847 : | in let val usercode = ParseRtns nil | ||
848 : | in (ParseDefs(); (usercode,ParseRules(nil),!Accept)) | ||
849 : | end | ||
850 : | end handle SyntaxError => (prSynErr "") | ||
851 : | |||
852 : | fun makebegin () : unit = | ||
853 : | let fun make nil = () | ||
854 : | | make ((x,n:int)::y)=(say "val "; say x; say " = " ; | ||
855 : | say "STARTSTATE "; | ||
856 : | say (Int.toString n); say ";\n"; make y) | ||
857 : | in say "\n(* start state definitions *)\n\n"; make(listofdict(!StateTab)) | ||
858 : | end | ||
859 : | |||
860 : | structure L = | ||
861 : | struct | ||
862 : | nonfix > | ||
863 : | type key = int list * string | ||
864 : | fun > ((key,item:string),(key',item')) = | ||
865 : | let fun f ((a:int)::a') (b::b') = if Int.> (a,b) then true | ||
866 : | else if a=b then f a' b' | ||
867 : | else false | ||
868 : | | f _ _ = false | ||
869 : | in f key key' | ||
870 : | end | ||
871 : | end | ||
872 : | |||
873 : | structure RB = RedBlack(L) | ||
874 : | |||
875 : | fun maketable (fins:(int * (int list)) list, | ||
876 : | tcs :(int * (int list)) list, | ||
877 : | tcpairs: (int * int) list, | ||
878 : | trans : (int*(int list)) list) : unit = | ||
879 : | |||
880 : | (* Fins = (state #, list of final leaves for the state) list | ||
881 : | tcs = (state #, list of trailing context leaves which begin in this state) | ||
882 : | list | ||
883 : | tcpairs = (trailing context leaf, end leaf) list | ||
884 : | trans = (state #,list of transitions for state) list *) | ||
885 : | |||
886 : | let datatype elem = N of int | T of int | D of int | ||
887 : | val count = ref 0 | ||
888 : | val _ = (if length(trans)<256 then CharFormat := true | ||
889 : | else CharFormat := false; | ||
890 : | if !UsesTrailingContext then | ||
891 : | (say "\ndatatype yyfinstate = N of int | \ | ||
892 : | \ T of int | D of int\n") | ||
893 : | else say "\ndatatype yyfinstate = N of int"; | ||
894 : | say "\ntype statedata = {fin : yyfinstate list, trans: "; | ||
895 : | case !CharFormat of | ||
896 : | true => say "string}" | ||
897 : | | false => say "int Vector.vector}"; | ||
898 : | say "\n(* transition & final state table *)\nval tab = let\n"; | ||
899 : | case !CharFormat of | ||
900 : | true => () | ||
901 : | | false => | ||
902 : | (say "fun decode s k =\n"; | ||
903 : | say " let val k' = k + k\n"; | ||
904 : | say " val hi = Char.ord(String.sub(s, k'))\n"; | ||
905 : | say " val lo = Char.ord(String.sub(s, k' + 1))\n"; | ||
906 : | say " in hi * 256 + lo end\n")) | ||
907 : | |||
908 : | val newfins = | ||
909 : | let fun IsEndLeaf t = | ||
910 : | let fun f ((l,e)::r) = if (e=t) then true else f r | ||
911 : | | f nil = false in f tcpairs end | ||
912 : | |||
913 : | fun GetEndLeaf t = | ||
914 : | let fun f ((tl,el)::r) = if (tl=t) then el else f r | ||
915 : | in f tcpairs | ||
916 : | end | ||
917 : | fun GetTrConLeaves s = | ||
918 : | let fun f ((s',l)::r) = if (s = s') then l else f r | ||
919 : | | f nil = nil | ||
920 : | in f tcs | ||
921 : | end | ||
922 : | fun sort_leaves s = | ||
923 : | let fun insert (x:int) (a::b) = | ||
924 : | if (x <= a) then x::(a::b) | ||
925 : | else a::(insert x b) | ||
926 : | | insert x nil = [x] | ||
927 : | in List.foldr (fn (x,r) => insert x r) [] s | ||
928 : | end | ||
929 : | fun conv a = if (IsEndLeaf a) then (D a) else (N a) | ||
930 : | fun merge (a::a',b::b') = | ||
931 : | if (a <= b) then (conv a)::merge(a',b::b') | ||
932 : | else (T b)::(merge(a::a',b')) | ||
933 : | | merge (a::a',nil) = (conv a)::(merge (a',nil)) | ||
934 : | | merge (nil,b::b') = (T b)::(merge (b',nil)) | ||
935 : | | merge (nil,nil) = nil | ||
936 : | |||
937 : | in map (fn (x,l) => | ||
938 : | rev (merge (l, | ||
939 : | sort_leaves (map (fn x => GetEndLeaf x) (GetTrConLeaves x))))) | ||
940 : | fins | ||
941 : | end | ||
942 : | |||
943 : | val rs = | ||
944 : | let open RB | ||
945 : | fun makeItems x = | ||
946 : | let fun emit8(x, pos) = | ||
947 : | let val s = StringCvt.padLeft #"0" 3 (Int.toString x) | ||
948 : | in | ||
949 : | case pos | ||
950 : | of 16 => (say "\\\n\\\\"; say s; 1) | ||
951 : | | _ => (say "\\"; say s; pos+1) | ||
952 : | end | ||
953 : | fun emit16(x, pos) = | ||
954 : | let val hi8 = x div 256 | ||
955 : | val lo8 = x - hi8 * 256 (* x rem 256 *) | ||
956 : | in | ||
957 : | emit8(lo8, emit8(hi8, pos)) | ||
958 : | end | ||
959 : | fun MakeString([], _, _) = () | ||
960 : | | MakeString(x::xs, emitter, pos) = | ||
961 : | MakeString(xs, emitter, emitter(x, pos)) | ||
962 : | in case !CharFormat of | ||
963 : | true => (say " \n\""; MakeString(x,emit8,0); say "\"\n") | ||
964 : | | false => (say (Int.toString(length x)); | ||
965 : | say ", \n\""; MakeString(x,emit16,0); say "\"\n") | ||
966 : | end | ||
967 : | |||
968 : | fun makeEntry(nil,rs,t) = rev rs | ||
969 : | | makeEntry(((l:int,x)::y),rs,t) = | ||
970 : | let val name = (Int.toString l) | ||
971 : | in let val (r,n) = lookup ((x,name),t) | ||
972 : | in makeEntry(y,(n::rs),t) | ||
973 : | end handle notfound _ => | ||
974 : | (count := !count+1; | ||
975 : | say " ("; say name; say ","; | ||
976 : | makeItems x; say "),\n"; | ||
977 : | makeEntry(y,(name::rs),(insert ((x,name),t)))) | ||
978 : | end | ||
979 : | |||
980 : | val _ = say "val s = [ \n" | ||
981 : | val res = makeEntry(trans,nil,empty) | ||
982 : | val _ = | ||
983 : | case !CharFormat | ||
984 : | of true => (say "(0, \"\")]\n"; say "fun f x = x \n") | ||
985 : | | false => (say "(0, 0, \"\")]\n"; | ||
986 : | say "fun f(n, i, x) = (n, Vector.tabulate(i, decode x)) \n") | ||
987 : | |||
988 : | val _ = say "val s = map f (rev (tl (rev s))) \n" | ||
989 : | val _ = say "exception LexHackingError \n" | ||
990 : | val _ = say "fun look ((j,x)::r, i) = if i = j then x else look(r, i) \n" | ||
991 : | val _ = say " | look ([], i) = raise LexHackingError\n" | ||
992 : | |||
993 : | val _ = say "fun g {fin=x, trans=i} = {fin=x, trans=look(s,i)} \n" | ||
994 : | in res | ||
995 : | end | ||
996 : | |||
997 : | fun makeTable(nil,nil) = () | ||
998 : | | makeTable(a::a',b::b') = | ||
999 : | let fun makeItems nil = () | ||
1000 : | | makeItems (hd::tl) = | ||
1001 : | let val (t,n) = | ||
1002 : | case hd of | ||
1003 : | (N i) => ("(N ",i) | ||
1004 : | | (T i) => ("(T ",i) | ||
1005 : | | (D i) => ("(D ",i) | ||
1006 : | in (say t; say (Int.toString n); say ")"; | ||
1007 : | if null tl | ||
1008 : | then () | ||
1009 : | else (say ","; makeItems tl)) | ||
1010 : | end | ||
1011 : | in (say "{fin = ["; makeItems b; | ||
1012 : | say "], trans = "; say a; say "}"; | ||
1013 : | if null a' | ||
1014 : | then () | ||
1015 : | else (say ",\n"; makeTable(a',b'))) | ||
1016 : | end | ||
1017 : | |||
1018 : | fun msg x = TextIO.output(TextIO.stdOut, x) | ||
1019 : | |||
1020 : | in (say "in Vector.fromList(map g \n["; makeTable(rs,newfins); | ||
1021 : | say "])\nend\n"; | ||
1022 : | msg ("\nNumber of states = " ^ (Int.toString (length trans))); | ||
1023 : | msg ("\nNumber of distinct rows = " ^ (Int.toString (!count))); | ||
1024 : | msg ("\nApprox. memory size of trans. table = " ^ | ||
1025 : | (Int.toString (!count*(!CharSetSize)*(if !CharFormat then 1 else 8)))); | ||
1026 : | msg " bytes\n") | ||
1027 : | end | ||
1028 : | |||
1029 : | (* makeaccept: Takes a (string,string) dictionary, prints case statement for | ||
1030 : | accepting leaf actions. The key strings are the leaf #'s, the data strings | ||
1031 : | are the actions *) | ||
1032 : | |||
1033 : | fun makeaccept ends = | ||
1034 : | let fun startline f = if f then say " " else say "| " | ||
1035 : | fun make(nil,f) = (startline f; say "_ => raise Internal.LexerError\n") | ||
1036 : | | make((x,a)::y,f) = (startline f; say x; say " => "; | ||
1037 : | if Substring.size(#2 (Substring.position "yytext" (Substring.all a))) = 0 | ||
1038 : | then | ||
1039 : | (say "("; say a; say ")") | ||
1040 : | else (say "let val yytext=yymktext() in "; | ||
1041 : | say a; say " end"); | ||
1042 : | say "\n"; make(y,false)) | ||
1043 : | in make (listofdict(ends),true) | ||
1044 : | end | ||
1045 : | |||
1046 : | fun leafdata(e:(int list * exp) list) = | ||
1047 : | let val fp = array(!LeafNum + 1,nil) | ||
1048 : | and leaf = array(!LeafNum + 1,EPS) | ||
1049 : | and tcpairs = ref nil | ||
1050 : | and trailmark = ref ~1; | ||
1051 : | val rec add = fn | ||
1052 : | (nil,x) => () | ||
1053 : | | (hd::tl,x) => (update(fp,hd,union(fp sub hd,x)); | ||
1054 : | add(tl,x)) | ||
1055 : | and moredata = fn | ||
1056 : | CLOSURE(e1) => | ||
1057 : | (moredata(e1); add(lastpos(e1),firstpos(e1))) | ||
1058 : | | ALT(e1,e2) => (moredata(e1); moredata(e2)) | ||
1059 : | | CAT(e1,e2) => (moredata(e1); moredata(e2); | ||
1060 : | add(lastpos(e1),firstpos(e2))) | ||
1061 : | | CLASS(x,i) => update(leaf,i,CLASS(x,i)) | ||
1062 : | | TRAIL(i) => (update(leaf,i,TRAIL(i)); if !trailmark = ~1 | ||
1063 : | then trailmark := i else ()) | ||
1064 : | | END(i) => (update(leaf,i,END(i)); if !trailmark <> ~1 | ||
1065 : | then (tcpairs := (!trailmark,i)::(!tcpairs); | ||
1066 : | trailmark := ~1) else ()) | ||
1067 : | | _ => () | ||
1068 : | and makedata = fn | ||
1069 : | nil => () | ||
1070 : | | (_,x)::tl => (moredata(x);makedata(tl)) | ||
1071 : | in trailmark := ~1; makedata(e); (fp,leaf,!tcpairs) | ||
1072 : | end; | ||
1073 : | |||
1074 : | fun makedfa(rules) = | ||
1075 : | let val StateTab = ref (create(String.<=)) : (string,int) dictionary ref | ||
1076 : | val fintab = ref (create(Int.<=)) : (int,(int list)) dictionary ref | ||
1077 : | val transtab = ref (create(Int.<=)) : (int,int list) dictionary ref | ||
1078 : | val tctab = ref (create(Int.<=)) : (int,(int list)) dictionary ref | ||
1079 : | val (fp, leaf, tcpairs) = leafdata(rules); | ||
1080 : | |||
1081 : | fun visit (state,statenum) = | ||
1082 : | let val transitions = gettrans(state) in | ||
1083 : | fintab := enter(!fintab)(statenum,getfin(state)); | ||
1084 : | tctab := enter(!tctab)(statenum,gettc(state)); | ||
1085 : | transtab := enter(!transtab)(statenum,transitions) | ||
1086 : | end | ||
1087 : | |||
1088 : | and visitstarts (states) = | ||
1089 : | let fun vs nil i = () | ||
1090 : | | vs (hd::tl) i = (visit (hd,i); vs tl (i+1)) | ||
1091 : | in vs states 0 | ||
1092 : | end | ||
1093 : | |||
1094 : | and hashstate(s: int list) = | ||
1095 : | let val rec hs = | ||
1096 : | fn (nil,z) => z | ||
1097 : | | ((x:int)::y,z) => hs(y,z ^ " " ^ (Int.toString x)) | ||
1098 : | in hs(s,"") | ||
1099 : | end | ||
1100 : | |||
1101 : | and find(s) = lookup(!StateTab)(hashstate(s)) | ||
1102 : | |||
1103 : | and add(s,n) = StateTab := enter(!StateTab)(hashstate(s),n) | ||
1104 : | |||
1105 : | and getstate (state) = | ||
1106 : | find(state) | ||
1107 : | handle LOOKUP => let val n = ++StateNum in | ||
1108 : | add(state,n); visit(state,n); n | ||
1109 : | end | ||
1110 : | |||
1111 : | and getfin state = | ||
1112 : | let fun f nil fins = fins | ||
1113 : | | f (hd::tl) fins = | ||
1114 : | case (leaf sub hd) | ||
1115 : | of END _ => f tl (hd::fins) | ||
1116 : | | _ => f tl fins | ||
1117 : | in f state nil | ||
1118 : | end | ||
1119 : | |||
1120 : | and gettc state = | ||
1121 : | let fun f nil fins = fins | ||
1122 : | | f (hd::tl) fins = | ||
1123 : | case (leaf sub hd) | ||
1124 : | of TRAIL _ => f tl (hd::fins) | ||
1125 : | | _ => f tl fins | ||
1126 : | in f state nil | ||
1127 : | end | ||
1128 : | |||
1129 : | and gettrans (state) = | ||
1130 : | let fun loop c tlist = | ||
1131 : | let fun cktrans nil r = r | ||
1132 : | | cktrans (hd::tl) r = | ||
1133 : | case (leaf sub hd) of | ||
1134 : | CLASS(i,_)=> | ||
1135 : | (if (i sub c) then cktrans tl (union(r,fp sub hd)) | ||
1136 : | else cktrans tl r handle Subscript => | ||
1137 : | cktrans tl r | ||
1138 : | ) | ||
1139 : | | _ => cktrans tl r | ||
1140 : | in if c >= 0 then | ||
1141 : | let val v=cktrans state nil | ||
1142 : | in loop (c-1) (if v=nil then 0::tlist else (getstate v)::tlist) | ||
1143 : | end | ||
1144 : | else tlist | ||
1145 : | end | ||
1146 : | in loop ((!CharSetSize) - 1) nil | ||
1147 : | end | ||
1148 : | |||
1149 : | and startstates() = | ||
1150 : | let val startarray = array(!StateNum + 1, nil); | ||
1151 : | fun listofarray(a,n) = | ||
1152 : | let fun f i l = if i >= 0 then f (i-1) ((a sub i)::l) else l | ||
1153 : | in f (n-1) nil end | ||
1154 : | val rec makess = fn | ||
1155 : | nil => () | ||
1156 : | | (startlist,e)::tl => (fix(startlist,firstpos(e));makess(tl)) | ||
1157 : | and fix = fn | ||
1158 : | (nil,_) => () | ||
1159 : | | (s::tl,firsts) => (update(startarray,s, | ||
1160 : | union(firsts,startarray sub s)); | ||
1161 : | fix(tl,firsts)) | ||
1162 : | in makess(rules);listofarray(startarray, !StateNum + 1) | ||
1163 : | end | ||
1164 : | |||
1165 : | in visitstarts(startstates()); | ||
1166 : | (listofdict(!fintab),listofdict(!transtab),listofdict(!tctab),tcpairs) | ||
1167 : | end | ||
1168 : | |||
1169 : | val skel_hd = | ||
1170 : | " struct\n\ | ||
1171 : | \ structure UserDeclarations =\n\ | ||
1172 : | \ struct\n\ | ||
1173 : | \" | ||
1174 : | |||
1175 : | val skel_mid2 = | ||
1176 : | " | Internal.D k => action (i,(acts::l),k::rs)\n\ | ||
1177 : | \ | Internal.T k =>\n\ | ||
1178 : | \ let fun f (a::b,r) =\n\ | ||
1179 : | \ if a=k\n\ | ||
1180 : | \ then action(i,(((Internal.N a)::acts)::l),(b@r))\n\ | ||
1181 : | \ else f (b,a::r)\n\ | ||
1182 : | \ | f (nil,r) = action(i,(acts::l),rs)\n\ | ||
1183 : | \ in f (rs,nil)\n\ | ||
1184 : | \ end\n\ | ||
1185 : | \" | ||
1186 : | |||
1187 : | fun lexGen(infile) = | ||
1188 : | let val outfile = infile ^ ".sml" | ||
1189 : | fun PrintLexer (ends) = | ||
1190 : | let val sayln = fn x => (say x; say "\n") | ||
1191 : | in case !ArgCode | ||
1192 : | of NONE => (sayln "fun lex () : Internal.result ="; | ||
1193 : | sayln "let fun continue() = lex() in") | ||
1194 : | | SOME s => (say "fun lex "; say "(yyarg as ("; say s; sayln ")) ="; | ||
1195 : | sayln "let fun continue() : Internal.result = "); | ||
1196 : | say " let fun scan (s,AcceptingLeaves : Internal.yyfinstate"; | ||
1197 : | sayln " list list,l,i0) ="; | ||
1198 : | if !UsesTrailingContext | ||
1199 : | then say "\tlet fun action (i,nil,rs)" | ||
1200 : | else say "\tlet fun action (i,nil)"; | ||
1201 : | sayln " = raise LexError"; | ||
1202 : | if !UsesTrailingContext | ||
1203 : | then sayln "\t| action (i,nil::l,rs) = action(i-1,l,rs)" | ||
1204 : | else sayln "\t| action (i,nil::l) = action (i-1,l)"; | ||
1205 : | if !UsesTrailingContext | ||
1206 : | then sayln "\t| action (i,(node::acts)::l,rs) =" | ||
1207 : | else sayln "\t| action (i,(node::acts)::l) ="; | ||
1208 : | sayln "\t\tcase node of"; | ||
1209 : | sayln "\t\t Internal.N yyk => "; | ||
1210 : | sayln "\t\t\t(let fun yymktext() = substring(!yyb,i0,i-i0)\n\ | ||
1211 : | \\t\t\t val yypos = i0+ !yygone"; | ||
1212 : | if !CountNewLines | ||
1213 : | then (sayln "\t\t\tval _ = yylineno := CharVector.foldl"; | ||
1214 : | sayln "\t\t\t\t(fn (_,#\"\\n\", n) => n+1 | (_,_, n) => n) (!yylineno) (!yyb,i0,SOME(i-i0))") | ||
1215 : | else (); | ||
1216 : | if !HaveReject | ||
1217 : | then (say "\t\t\tfun REJECT() = action(i,acts::l"; | ||
1218 : | if !UsesTrailingContext | ||
1219 : | then sayln ",rs)" else sayln ")") | ||
1220 : | else (); | ||
1221 : | sayln "\t\t\topen UserDeclarations Internal.StartStates"; | ||
1222 : | sayln " in (yybufpos := i; case yyk of "; | ||
1223 : | sayln ""; | ||
1224 : | sayln "\t\t\t(* Application actions *)\n"; | ||
1225 : | makeaccept(ends); | ||
1226 : | say "\n\t\t) end "; | ||
1227 : | say ")\n\n"; | ||
1228 : | if (!UsesTrailingContext) then say skel_mid2 else (); | ||
1229 : | sayln "\tval {fin,trans} = Unsafe.Vector.sub(Internal.tab, s)"; | ||
1230 : | sayln "\tval NewAcceptingLeaves = fin::AcceptingLeaves"; | ||
1231 : | sayln "\tin if l = !yybl then"; | ||
1232 : | sayln "\t if trans = #trans(Vector.sub(Internal.tab,0))"; | ||
1233 : | sayln "\t then action(l,NewAcceptingLeaves"; | ||
1234 : | if !UsesTrailingContext then say ",nil" else (); | ||
1235 : | say ") else"; | ||
1236 : | sayln "\t let val newchars= if !yydone then \"\" else yyinput 1024"; | ||
1237 : | sayln "\t in if (size newchars)=0"; | ||
1238 : | sayln "\t\t then (yydone := true;"; | ||
1239 : | say "\t\t if (l=i0) then UserDeclarations.eof "; | ||
1240 : | sayln (case !ArgCode of NONE => "()" | SOME _ => "yyarg"); | ||
1241 : | say "\t\t else action(l,NewAcceptingLeaves"; | ||
1242 : | if !UsesTrailingContext then | ||
1243 : | sayln ",nil))" else sayln "))"; | ||
1244 : | sayln "\t\t else (if i0=l then yyb := newchars"; | ||
1245 : | sayln "\t\t else yyb := substring(!yyb,i0,l-i0)^newchars;"; | ||
1246 : | sayln "\t\t yygone := !yygone+i0;"; | ||
1247 : | sayln "\t\t yybl := size (!yyb);"; | ||
1248 : | sayln "\t\t scan (s,AcceptingLeaves,l-i0,0))"; | ||
1249 : | sayln "\t end"; | ||
1250 : | sayln "\t else let val NewChar = Char.ord(Unsafe.CharVector.sub(!yyb,l))"; | ||
1251 : | if !CharSetSize=129 | ||
1252 : | then sayln "\t\tval NewChar = if NewChar<128 then NewChar else 128" | ||
1253 : | else (); | ||
1254 : | say "\t\tval NewState = "; | ||
1255 : | sayln (if !CharFormat | ||
1256 : | then "Char.ord(Unsafe.CharVector.sub(trans,NewChar))" | ||
1257 : | else "Unsafe.Vector.sub(trans, NewChar)"); | ||
1258 : | say "\t\tin if NewState=0 then action(l,NewAcceptingLeaves"; | ||
1259 : | if !UsesTrailingContext then sayln ",nil)" else sayln ")"; | ||
1260 : | sayln "\t\telse scan(NewState,NewAcceptingLeaves,l+1,i0)"; | ||
1261 : | sayln "\tend"; | ||
1262 : | sayln "\tend"; | ||
1263 : | if !UsesPrevNewLine then () else sayln "(*"; | ||
1264 : | sayln "\tval start= if substring(!yyb,!yybufpos-1,1)=\"\\n\""; | ||
1265 : | sayln "then !yybegin+1 else !yybegin"; | ||
1266 : | if !UsesPrevNewLine then () else sayln "*)"; | ||
1267 : | say "\tin scan("; | ||
1268 : | if !UsesPrevNewLine then say "start" | ||
1269 : | else say "!yybegin (* start *)"; | ||
1270 : | sayln ",nil,!yybufpos,!yybufpos)"; | ||
1271 : | sayln " end"; | ||
1272 : | sayln (case !ArgCode of NONE => "end" | SOME _ => "in continue end"); | ||
1273 : | sayln " in lex"; | ||
1274 : | sayln " end"; | ||
1275 : | sayln "end" | ||
1276 : | end | ||
1277 : | |||
1278 : | in (UsesPrevNewLine := false; | ||
1279 : | ResetFlags(); | ||
1280 : | LexBuf := make_ibuf(TextIO.openIn infile); | ||
1281 : | NextTok := BOF; | ||
1282 : | inquote := false; | ||
1283 : | LexOut := TextIO.openOut(outfile); | ||
1284 : | StateNum := 2; | ||
1285 : | LineNum := 1; | ||
1286 : | StateTab := enter(create(String.<=))("INITIAL",1); | ||
1287 : | LeafNum := ~1; | ||
1288 : | let | ||
1289 : | val (user_code,rules,ends) = | ||
1290 : | parse() handle x => | ||
1291 : | (close_ibuf(!LexBuf); | ||
1292 : | TextIO.closeOut(!LexOut); | ||
1293 : | OS.FileSys.remove outfile; | ||
1294 : | raise x) | ||
1295 : | val (fins,trans,tctab,tcpairs) = makedfa(rules) | ||
1296 : | val _ = if !UsesTrailingContext then | ||
1297 : | (close_ibuf(!LexBuf); | ||
1298 : | TextIO.closeOut(!LexOut); | ||
1299 : | OS.FileSys.remove outfile; | ||
1300 : | prErr "lookahead is unimplemented") | ||
1301 : | else () | ||
1302 : | in | ||
1303 : | if (!HeaderDecl) | ||
1304 : | then say (!HeaderCode) | ||
1305 : | else say ("structure " ^ (!StrName)); | ||
1306 : | say "=\n"; | ||
1307 : | say skel_hd; | ||
1308 : | say user_code; | ||
1309 : | say "end (* end of user routines *)\n"; | ||
1310 : | say "exception LexError (* raised if illegal leaf "; | ||
1311 : | say "action tried *)\n"; | ||
1312 : | say "structure Internal =\n\tstruct\n"; | ||
1313 : | maketable(fins,tctab,tcpairs,trans); | ||
1314 : | say "structure StartStates =\n\tstruct\n"; | ||
1315 : | say "\tdatatype yystartstate = STARTSTATE of int\n"; | ||
1316 : | makebegin(); | ||
1317 : | say "\nend\n"; | ||
1318 : | say "type result = UserDeclarations.lexresult\n"; | ||
1319 : | say "\texception LexerError (* raised if illegal leaf "; | ||
1320 : | say "action tried *)\n"; | ||
1321 : | say "end\n\n"; | ||
1322 : | say (if (!PosArg) then "fun makeLexer (yyinput,yygone0:int) =\nlet\n" | ||
1323 : | else "fun makeLexer yyinput =\nlet\tval yygone0=1\n"); | ||
1324 : | if !CountNewLines then say "\tval yylineno = ref 0\n\n" else (); | ||
1325 : | say "\tval yyb = ref \"\\n\" \t\t(* buffer *)\n\ | ||
1326 : | \\tval yybl = ref 1\t\t(*buffer length *)\n\ | ||
1327 : | \\tval yybufpos = ref 1\t\t(* location of next character to use *)\n\ | ||
1328 : | \\tval yygone = ref yygone0\t(* position in file of beginning of buffer *)\n\ | ||
1329 : | \\tval yydone = ref false\t\t(* eof found yet? *)\n\ | ||
1330 : | \\tval yybegin = ref 1\t\t(*Current 'start state' for lexer *)\n\ | ||
1331 : | \\n\tval YYBEGIN = fn (Internal.StartStates.STARTSTATE x) =>\n\ | ||
1332 : | \\t\t yybegin := x\n\n"; | ||
1333 : | PrintLexer(ends); | ||
1334 : | close_ibuf(!LexBuf); | ||
1335 : | TextIO.closeOut(!LexOut) | ||
1336 : | end) | ||
1337 : | end | ||
1338 : | end |
root@smlnj-gforge.cs.uchicago.edu | ViewVC Help |
Powered by ViewVC 1.0.0 |