Home My Page Projects Code Snippets Project Openings SML/NJ
Summary Activity Forums Tracker Lists Tasks Docs Surveys News SCM Files

SCM Repository

[smlnj] Annotation of /lexgen/releases/release-110.61/src/reg-exp.sml
ViewVC logotype

Annotation of /lexgen/releases/release-110.61/src/reg-exp.sml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2246 - (view) (download)

1 : mblume 1902 (* reg-exp-fn.sml
2 :     *
3 :     * COPYRIGHT (c) 2005
4 :     * John Reppy (http://www.cs.uchicago.edu/~jhr)
5 :     * Aaron Turon (adrassi@gmail.com)
6 :     * All rights reserved.
7 :     *
8 :     * Regular expression representation and manipulation.
9 :     *
10 :     * The main points here are to:
11 :     * (1) make it easy for an RE parser to construct
12 :     * RE expressions
13 :     * (2) canonicalize REs for effective comparison
14 :     * (3) implement the RE derivatives algorithm
15 :     *
16 :     * See the implementation notes for details on the derivatives
17 :     * algorithm and the canonicalization strategy.
18 :     *)
19 :    
20 :     structure RegExp : REG_EXP =
21 :     struct
22 :    
23 :     (* symbols (i.e., words) *)
24 :     structure Sym =
25 :     struct
26 :    
27 :     structure W32 = Word32
28 :     type point = W32.word
29 :    
30 :     val compare = W32.compare
31 :     val minPt : W32.word = 0w0
32 :     val maxPt = W32.notb 0w0
33 :    
34 :     fun succ (w : W32.word) =
35 :     if w = W32.notb 0w0 then w
36 :     else w + 0w1
37 :     fun pred (w : W32.word) =
38 :     if w = 0w0 then w
39 :     else w - 0w1
40 :    
41 :     fun isSucc (w1, w2) = (succ w1 = w2)
42 :    
43 :     end
44 :    
45 :     structure SymSet = IntervalSetFn(Sym)
46 :    
47 :     type symbol = Sym.point
48 :     type sym_set = SymSet.set
49 :    
50 :     structure SIS = SymSet
51 :    
52 :     (* REs *)
53 :     datatype re
54 :     = Epsilon (* matches the empty string *)
55 :     | Any (* matches any single symbol *)
56 :     | None (* matches nothing (i.e. the empty language) *)
57 :     | SymSet of sym_set
58 :     | Concat of re list
59 :     | Closure of re
60 :     | Op of (rator * re list) (* list length <> 1 and in sorted order *)
61 :     | Not of re
62 :     and rator = OR | AND | XOR
63 :    
64 :     (* we give a total order to REs; this is useful for canonicalization *)
65 :     fun compare (re1, re2) = let
66 :     fun cmpOp (OR, OR) = EQUAL
67 :     | cmpOp (OR, _) = LESS
68 :     | cmpOp (_, OR) = GREATER
69 :     | cmpOp (AND, AND) = EQUAL
70 :     | cmpOp (AND, _) = LESS
71 :     | cmpOp (_, AND) = GREATER
72 :     | cmpOp (XOR, XOR) = EQUAL
73 :     fun compareList (res1, res2) =
74 :     List.collate compare (res1, res2)
75 :     in
76 :     case (re1, re2)
77 :     of (Epsilon, Epsilon) => EQUAL
78 :     | (Epsilon, _) => LESS
79 :     | (_, Epsilon) => GREATER
80 :     | (Any, Any) => EQUAL
81 :     | (Any, _) => LESS
82 :     | (_, Any) => GREATER
83 :     | (None, None) => EQUAL
84 :     | (None, _) => LESS
85 :     | (_, None) => GREATER
86 :     | (SymSet a, SymSet b) => SIS.compare(a, b)
87 :     | (SymSet a, _) => LESS
88 :     | (_, SymSet b) => GREATER
89 :     | (Concat a, Concat b) => compareList(a, b)
90 :     | (Concat a, _) => LESS
91 :     | (_, Concat b) => GREATER
92 :     | (Closure a, Closure b) => compare(a, b)
93 :     | (Closure a, _) => LESS
94 :     | (_, Closure b) => GREATER
95 :     | (Op(op1, res1), Op(op2, res2)) => (case cmpOp (op1, op2)
96 :     of EQUAL => compareList(res1, res2)
97 :     | order => order
98 :     (* end case *))
99 :     | (Op _, _) => LESS
100 :     | (_, Op _) => GREATER
101 :     | (Not a, Not b) => compare(a, b)
102 :     (* end case *)
103 :     end
104 :     (* val sort = ListMergeSort.sort (fn (re1, re2) => compare(re1, re2) = LESS) *)
105 :    
106 :     (* primitive REs *)
107 :    
108 :     val any = Any
109 :     val none = None
110 :     val epsilon = Epsilon
111 :    
112 :     (* canonical constructors *)
113 :    
114 :     fun mkSymSet c =
115 :     if SIS.isEmpty c then None
116 :     else if SIS.isUniverse c then Any
117 :     else SymSet c
118 :    
119 :     fun mkSym sym = mkSymSet (SIS.singleton sym)
120 :    
121 :     fun mkConcat (re1, re2) = (case (re1, re2)
122 :     of (Epsilon, re2) => re2
123 :     | (re1, Epsilon) => re1
124 :     | (None, _) => None
125 :     | (_, None) => None
126 :     | (Concat res1, Concat res2) => Concat(res1@res2)
127 :     | (re1, Concat res2) => Concat(re1::res2)
128 :     | (Concat res1, re2) => Concat(res1@[re2])
129 :     | _ => Concat[re1, re2]
130 :     (* end case *))
131 :    
132 :     fun mkConcatList [] = Epsilon
133 :     | mkConcatList (re::res) = mkConcat(re, mkConcatList res)
134 :    
135 :     fun mkClosure (Epsilon) = Epsilon
136 :     | mkClosure (None) = Epsilon
137 :     | mkClosure (re as Closure _) = re
138 :     | mkClosure re = Closure re
139 :    
140 : mblume 1904 fun mergeSIS (inRes, mop) = let
141 : mblume 1902 fun isSIS (SymSet _) = true
142 :     | isSIS _ = false
143 : mblume 1904 val (siss, res) = List.partition isSIS inRes
144 : mblume 1902 in case siss
145 : mblume 1904 of [] => inRes
146 :     | [re] => inRes
147 : mblume 1902 | sis::siss' => let
148 :     fun wrapmop (SymSet s1, SymSet s2) =
149 :     SymSet (mop (s1, s2))
150 :     | wrapmop _ = raise Fail "BUG: wrapmop: SymSet expected"
151 :     val merged = List.foldl wrapmop sis siss'
152 :     fun reinsert (re1, []) = [re1]
153 :     | reinsert (re1, re::res) = (case compare (re1, re)
154 :     of LESS => re1::re::res
155 :     | EQUAL => raise Fail "BUG: mergeSIS: only one SymSet expected"
156 :     | GREATER => re::(reinsert (re1, res))
157 :     (* end case *))
158 :     in reinsert (merged, res)
159 :     end
160 :     end
161 :    
162 :     fun mkOr (re1, re2) = let
163 :     fun merge ([], res2) = res2
164 :     | merge (res1, []) = res1
165 :     | merge (re1::r1, re2::r2) = (case compare(re1, re2)
166 :     of LESS => re1::merge(r1, re2::r2)
167 :     | EQUAL => merge (re1::r1, r2)
168 :     | GREATER => re2 :: merge(re1::r1, r2)
169 :     (* end case *))
170 :     fun mk (a, b) = (case mergeSIS(merge(a, b), SIS.union)
171 :     of [] => None
172 :     | [re] => re
173 :     | res => Op(OR, res)
174 :     (* end case *))
175 :     in
176 :     case (re1, re2)
177 :     of (None, _) => re2
178 :     | (_, None) => re1
179 :     | (SymSet s1, SymSet s2) => mkSymSet (SIS.union (s1, s2))
180 :     | (Op(OR, res1), Op(OR, res2)) => mk(res1, res2)
181 :     | (Op(OR, res1), _) => mk(res1, [re2])
182 :     | (_, Op(OR, res2)) => mk([re1], res2)
183 :     | (re1, re2) => (case compare(re1, re2)
184 :     of LESS => Op(OR, [re1, re2])
185 :     | EQUAL => re1
186 :     | GREATER => Op(OR, [re2, re1])
187 :     (* end case *))
188 :     (* end case *)
189 :     end
190 :    
191 :     fun mkAnd (re1, re2) = let
192 :     fun merge ([], res2) = res2
193 :     | merge (res1, []) = res1
194 :     | merge (re1::r1, re2::r2) = (case compare(re1, re2)
195 :     of LESS => re1::merge(r1, re2::r2)
196 :     | EQUAL => merge (re1::r1, r2)
197 :     | GREATER => re2 :: merge(re1::r1, r2)
198 :     (* end case *))
199 :     fun mk (a, b) = (case mergeSIS(merge(a, b), SIS.intersect)
200 :     of [] => None
201 :     | [re] => re
202 :     | res => Op(AND, res)
203 :     (* end case *))
204 :     in
205 :     case (re1, re2)
206 :     of (None, _) => None
207 :     | (_, None) => None
208 :     | (SymSet s1, SymSet s2) => mkSymSet (SIS.intersect (s1, s2))
209 :     | (Op(AND, res1), Op(AND, res2)) => mk(res1, res2)
210 :     | (Op(AND, res1), _) => mk(res1, [re2])
211 :     | (_, Op(AND, res2)) => mk([re1], res2)
212 :     | (re1, re2) => (case compare(re1, re2)
213 :     of LESS => Op(AND, [re1, re2])
214 :     | EQUAL => re1
215 :     | GREATER => Op(AND, [re2, re1])
216 :     (* end case *))
217 :     (* end case *)
218 :     end
219 :    
220 :     fun mkXor (re1, re2) = let
221 :     fun merge ([], res2) = res2
222 :     | merge (res1, []) = res1
223 :     | merge (re1::r1, re2::r2) = (case compare(re1, re2)
224 :     of LESS => re1::merge(r1, re2::r2)
225 :     | EQUAL => merge (r1, r2)
226 :     | GREATER => re2 :: merge(re1::r1, r2)
227 :     (* end case *))
228 :     fun mk (a, b) = (case merge(a, b)
229 :     of [] => None
230 :     | [re] => re
231 :     | res => Op(XOR, res)
232 :     (* end case *))
233 :     in
234 :     case (re1, re2)
235 :     of (None, _) => re2
236 :     | (_, None) => re1
237 :     | (SymSet s1, SymSet s2) =>
238 :     mkSymSet (SIS.intersect (
239 :     SIS.union (s1, s2),
240 :     SIS.complement (SIS.intersect (s1, s2))
241 :     ))
242 :     | (Op(XOR, res1), Op(XOR, res2)) => mk(res1, res2)
243 :     | (Op(XOR, res1), _) => mk(res1, [re2])
244 :     | (_, Op(XOR, res2)) => mk([re1], res2)
245 :     | (re1, re2) => (case compare(re1, re2)
246 :     of LESS => Op(XOR, [re1, re2])
247 :     | EQUAL => None (* FIXME is this right? *)
248 :     | GREATER => Op(XOR, [re2, re1])
249 :     (* end case *))
250 :     (* end case *)
251 :     end
252 :    
253 :     fun mkOp (OR, re1, re2) = mkOr(re1, re2)
254 :     | mkOp (AND, re1, re2) = mkAnd(re1, re2)
255 :     | mkOp (XOR, re1, re2) = mkXor(re1, re2)
256 :    
257 :     fun mkNot (Not re) = re
258 :     | mkNot (None) = mkClosure(Any)
259 :     | mkNot re = Not re
260 :    
261 :     fun mkOpt re = mkOr(Epsilon, re)
262 :    
263 :     fun mkRep (re, low, high) = let
264 :     fun lowReps 0 = Epsilon
265 :     | lowReps 1 = re
266 :     | lowReps n = mkConcat (re, lowReps (n-1))
267 :     fun highReps 0 = Epsilon
268 :     | highReps 1 = mkOpt re
269 :     | highReps n = mkConcat (mkOpt re, highReps (n-1))
270 :     in
271 :     if high < low then raise Subscript
272 :     else mkConcat (lowReps low, highReps (high - low))
273 :     end
274 :    
275 :     fun mkAtLeast (re, 0) = mkClosure re
276 :     | mkAtLeast (re, n) = mkConcat (re, mkAtLeast (re, n-1))
277 :    
278 :     fun isNone None = true
279 :     | isNone _ = false
280 :    
281 :     fun symToString w = "#\"" ^ (Char.toString (Char.chr (Word32.toInt w))) ^ "\""
282 :     handle Overflow => raise Fail "(BUG) RegExp: symToString on a nonascii character"
283 :    
284 :     fun SISToString s = let
285 :     fun c2s c =
286 :     if (c < 0w128) then
287 :     Char.toString (Char.chr (Word32.toInt c))
288 :     else
289 :     String.concat ["\\u", Word32.toString c]
290 :     fun f (a, b) =
291 :     if a=b then c2s a
292 :     else concat[c2s a, "-", c2s b]
293 :     (* we want to describe the interval set as concisely as possible,
294 :     * so we compare the number of intervals in the set to the number
295 :     * of intervals in its complement, and use the smaller of the two.
296 :     *)
297 :     val intervals = SIS.intervals s
298 :     val intervals' = SIS.intervals (SIS.complement s)
299 :     val (neg, rngs) =
300 :     if List.length intervals < List.length intervals'
301 :     then ("", intervals)
302 :     else ("^", intervals')
303 :     val str = neg ^ (String.concat (List.map f (rngs)))
304 :     in
305 :     if String.size str <= 1
306 :     then str
307 :     else "[" ^ str ^ "]"
308 :     end
309 :    
310 :     fun toString re = let
311 :     fun opToString OR = "|"
312 :     | opToString AND = "&"
313 :     | opToString XOR = "^"
314 :     fun opPrec OR = 0
315 :     | opPrec AND = 2
316 :     | opPrec XOR = 1
317 :     fun prec Any = 6
318 :     | prec None = 6
319 :     | prec Epsilon = 6
320 :     | prec (SymSet _) = 6
321 :     | prec (Concat[]) = 6
322 :     | prec (Concat _) = 3
323 :     | prec (Closure _) = 5
324 :     | prec (Op(_, [])) = 6
325 :     | prec (Op(_, [re])) = prec re
326 :     | prec (Op(rator, _)) = opPrec rator
327 :     | prec (Not _) = 4
328 :     fun toS (Any, l) = "{any}" :: l
329 :     | toS (None, l) = "{none}" :: l
330 :     | toS (Epsilon, l) = "{epsilon}" :: l
331 :     | toS (SymSet s, l) = SISToString s :: l
332 :     | toS (Concat[], l) = "" :: l
333 :     | toS (Concat[re], l) = toS(re, l)
334 :     | toS (Concat res, l) = toS'(res, 3, "", l)
335 :     | toS (Closure re, l) = paren(5, re, "*" :: l)
336 :     | toS (Op(_, []), l) = "{}" :: l
337 :     | toS (Op(rator, [re]), l) = toS(re, l)
338 :     | toS (Op(rator, res), l) = toS'(res, opPrec rator, opToString rator, l)
339 :     | toS (Not re, l) = "!" :: paren(4, re, l)
340 :     and toS' ([], p, rator, l) = raise Fail "empty"
341 :     | toS' (re::r, p, rator, l) =
342 :     paren(p, re, List.foldr
343 :     (fn (re, l) => rator :: paren(p, re, l))
344 :     l r)
345 :     and paren (p, re, l) = if (p <= prec re)
346 :     then toS (re, l)
347 :     else "(" :: toS(re, ")" :: l)
348 :     in
349 :     String.concat(toS(re, []))
350 :     end
351 :    
352 :     (* true iff epsilon is in the language recognized by the RE *)
353 :     fun nullable Any = false
354 :     | nullable None = false
355 :     | nullable Epsilon = true
356 :     | nullable (SymSet _) = false
357 :     | nullable (Closure _) = true
358 :     | nullable (Concat res) = List.all nullable res
359 :     | nullable (Op(OR, res)) = List.exists nullable res
360 :     | nullable (Op(AND, res)) = List.all nullable res
361 :     | nullable (Op(XOR, re::r)) =
362 :     (nullable re andalso not(List.exists nullable r))
363 :     orelse nullable(Op(XOR, r))
364 :     | nullable (Op(XOR, [])) = raise Fail "(BUG) RegExp: RE operator has no operands"
365 :     | nullable (Not re) = not(nullable re)
366 :    
367 :     fun delta re = if (nullable re) then Epsilon else None
368 :    
369 :     (* compute derivative w.r.t. a symbol *)
370 :     fun derivative a = let
371 :     fun da Any = Epsilon
372 :     | da None = None
373 :     | da Epsilon = None
374 :     | da (SymSet s) = if SIS.member(s, a) then Epsilon else None
375 :     | da (re as Closure re') = mkConcat(da re', re)
376 :     | da (Concat[]) = None
377 :     | da (Concat[re]) = da re
378 :     | da (Concat(re::res)) =
379 :     mkOr(
380 :     mkConcatList((da re)::res),
381 :     mkConcat(delta re, da(Concat res)))
382 :     | da (Op(_, [])) = raise Fail "(BUG) RegExp: RE operator has no operands"
383 :     | da (Op(rator, [re])) = da re
384 :     | da (Op(rator, re::res)) = mkOp(rator, da re, da(Op(rator, res)))
385 :     | da (Not re) = mkNot(da re)
386 :     in
387 :     da
388 :     end
389 :    
390 :     structure Map = RedBlackMapFn (
391 :     struct
392 :     type ord_key = re Vector.vector
393 :     val compare = Vector.collate compare
394 :     end)
395 :    
396 :     (* yields the smallest partitioning of the alphabet that
397 :     * "respects" the given sets. if S is one of the sets
398 :     * returned by compress, then it must be either disjoint
399 :     * with or a subset of each of the sets in the sets
400 :     * parameter. see the implementation notes for more detail.
401 :     *)
402 :     fun compress sets = let
403 :     (* performs partition of a set againt a list of sets,
404 :     * assuming the list of sets is pairwise disjoint.
405 :     *)
406 :     fun part1 (set, []) =
407 :     if SIS.isEmpty set then []
408 :     else [set]
409 :     | part1 (set1, set2 :: ss) =
410 :     if SIS.isEmpty set1 then
411 :     set2 :: ss
412 :     else let
413 :     val i = SIS.intersect (set1, set2)
414 :     in if SIS.isEmpty i then
415 :     (set2 :: (part1 (set1, ss)))
416 :     else let
417 :     val s1 = SIS.difference (set1, i)
418 :     val s2 = SIS.difference (set2, i)
419 :     val ss' = if SIS.isEmpty s1 then ss
420 :     else part1 (s1, ss)
421 :     in if SIS.isEmpty s2 then
422 :     (i :: ss')
423 :     else
424 :     (i :: s2 :: ss')
425 :     end
426 :     end
427 :     in
428 :     List.foldl part1 [] (SIS.universe::sets)
429 :     end
430 :    
431 :     fun derivatives (res : re Vector.vector) = let
432 :     (* ds is the "factoring function" *)
433 :     fun ds Any = [SIS.universe]
434 :     | ds None = []
435 :     | ds Epsilon = []
436 :     | ds (SymSet s) = [s]
437 :     | ds (Closure re) = ds re
438 :     | ds (Concat []) = []
439 :     | ds (Concat [re]) = ds re
440 :     | ds (Concat (re::res)) =
441 :     if nullable re then
442 :     (ds re) @ (ds (Concat res))
443 :     else ds re
444 :     | ds (Op(rator, res)) = List.concat (map ds res)
445 :     | ds (Not re) = ds re
446 :     val sets = Vector.foldl
447 :     (fn (re, sets) => (ds re) @ sets)
448 :     [] res
449 :     val sets' = compress sets
450 :     fun classes ([], classMap) = Map.listItemsi classMap
451 :     | classes (set::sets, classMap) = let
452 :     (* use first element as representative of the equiv class *)
453 :     val (rep, _) = List.hd (SIS.intervals set)
454 :     val derivs = Vector.map (derivative rep) res
455 :     in case Map.find (classMap, derivs)
456 :     of NONE =>
457 :     classes (sets, Map.insert(classMap, derivs, set))
458 :     | SOME set' => let
459 :     val map' = Map.insert(classMap,
460 :     derivs,
461 :     SIS.union (set, set'))
462 :     in classes (sets, map')
463 :     end
464 :     end
465 :     in
466 :     classes (sets', Map.empty)
467 :     end
468 :    
469 :     end
470 : mblume 1904
471 :    

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0