(* reg-exp-fn.sml * * COPYRIGHT (c) 2005 * John Reppy (http://www.cs.uchicago.edu/~jhr) * Aaron Turon (adrassi@gmail.com) * All rights reserved. * * Regular expression representation and manipulation. * * The main points here are to: * (1) make it easy for an RE parser to construct * RE expressions * (2) canonicalize REs for effective comparison * (3) implement the RE derivatives algorithm * * See the implementation notes for details on the derivatives * algorithm and the canonicalization strategy. *) structure RegExp : REG_EXP = struct (* symbols (i.e., words) *) structure Sym = struct structure W32 = Word32 type point = W32.word val compare = W32.compare val minPt : W32.word = 0w0 val maxPt = W32.notb 0w0 fun succ (w : W32.word) = if w = W32.notb 0w0 then w else w + 0w1 fun pred (w : W32.word) = if w = 0w0 then w else w - 0w1 fun isSucc (w1, w2) = (succ w1 = w2) end structure SymSet = IntervalSetFn(Sym) type symbol = Sym.point type sym_set = SymSet.set structure SIS = SymSet (* REs *) datatype re = Epsilon (* matches the empty string *) | Any (* matches any single symbol *) | None (* matches nothing (i.e. the empty language) *) | SymSet of sym_set | Concat of re list | Closure of re | Op of (rator * re list) (* list length <> 1 and in sorted order *) | Not of re and rator = OR | AND | XOR (* we give a total order to REs; this is useful for canonicalization *) fun compare (re1, re2) = let fun cmpOp (OR, OR) = EQUAL | cmpOp (OR, _) = LESS | cmpOp (_, OR) = GREATER | cmpOp (AND, AND) = EQUAL | cmpOp (AND, _) = LESS | cmpOp (_, AND) = GREATER | cmpOp (XOR, XOR) = EQUAL fun compareList (res1, res2) = List.collate compare (res1, res2) in case (re1, re2) of (Epsilon, Epsilon) => EQUAL | (Epsilon, _) => LESS | (_, Epsilon) => GREATER | (Any, Any) => EQUAL | (Any, _) => LESS | (_, Any) => GREATER | (None, None) => EQUAL | (None, _) => LESS | (_, None) => GREATER | (SymSet a, SymSet b) => SIS.compare(a, b) | (SymSet a, _) => LESS | (_, SymSet b) => GREATER | (Concat a, Concat b) => compareList(a, b) | (Concat a, _) => LESS | (_, Concat b) => GREATER | (Closure a, Closure b) => compare(a, b) | (Closure a, _) => LESS | (_, Closure b) => GREATER | (Op(op1, res1), Op(op2, res2)) => (case cmpOp (op1, op2) of EQUAL => compareList(res1, res2) | order => order (* end case *)) | (Op _, _) => LESS | (_, Op _) => GREATER | (Not a, Not b) => compare(a, b) (* end case *) end (* val sort = ListMergeSort.sort (fn (re1, re2) => compare(re1, re2) = LESS) *) (* primitive REs *) val any = Any val none = None val epsilon = Epsilon (* canonical constructors *) fun mkSymSet c = if SIS.isEmpty c then None else if SIS.isUniverse c then Any else SymSet c fun mkSym sym = mkSymSet (SIS.singleton sym) fun mkConcat (re1, re2) = (case (re1, re2) of (Epsilon, re2) => re2 | (re1, Epsilon) => re1 | (None, _) => None | (_, None) => None | (Concat res1, Concat res2) => Concat(res1@res2) | (re1, Concat res2) => Concat(re1::res2) | (Concat res1, re2) => Concat(res1@[re2]) | _ => Concat[re1, re2] (* end case *)) fun mkConcatList [] = Epsilon | mkConcatList (re::res) = mkConcat(re, mkConcatList res) fun mkClosure (Epsilon) = Epsilon | mkClosure (None) = Epsilon | mkClosure (re as Closure _) = re | mkClosure re = Closure re fun mergeSIS (inRes, mop) = let fun isSIS (SymSet _) = true | isSIS _ = false val (siss, res) = List.partition isSIS inRes in case siss of [] => inRes | [re] => inRes | sis::siss' => let fun wrapmop (SymSet s1, SymSet s2) = SymSet (mop (s1, s2)) | wrapmop _ = raise Fail "BUG: wrapmop: SymSet expected" val merged = List.foldl wrapmop sis siss' fun reinsert (re1, []) = [re1] | reinsert (re1, re::res) = (case compare (re1, re) of LESS => re1::re::res | EQUAL => raise Fail "BUG: mergeSIS: only one SymSet expected" | GREATER => re::(reinsert (re1, res)) (* end case *)) in reinsert (merged, res) end end fun mkOr (re1, re2) = let fun merge ([], res2) = res2 | merge (res1, []) = res1 | merge (re1::r1, re2::r2) = (case compare(re1, re2) of LESS => re1::merge(r1, re2::r2) | EQUAL => merge (re1::r1, r2) | GREATER => re2 :: merge(re1::r1, r2) (* end case *)) fun mk (a, b) = (case mergeSIS(merge(a, b), SIS.union) of [] => None | [re] => re | res => Op(OR, res) (* end case *)) in case (re1, re2) of (None, _) => re2 | (_, None) => re1 | (SymSet s1, SymSet s2) => mkSymSet (SIS.union (s1, s2)) | (Op(OR, res1), Op(OR, res2)) => mk(res1, res2) | (Op(OR, res1), _) => mk(res1, [re2]) | (_, Op(OR, res2)) => mk([re1], res2) | (re1, re2) => (case compare(re1, re2) of LESS => Op(OR, [re1, re2]) | EQUAL => re1 | GREATER => Op(OR, [re2, re1]) (* end case *)) (* end case *) end fun mkAnd (re1, re2) = let fun merge ([], res2) = res2 | merge (res1, []) = res1 | merge (re1::r1, re2::r2) = (case compare(re1, re2) of LESS => re1::merge(r1, re2::r2) | EQUAL => merge (re1::r1, r2) | GREATER => re2 :: merge(re1::r1, r2) (* end case *)) fun mk (a, b) = (case mergeSIS(merge(a, b), SIS.intersect) of [] => None | [re] => re | res => Op(AND, res) (* end case *)) in case (re1, re2) of (None, _) => None | (_, None) => None | (SymSet s1, SymSet s2) => mkSymSet (SIS.intersect (s1, s2)) | (Op(AND, res1), Op(AND, res2)) => mk(res1, res2) | (Op(AND, res1), _) => mk(res1, [re2]) | (_, Op(AND, res2)) => mk([re1], res2) | (re1, re2) => (case compare(re1, re2) of LESS => Op(AND, [re1, re2]) | EQUAL => re1 | GREATER => Op(AND, [re2, re1]) (* end case *)) (* end case *) end fun mkXor (re1, re2) = let fun merge ([], res2) = res2 | merge (res1, []) = res1 | merge (re1::r1, re2::r2) = (case compare(re1, re2) of LESS => re1::merge(r1, re2::r2) | EQUAL => merge (r1, r2) | GREATER => re2 :: merge(re1::r1, r2) (* end case *)) fun mk (a, b) = (case merge(a, b) of [] => None | [re] => re | res => Op(XOR, res) (* end case *)) in case (re1, re2) of (None, _) => re2 | (_, None) => re1 | (SymSet s1, SymSet s2) => mkSymSet (SIS.intersect ( SIS.union (s1, s2), SIS.complement (SIS.intersect (s1, s2)) )) | (Op(XOR, res1), Op(XOR, res2)) => mk(res1, res2) | (Op(XOR, res1), _) => mk(res1, [re2]) | (_, Op(XOR, res2)) => mk([re1], res2) | (re1, re2) => (case compare(re1, re2) of LESS => Op(XOR, [re1, re2]) | EQUAL => None (* FIXME is this right? *) | GREATER => Op(XOR, [re2, re1]) (* end case *)) (* end case *) end fun mkOp (OR, re1, re2) = mkOr(re1, re2) | mkOp (AND, re1, re2) = mkAnd(re1, re2) | mkOp (XOR, re1, re2) = mkXor(re1, re2) fun mkNot (Not re) = re | mkNot (None) = mkClosure(Any) | mkNot re = Not re fun mkOpt re = mkOr(Epsilon, re) fun mkRep (re, low, high) = let fun lowReps 0 = Epsilon | lowReps 1 = re | lowReps n = mkConcat (re, lowReps (n-1)) fun highReps 0 = Epsilon | highReps 1 = mkOpt re | highReps n = mkConcat (mkOpt re, highReps (n-1)) in if high < low then raise Subscript else mkConcat (lowReps low, highReps (high - low)) end fun mkAtLeast (re, 0) = mkClosure re | mkAtLeast (re, n) = mkConcat (re, mkAtLeast (re, n-1)) fun isNone None = true | isNone _ = false fun symToString w = "#\"" ^ (Char.toString (Char.chr (Word32.toInt w))) ^ "\"" handle Overflow => raise Fail "(BUG) RegExp: symToString on a nonascii character" fun SISToString s = let fun c2s c = if (c < 0w128) then Char.toString (Char.chr (Word32.toInt c)) else String.concat ["\\u", Word32.toString c] fun f (a, b) = if a=b then c2s a else concat[c2s a, "-", c2s b] (* we want to describe the interval set as concisely as possible, * so we compare the number of intervals in the set to the number * of intervals in its complement, and use the smaller of the two. *) val intervals = SIS.intervals s val intervals' = SIS.intervals (SIS.complement s) val (neg, rngs) = if List.length intervals < List.length intervals' then ("", intervals) else ("^", intervals') val str = neg ^ (String.concat (List.map f (rngs))) in if String.size str <= 1 then str else "[" ^ str ^ "]" end fun toString re = let fun opToString OR = "|" | opToString AND = "&" | opToString XOR = "^" fun opPrec OR = 0 | opPrec AND = 2 | opPrec XOR = 1 fun prec Any = 6 | prec None = 6 | prec Epsilon = 6 | prec (SymSet _) = 6 | prec (Concat[]) = 6 | prec (Concat _) = 3 | prec (Closure _) = 5 | prec (Op(_, [])) = 6 | prec (Op(_, [re])) = prec re | prec (Op(rator, _)) = opPrec rator | prec (Not _) = 4 fun toS (Any, l) = "{any}" :: l | toS (None, l) = "{none}" :: l | toS (Epsilon, l) = "{epsilon}" :: l | toS (SymSet s, l) = SISToString s :: l | toS (Concat[], l) = "" :: l | toS (Concat[re], l) = toS(re, l) | toS (Concat res, l) = toS'(res, 3, "", l) | toS (Closure re, l) = paren(5, re, "*" :: l) | toS (Op(_, []), l) = "{}" :: l | toS (Op(rator, [re]), l) = toS(re, l) | toS (Op(rator, res), l) = toS'(res, opPrec rator, opToString rator, l) | toS (Not re, l) = "!" :: paren(4, re, l) and toS' ([], p, rator, l) = raise Fail "empty" | toS' (re::r, p, rator, l) = paren(p, re, List.foldr (fn (re, l) => rator :: paren(p, re, l)) l r) and paren (p, re, l) = if (p <= prec re) then toS (re, l) else "(" :: toS(re, ")" :: l) in String.concat(toS(re, [])) end (* true iff epsilon is in the language recognized by the RE *) fun nullable Any = false | nullable None = false | nullable Epsilon = true | nullable (SymSet _) = false | nullable (Closure _) = true | nullable (Concat res) = List.all nullable res | nullable (Op(OR, res)) = List.exists nullable res | nullable (Op(AND, res)) = List.all nullable res | nullable (Op(XOR, re::r)) = (nullable re andalso not(List.exists nullable r)) orelse nullable(Op(XOR, r)) | nullable (Op(XOR, [])) = raise Fail "(BUG) RegExp: RE operator has no operands" | nullable (Not re) = not(nullable re) fun delta re = if (nullable re) then Epsilon else None (* compute derivative w.r.t. a symbol *) fun derivative a = let fun da Any = Epsilon | da None = None | da Epsilon = None | da (SymSet s) = if SIS.member(s, a) then Epsilon else None | da (re as Closure re') = mkConcat(da re', re) | da (Concat[]) = None | da (Concat[re]) = da re | da (Concat(re::res)) = mkOr( mkConcatList((da re)::res), mkConcat(delta re, da(Concat res))) | da (Op(_, [])) = raise Fail "(BUG) RegExp: RE operator has no operands" | da (Op(rator, [re])) = da re | da (Op(rator, re::res)) = mkOp(rator, da re, da(Op(rator, res))) | da (Not re) = mkNot(da re) in da end structure Map = RedBlackMapFn ( struct type ord_key = re Vector.vector val compare = Vector.collate compare end) (* yields the smallest partitioning of the alphabet that * "respects" the given sets. if S is one of the sets * returned by compress, then it must be either disjoint * with or a subset of each of the sets in the sets * parameter. see the implementation notes for more detail. *) fun compress sets = let (* performs partition of a set againt a list of sets, * assuming the list of sets is pairwise disjoint. *) fun part1 (set, []) = if SIS.isEmpty set then [] else [set] | part1 (set1, set2 :: ss) = if SIS.isEmpty set1 then set2 :: ss else let val i = SIS.intersect (set1, set2) in if SIS.isEmpty i then (set2 :: (part1 (set1, ss))) else let val s1 = SIS.difference (set1, i) val s2 = SIS.difference (set2, i) val ss' = if SIS.isEmpty s1 then ss else part1 (s1, ss) in if SIS.isEmpty s2 then (i :: ss') else (i :: s2 :: ss') end end in List.foldl part1 [] (SIS.universe::sets) end fun derivatives (res : re Vector.vector) = let (* ds is the "factoring function" *) fun ds Any = [SIS.universe] | ds None = [] | ds Epsilon = [] | ds (SymSet s) = [s] | ds (Closure re) = ds re | ds (Concat []) = [] | ds (Concat [re]) = ds re | ds (Concat (re::res)) = if nullable re then (ds re) @ (ds (Concat res)) else ds re | ds (Op(rator, res)) = List.concat (map ds res) | ds (Not re) = ds re val sets = Vector.foldl (fn (re, sets) => (ds re) @ sets) [] res val sets' = compress sets fun classes ([], classMap) = Map.listItemsi classMap | classes (set::sets, classMap) = let (* use first element as representative of the equiv class *) val (rep, _) = List.hd (SIS.intervals set) val derivs = Vector.map (derivative rep) res in case Map.find (classMap, derivs) of NONE => classes (sets, Map.insert(classMap, derivs, set)) | SOME set' => let val map' = Map.insert(classMap, derivs, SIS.union (set, set')) in classes (sets, map') end end in classes (sets', Map.empty) end end