Home My Page Projects Code Snippets Project Openings SML/NJ
Summary Activity Forums Tracker Lists Tasks Docs Surveys News SCM Files

SCM Repository

[smlnj] Annotation of /smlnj-lib/trunk/RegExp/BackEnd/thompson-engine.sml
ViewVC logotype

Annotation of /smlnj-lib/trunk/RegExp/BackEnd/thompson-engine.sml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3190 - (view) (download)

1 : jhr 2958 (* thompson-engine.sml
2 :     *
3 :     * COPYRIGHT (c) 2008 The Fellowship of SML/NJ (http://www.smlnj.org)
4 :     * All rights reserved.
5 :     *
6 :     * This is an implementation of Ken Thompson's RE matchine algorithm from
7 :     * CACM (1968). It is based on the description of the algorithm by Russ
8 :     * Cox at http://swtch.com/~rsc/regexp/regexp1.html.
9 :     *)
10 :    
11 :     structure ThompsonEngine : REGEXP_ENGINE =
12 :     struct
13 :    
14 :     structure RE = RegExpSyntax
15 :     structure CSet = RE.CharSet
16 :     structure M = MatchTree
17 :    
18 :     (* a match specifies the position (as a stream) and the length of the match *)
19 :     type 'a match = {pos : 'a, len : int} MatchTree.match_tree
20 :    
21 :     (* intermediate representation of states *)
22 : jhr 3014 datatype state_kind
23 :     = CHR' of (char * state' ref)
24 :     | CSET' of (CSet.set * state' ref)
25 :     | NCSET' of (CSet.set * state' ref)
26 :     | SPLIT' of (state' ref * state' ref)
27 :     | BOL' of state' ref (* assert beginning of line *)
28 :     | EOL' of state' ref (* assert end of line *)
29 : jhr 2958 | FINAL'
30 :    
31 : jhr 3014 withtype state' = {id : int, kind : state_kind}
32 :    
33 : jhr 2958 type frag = {start : state', out : state' ref list}
34 :    
35 :     (* return the ID of a state *)
36 : jhr 3014 fun idOf {id, kind} = id
37 : jhr 2958
38 : jhr 3014 val final = {id = 0, kind = FINAL'}
39 :    
40 : jhr 2958 (* interpreter representation of states *)
41 :     datatype state
42 :     = CHR of (char * int)
43 :     | CSET of (CSet.set * int)
44 :     | NCSET of (CSet.set * int)
45 :     | SPLIT of (int * int)
46 : jhr 3014 | BOL of int (* assert beginning of line *)
47 :     | EOL of int (* assert end of line *)
48 : jhr 2958 | FINAL
49 :    
50 : jhr 3014 fun cvtState {id, kind} = (case kind
51 :     of CHR'(c, out) => CHR(c, idOf(!out))
52 :     | CSET'(cset, out) => CSET(cset, idOf(!out))
53 :     | NCSET'(cset, out) => NCSET(cset, idOf(!out))
54 :     | SPLIT'(out1, out2) => SPLIT(idOf(!out1), idOf(!out2))
55 :     | BOL' out => BOL(idOf(!out))
56 :     | EOL' out => EOL(idOf(!out))
57 :     | FINAL' => FINAL
58 :     (* end case *))
59 : jhr 2958
60 :     datatype regexp = RE of {start : int, states : state vector}
61 :    
62 :     fun compile re = let
63 :     (* the list of states; state 0 is always the accepting state *)
64 :     val nStates = ref 1
65 : jhr 3014 val states = ref [final]
66 : jhr 2958 (* create new states *)
67 : jhr 3014 fun new kind = let
68 : jhr 2958 val id = !nStates
69 : jhr 3014 val s = {id = id, kind = kind}
70 : jhr 2958 in
71 :     states := s :: !states;
72 :     nStates := id+1;
73 :     s
74 :     end
75 : jhr 3014 fun newChr (c, out) = new (CHR'(c, out))
76 :     fun newCset (cset, out) = new (CSET'(cset, out))
77 :     fun newNcset (cset, out) = new (NCSET'(cset, out))
78 :     fun newSplit (out1, out2) = new (SPLIT'(out1, out2))
79 :     fun newBOL out = new (BOL' out)
80 :     fun newEOL out = new (EOL' out)
81 : jhr 2958 (* update the outputs of a fragment *)
82 :     fun setOuts (f : frag, s : state') = List.app (fn r => r := s) (#out f)
83 :     (* compile an RE *)
84 :     fun reComp re = (case re
85 :     of RE.Group re => reComp re
86 : jhr 3016 | RE.Alt[] => raise Fail "empty alternative"
87 : jhr 2958 | RE.Alt[re] => reComp re
88 :     | RE.Alt(re::rest) => let
89 :     val f1 = reComp re
90 :     val f2 = reComp (RE.Alt rest)
91 :     val s = newSplit(ref(#start f1), ref(#start f2))
92 :     in
93 :     {start = s, out = #out f1 @ #out f2}
94 :     end
95 : jhr 3016 | RE.Concat[] => raise Fail "empty concatenation"
96 : jhr 2958 | RE.Concat[re] => reComp re
97 : jhr 3190 | RE.Concat(re::rest) => cat (re, RE.Concat rest)
98 : jhr 2958 | RE.Interval(re, 0, SOME 1) => option re
99 :     | RE.Interval(re, 0, NONE) => closure re
100 :     | RE.Interval(re, 1, NONE) => posClosure re
101 : jhr 3190 | RE.Interval(re, i, SOME j) => raise Fail "unimplemented"
102 : jhr 2958 | RE.Option re => option re
103 :     | RE.Star re => closure re
104 :     | RE.Plus re => posClosure re
105 :     | RE.MatchSet cset => let
106 : jhr 3014 val out = ref final
107 : jhr 2958 in
108 :     {start = newCset(cset, out), out = [out]}
109 :     end
110 :     | RE.NonmatchSet cset => let
111 : jhr 3014 val out = ref final
112 : jhr 2958 in
113 :     {start = newNcset(cset, out), out = [out]}
114 :     end
115 :     | RE.Char c => let
116 : jhr 3014 val out = ref final
117 : jhr 2958 in
118 :     {start = newChr(c, out), out = [out]}
119 :     end
120 : jhr 3014 | RE.Begin => let
121 :     val out = ref final
122 :     in
123 :     {start = newBOL out, out = [out]}
124 :     end
125 : jhr 2958 | RE.End => raise Fail "End"
126 :     (* end case *))
127 : jhr 3190 (* compile re1 . re2 *)
128 :     and cat (re1, re2) = let
129 :     val f1 = reComp re1
130 :     val f2 = reComp re2
131 :     in
132 :     setOuts (f1, #start f2);
133 :     {start = #start f1, out = #out f2}
134 :     end
135 : jhr 2958 (* compile re? *)
136 :     and option re = let
137 :     val f = reComp re
138 : jhr 3014 val out = ref final
139 : jhr 2958 val s = newSplit(ref(#start f), out)
140 :     in
141 :     {start = s, out = out :: #out f}
142 :     end
143 :     (* compile re* *)
144 :     and closure re = let
145 :     val f = reComp re
146 : jhr 3014 val out = ref final
147 : jhr 2958 val s = newSplit(ref(#start f), out)
148 :     in
149 :     setOuts (f, s);
150 :     {start = s, out = [out]}
151 :     end
152 :     (* compile re+ *)
153 :     and posClosure re = let
154 :     val f = reComp re
155 : jhr 3014 val out = ref final
156 : jhr 2958 val s = newSplit(ref(#start f), out)
157 :     in
158 :     setOuts (f, s);
159 :     {start = #start f, out = [out]}
160 :     end
161 :     (* generate the intermediate state representation *)
162 :     val frag = reComp re
163 : jhr 3014 val _ = setOuts (frag, final)
164 : jhr 2958 (* convert the states to the final representation; note that we reverse the list
165 :     * so that the states are now in increasing order.
166 :     *)
167 :     val states = List.foldl (fn (s, l) => cvtState s :: l) [] (!states)
168 :     in
169 :     RE{ start = idOf(#start frag), states = Vector.fromList states }
170 :     end
171 :    
172 : jhr 3190 (* +DEBUG *)
173 :     fun stateToString (CHR(c, out)) =
174 :     concat["CHR (#\"", Char.toString c, "\", ", Int.toString out, ")"]
175 :     | stateToString (CSET(cs, out)) = concat["CSET (-, ", Int.toString out, ")"]
176 :     | stateToString (NCSET(cs, out)) = concat["NCSET (-, ", Int.toString out, ")"]
177 :     | stateToString (SPLIT(out1, out2)) =
178 :     concat["SPLIT (", Int.toString out1, ", ", Int.toString out2, ")"]
179 :     | stateToString (BOL out) = concat["BOL ", Int.toString out]
180 :     | stateToString (EOL out) = concat["EOL ", Int.toString out]
181 :     | stateToString FINAL = "FINAL"
182 :     fun dump (RE{start, states}) = let
183 :     fun prState st = print(stateToString st)
184 :     in
185 :     print(concat["start = ", Int.toString start, "\n"]);
186 :     Vector.appi (fn (i, st) => (print(Int.toString i ^ ": "); prState st; print "\n"))
187 :     states
188 :     end
189 :     (* -DEBUG *)
190 :    
191 :     (* scan the stream for the first occurrence of the regular expression *)
192 :     fun scan (RE{start, states}, getc : (char,'a) StringCvt.reader) = let
193 :     (*val _ = dump (RE{start=start, states=states})*)
194 : jhr 2958 (* to make elimination of duplicates in a state set cheap, we map state IDs
195 :     * to a stamp of the last set that they were added to.
196 :     *)
197 :     val stamp = ref 0w1
198 :     val lastStamp = Array.array(Vector.length states, 0w0)
199 :     fun addState (stamp', stateList, id) =
200 :     if (Array.sub(lastStamp, id) = stamp')
201 :     then stateList
202 :     else (
203 :     Array.update(lastStamp, id, stamp');
204 :     case Vector.sub(states, id)
205 :     of SPLIT(out1, out2) =>
206 :     addState (stamp', addState (stamp', stateList, out1), out2)
207 :     | state => state :: stateList
208 :     (* end case *))
209 : jhr 3190 fun startState () = let
210 :     val stamp' = !stamp
211 :     in
212 :     stamp := stamp' + 0w1;
213 :     addState (stamp', [], start)
214 :     end
215 : jhr 2958 fun isMatch stamp' = (Array.sub(lastStamp, 0) = stamp')
216 : jhr 3190 (* attempt to match the RE starting with the stream startPos *)
217 :     fun find' (isFirst, startPos) = let
218 :     fun scan (_, _, _, lastAccepting, []) = lastAccepting
219 :     | scan (isFirst, n, strm, lastAccepting, nfaState) = (case getc strm
220 : jhr 2958 of NONE => if isMatch (!stamp)
221 :     then SOME(n, startPos)
222 :     else lastAccepting
223 :     | SOME(c, strm') => let
224 :     val stamp' = !stamp
225 :     val _ = (stamp := stamp' + 0w1)
226 :     fun test ([], nextStates) = nextStates
227 : jhr 3190 | test (s::r, nextStates) = let
228 :     fun continue nextStates = test(r, nextStates)
229 :     fun add out = continue(addState (stamp', nextStates, out))
230 :     in
231 :     case s
232 :     of CHR(c', out) => if (c = c')
233 :     then add out
234 :     else continue nextStates
235 :     | CSET(cset, out) => if CSet.member(cset, c)
236 :     then add out
237 :     else continue nextStates
238 :     | NCSET(cset, out) => if CSet.member(cset, c)
239 :     then continue nextStates
240 :     else add out
241 :     | BOL out => if isFirst
242 :     then test(Vector.sub(states, out)::r, nextStates)
243 :     else continue nextStates
244 :     | EOL out => raise Fail "end-of-line not supported yet"
245 :     | _ => continue nextStates
246 :     (* end case *)
247 :     end
248 : jhr 2958 val nextNfaState = test (nfaState, [])
249 :     val lastAccepting = if isMatch stamp'
250 :     then SOME(n+1, startPos)
251 :     else lastAccepting
252 :     in
253 : jhr 3190 (*
254 :     print(concat[
255 :     "{", String.concatWith "," (List.map stateToString nfaState), "} -- ",
256 :     "#\"", Char.toString c, "\" --> {",
257 :     String.concatWith "," (List.map stateToString nextNfaState), "}\n"]);
258 :     *)
259 :     scan ((c = #"\n"), n+1, strm', lastAccepting, nextNfaState)
260 : jhr 2958 end
261 :     (* end case *))
262 :     in
263 : jhr 3190 case scan (isFirst, 0, startPos, NONE, startState())
264 :     of NONE => NONE
265 : jhr 2958 | SOME(n, strm) => SOME(M.Match({pos=startPos, len=n}, []), strm)
266 :     (* end case *)
267 :     end
268 :     in
269 : jhr 3190 find'
270 : jhr 2958 end
271 :    
272 :     fun find re getc stream = let
273 : jhr 3190 val scan = scan (re, getc)
274 :     fun loop (isFirst, s) = (case (scan (isFirst, s))
275 : jhr 2958 of NONE => (case (getc s)
276 : jhr 3190 of SOME(#"\n", s') => loop (true, s')
277 :     | SOME(_, s') => loop (false, s')
278 : jhr 2958 | NONE => NONE
279 :     (* end case *))
280 :     | SOME v => SOME v
281 :     (* end case *))
282 :     in
283 :     loop (true, stream)
284 :     end
285 :    
286 : jhr 3190 fun prefix re getc strm = scan (re, getc) (true, strm)
287 : jhr 2958
288 :     fun match [] = (fn getc => fn strm => NONE)
289 :     | match l = let
290 :     (* compile the REs *)
291 :     val l = List.map (fn (re, act) => (compile re, act)) l
292 :     fun match' getc strm = let
293 :     (* find the longest SOME *)
294 :     fun loop ([], max, _) = max
295 : jhr 3190 | loop ((re, act)::r, max, maxLen) = (case scan(re, getc) (true, strm)
296 : jhr 2958 of NONE => loop (r, max, maxLen)
297 :     | SOME(m as MatchTree.Match({len, ...}, _), cs) =>
298 :     if (len > maxLen)
299 :     then loop (r, SOME(m, act, cs), len)
300 :     else loop (r, max, maxLen)
301 :     (* end case *))
302 :     in
303 :     case loop (l, NONE, ~1)
304 :     of NONE => NONE
305 :     | SOME(m, act, cs) => SOME(act m, cs)
306 :     (* end case *)
307 :     end
308 :     in
309 :     match'
310 :     end
311 : jhr 3190
312 : jhr 2958 end

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0