Home My Page Projects Code Snippets Project Openings SML/NJ
 Summary Activity Forums Tracker Lists Tasks Docs Surveys News SCM Files

# SCM Repository

[smlnj] View of /smlnj-lib/trunk/RegExp/BackEnd/thompson-engine.sml
 [smlnj] / smlnj-lib / trunk / RegExp / BackEnd / thompson-engine.sml

# View of /smlnj-lib/trunk/RegExp/BackEnd/thompson-engine.sml

Mon May 5 16:41:33 2008 UTC (12 years ago) by jhr
File size: 8640 byte(s)
```  Started to implement unit tests for RE matching.
```
```(* thompson-engine.sml
*
* COPYRIGHT (c) 2008 The Fellowship of SML/NJ (http://www.smlnj.org)
*
* This is an implementation of Ken Thompson's RE matchine algorithm from
* CACM (1968).  It is based on the description of the algorithm by Russ
* Cox at http://swtch.com/~rsc/regexp/regexp1.html.
*)

structure ThompsonEngine : REGEXP_ENGINE =
struct

structure RE = RegExpSyntax
structure CSet = RE.CharSet
structure M = MatchTree

(* a match specifies the position (as a stream) and the length of the match *)
type 'a match = {pos : 'a, len : int} MatchTree.match_tree

(* intermediate representation of states *)
datatype state_kind
= CHR' of (char * state' ref)
| CSET' of (CSet.set * state' ref)
| NCSET' of (CSet.set * state' ref)
| SPLIT' of (state' ref * state' ref)
| BOL' of state' ref			(* assert beginning of line *)
| EOL' of state' ref			(* assert end of line *)
| FINAL'

withtype state' = {id : int, kind : state_kind}

type frag = {start : state', out : state' ref list}

(* return the ID of a state *)
fun idOf {id, kind} = id

val final = {id = 0, kind = FINAL'}

(* interpreter representation of states *)
datatype state
= CHR of (char * int)
| CSET of (CSet.set * int)
| NCSET of (CSet.set * int)
| SPLIT of (int * int)
| BOL of int			(* assert beginning of line *)
| EOL of int			(* assert end of line *)
| FINAL

fun cvtState {id, kind} = (case kind
of CHR'(c, out) => CHR(c, idOf(!out))
| CSET'(cset, out) => CSET(cset, idOf(!out))
| NCSET'(cset, out) => NCSET(cset, idOf(!out))
| SPLIT'(out1, out2) => SPLIT(idOf(!out1), idOf(!out2))
| BOL' out => BOL(idOf(!out))
| EOL' out => EOL(idOf(!out))
| FINAL' => FINAL
(* end case *))

datatype regexp = RE of {start : int, states : state vector}

fun compile re = let
(* the list of states; state 0 is always the accepting state *)
val nStates = ref 1
val states = ref [final]
(* create new states *)
fun new kind = let
val id = !nStates
val s = {id = id, kind = kind}
in
states := s :: !states;
nStates := id+1;
s
end
fun newChr (c, out) = new (CHR'(c, out))
fun newCset (cset, out) = new (CSET'(cset, out))
fun newNcset (cset, out) = new (NCSET'(cset, out))
fun newSplit (out1, out2) = new (SPLIT'(out1, out2))
fun newBOL out = new (BOL' out)
fun newEOL out = new (EOL' out)
(* update the outputs of a fragment *)
fun setOuts (f : frag, s : state') = List.app (fn r => r := s) (#out f)
(* compile an RE *)
fun reComp re = (case re
of RE.Group re => reComp re
| RE.Alt[] => raise Fail "empty alternative"
| RE.Alt[re] => reComp re
| RE.Alt(re::rest) =>  let
val f1 = reComp re
val f2 = reComp (RE.Alt rest)
val s = newSplit(ref(#start f1), ref(#start f2))
in
{start = s, out = #out f1 @ #out f2}
end
| RE.Concat[] => raise Fail "empty concatenation"
| RE.Concat[re] => reComp re
| RE.Concat(re::rest) => let
val f1 = reComp re
val f2 = reComp(RE.Concat rest)
in
setOuts (f1, #start f2);
{start = #start f1, out = #out f2}
end
| RE.Interval(re, 0, SOME 1) => option re
| RE.Interval(re, 0, NONE) => closure re
| RE.Interval(re, 1, NONE) => posClosure re
| RE.Interval(re, i, optn) => raise Fail "Interval"
| RE.Option re => option re
| RE.Star re => closure re
| RE.Plus re => posClosure re
| RE.MatchSet cset => let
val out = ref final
in
{start = newCset(cset, out), out = [out]}
end
| RE.NonmatchSet cset => let
val out = ref final
in
{start = newNcset(cset, out), out = [out]}
end
| RE.Char c => let
val out = ref final
in
{start = newChr(c, out), out = [out]}
end
| RE.Begin => let
val out = ref final
in
{start = newBOL out, out = [out]}
end
| RE.End => raise Fail "End"
(* end case *))
(* compile re? *)
and option re = let
val f = reComp re
val out = ref final
val s = newSplit(ref(#start f), out)
in
{start = s, out = out :: #out f}
end
(* compile re* *)
and closure re = let
val f = reComp re
val out = ref final
val s = newSplit(ref(#start f), out)
in
setOuts (f, s);
{start = s, out = [out]}
end
(* compile re+ *)
and posClosure re = let
val f = reComp re
val out = ref final
val s = newSplit(ref(#start f), out)
in
setOuts (f, s);
{start = #start f, out = [out]}
end
(* generate the intermediate state representation *)
val frag = reComp re
val _ = setOuts (frag, final)
(* convert the states to the final representation; note that we reverse the list
* so that the states are now in increasing order.
*)
val states = List.foldl (fn (s, l) => cvtState s :: l) [] (!states)
in
RE{ start = idOf(#start frag), states = Vector.fromList states }
end

(* scan the stream for the first occurence of the regular expression *)
fun scan (RE{start, states}, getc : (char,'a) StringCvt.reader, isFirst, strm : 'a) = let
(* to make elimination of duplicates in a state set cheap, we map state IDs
* to a stamp of the last set that they were added to.
*)
val stamp = ref 0w1
val lastStamp = Array.array(Vector.length states, 0w0)
fun addState (stamp', stateList, id) =
if (Array.sub(lastStamp, id) = stamp')
then stateList
else (
Array.update(lastStamp, id, stamp');
case Vector.sub(states, id)
of SPLIT(out1, out2) =>
| state => state :: stateList
(* end case *))
fun startState () = addState(!stamp, [], start)
fun isMatch stamp' = (Array.sub(lastStamp, 0) = stamp')
fun find' startPos = let
fun scan (_, _, lastAccepting, []) = lastAccepting
| scan (n, strm, lastAccepting, nfaState) = (case getc strm
of NONE => if isMatch (!stamp)
then SOME(n, startPos)
else lastAccepting
| SOME(c, strm') => let
val stamp' = !stamp
val _ = (stamp := stamp' + 0w1)
fun test ([], nextStates) = nextStates
| test (s::r, nextStates) = (case s
of CHR(c', out) => if (c = c')
else nextStates
| CSET(cset, out) => if CSet.member(cset, c)
else nextStates
| NCSET(cset, out) => if CSet.member(cset, c)
then nextStates
| BOL out => if isFirst
else nextStates
| EOL out => raise Fail "end-of-line not supported yet"
| _ => nextStates
(* end case *))
val nextNfaState = test (nfaState, [])
val lastAccepting = if isMatch stamp'
then SOME(n+1, startPos)
else lastAccepting
in
scan (n+1, strm', lastAccepting, nextNfaState)
end
(* end case *))
in
case scan (0, startPos, NONE, startState())
of NONE => (case getc startPos
of SOME(_, strm) => find' strm
| NONE => NONE
(* end case *))
| SOME(n, strm) => SOME(M.Match({pos=startPos, len=n}, []), strm)
(* end case *)
end
in
find' strm
end

fun find re getc stream = let
(* FIXME: we need to reset isFirst on a newline; we also need to add an isLast
* flag to scan to support the EOL assertion.
*)
fun loop (isFirst, s) = (case (scan (re, getc, isFirst, s))
of NONE => (case (getc s)
of SOME(_, s') => loop (false, s')
| NONE => NONE
(* end case *))
| SOME v => SOME v
(* end case *))
in
loop (true, stream)
end

fun prefix re getc strm = scan (re, getc, true, strm)

fun match [] = (fn getc => fn strm => NONE)
| match l = let
(* compile the REs *)
val l = List.map (fn (re, act) => (compile re, act)) l
fun match' getc strm = let
(* find the longest SOME *)
fun loop ([], max, _) = max
| loop ((re, act)::r, max, maxLen) = (case scan(re, getc, true, strm)
of NONE => loop (r, max, maxLen)
| SOME(m as MatchTree.Match({len, ...}, _), cs) =>
if (len > maxLen)
then loop (r, SOME(m, act, cs), len)
else loop (r, max, maxLen)
(* end case *))
in
case loop (l, NONE, ~1)
of NONE => NONE
| SOME(m, act, cs) => SOME(act m, cs)
(* end case *)
end
in
match'
end
end
```