Home My Page Projects Code Snippets Project Openings SML/NJ
Summary Activity Forums Tracker Lists Tasks Docs Surveys News SCM Files

SCM Repository

[smlnj] View of /smlnj-lib/trunk/RegExp/FrontEnd/awk-syntax.sml
ViewVC logotype

View of /smlnj-lib/trunk/RegExp/FrontEnd/awk-syntax.sml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3279 - (download) (annotate)
Sun Mar 15 14:45:21 2009 UTC (10 years, 6 months ago) by jhr
File size: 7929 byte(s)
  Added support for intervals to AWK RE syntax.
(* awk-syntax.sml
 *
 * COPYRIGHT (c) 2008 The Fellowship of SML/NJ (http://www.smlnj.org)
 * All rights reserved.
 *
 * This module implements the AWK syntax for regular expressions.  The
 * syntax is defined on pp. 28-30 of "The AWK Programming Language,"
 * by Aho, Kernighan and Weinberger.  We have extended it with interval
 * syntax, whcih was added as part of the POSIX standard.
 *
 * The meta characters are:
 *	"\" "^" "$" "." "[" "]" "|" "(" ")" "*" "+" "?"
 *    Atomic REs:
 *      c	matches the character c (for non-metacharacters c)
 *      "^"	matches the empty string at the beginning of a line
 *	"$"	matches the empty string at the end of a line
 *      "."	matches any single character (except \000 and \n)
 *
 *    Escape sequences:
 *	"\b"	matches backspace
 *	"\f"	matches formfeed
 *	"\n"	matches newline (linefeed)
 *	"\r"	matches carriage return
 *	"\t"	matches tab
 *	"\"ddd	matches the character with octal code ddd.
 *	"\"c	matches the character c (e.g., \\ for \, \" for ")
 *      "\x"dd  matches the character with hex code dd.
 *
 *    Character classes:
 *	[...]	matches any character in "..."
 *	[^...]	a complemented character list, which matches any character not
 *		in the list "..."
 *
 *    Compound regular expressions, where A and B are REs:
 *	A|B	matches A or B
 *	AB	matches A followed by B
 *	A?	matches zero or one As
 *	A*	matches zero or more As
 *	A+	matches one or more As
 *	A{n}	matches n copies of A
 *	A{n,}	matches n copies of A
 *	A{n,m}	matches n copies of A
 *	(A)	matches A
 *)

structure AwkSyntax : REGEXP_PARSER =
  struct

    structure R = RegExpSyntax

    structure SC = StringCvt
    structure W8 = Word8
    structure C = Char

    val isMeta = C.contains "\\^$.[]|()*+?"

    exception Error

    val dotMatch = R.NonmatchSet (R.CharSet.addList (R.CharSet.empty,explode "\000\n"))

    fun scan getc cs = let
	  fun getc' cs = (case (getc cs)
		 of NONE => raise Error
		  | (SOME arg) => arg)
		(* end case *)
	  fun isOctDigit c = (#"0" <= c) andalso (c <= #"7")
	  fun returnVal (v, cl, cs) = let
		val (n, _) = valOf (Int.scan v List.getItem cl)
		in
		  (C.chr n, cs) handle _ => raise Error
	      (* SC.scanString (Int.scan SC.OCT) (implode [c1,c2,c3]) *)
		end
	  fun getDecimal cs = let
		fun lp (cs, digits) = (case getc cs
		       of NONE => (cs, List.rev digits)
			| SOME(c, cs') =>
			    if (C.isDigit c) then lp(cs', c::digits)
			    else (cs, List.rev digits)
		      (* end case *))
		in
		  case lp (cs, [])
		   of (_, []) => raise Error (* no digits *)
		    | (cs, digits) => let
			val SOME(n, _) = (Int.scan SC.DEC List.getItem digits)
			in
			  (n, cs)
			end
		  (* end case *)
		end
	  fun getHexChar (c,cs) = (case (getc cs)
                 of NONE => returnVal (SC.HEX,[c],cs)
		  | SOME (c',cs') => 
		     if not (C.isHexDigit c') then returnVal (SC.HEX,[c],cs)
		     else returnVal (SC.HEX,[c,c'],cs')
		(* end case *))
	  fun getOctalChar (c,cs) = (case (getc cs)
		 of NONE => returnVal (SC.OCT,[c],cs)
	          | SOME(c',cs') => 
		      if not (isOctDigit c') then returnVal (SC.OCT,[c],cs)
		      else (case (getc cs')
			of NONE => returnVal (SC.OCT,[c,c'],cs')
		         | SOME (c'',cs'') => 
			     if not (isOctDigit c'') then returnVal (SC.OCT,[c,c'],cs')
			     else returnVal (SC.OCT,[c,c',c''],cs'')))
	  fun getEscapeChar cs = (case (getc' cs)
		 of (#"b", cs) => (#"\008", cs)
		  | (#"f", cs) => (#"\012", cs)
		  | (#"n", cs) => (#"\n", cs)
		  | (#"r", cs) => (#"\013", cs)
		  | (#"t", cs) => (#"\t", cs)
		  | (#"x", cs) => let val (c1,cs) = getc' cs
				  in
				      if (C.isHexDigit c1) then getHexChar (c1,cs) else raise Error
				  end
		  | (c1, cs) =>
       		      if (isOctDigit c1) then getOctalChar (c1,cs) else (c1, cs))
	  fun scanAlt (stk, cs) = let
		val (re, cs') = scanSeq([], cs)
		in
		  case (stk, getc cs')
		   of ([], NONE) => (re, cs')
		    | (_, SOME(#"|", cs'')) => scanAlt(re::stk, cs'')
		    | _ => (R.Alt(rev(re::stk)), cs')
		  (* end case *)
		end
	  and scanSeq (stk, cs) = let
		fun continue (re, cs') = scanSeq(re::stk, cs')
		fun done () = (R.Concat(rev stk), cs)
		in
		  case (stk, getc cs)
		   of ([], NONE) => raise Error
		    | ([re], NONE) => (re, cs)
		    | (_, NONE) => done()
		    | (re::r, SOME(#"{", cs')) => let
			val (n, m, cs'') = scanInterval cs'
			in
			  scanSeq (R.Interval(re, n, m)::r, cs'')
			end
		    | (re::r, SOME(#"?", cs')) => scanSeq (R.Option re :: r, cs')
		    | (re::r, SOME(#"*", cs')) => scanSeq (R.Star re :: r, cs')
		    | (re::r, SOME(#"+", cs')) => scanSeq (R.Plus re :: r, cs')
		    | (_, SOME(#"|", _)) => done()
		    | (_, SOME(#")", _)) => done()
		    | (_, SOME(#"(", cs')) => continue(scanGrp cs')
		    | (_, SOME(#".", cs')) => continue(dotMatch, cs')
		    | (_, SOME(#"^", cs')) => continue(R.Begin, cs')
		    | (_, SOME(#"$",cs')) => continue(R.End, cs')
		    | (_, SOME(#"[", cs')) => continue(scanClass cs')
		    | (_, SOME(#"\\", cs')) => continue(scanEscape cs')
		    | (_, SOME(c, cs')) => if (isMeta c)
			then raise Error
			else scanSeq((R.Char c)::stk, cs')
		  (* end case *)
		end
	(* scan the tail of "{n}", "{n,}", or "{n,m"}", where the leading "{" has already
	 * been scanned.
	 *)
	  and scanInterval cs = let
		val (n, cs) = getDecimal cs
		in
		  case getc cs
		   of SOME(#",", cs) => (case getc cs
			 of SOME(#"}", cs) => (n, NONE, cs)
			  | _ => let
			      val (m, cs) = getDecimal cs
			      in
				case getc cs
				 of SOME(#"}", cs) => (n, SOME m, cs)
				  | _ => raise Error
				(* end case *)
			      end
			(* end case *))
		    | SOME(#"}", cs) => (n, SOME n, cs)
		    | _ => raise Error
		  (* end case *)
		end
	  and scanGrp cs = let
		val (re, cs') = scanAlt ([], cs)
		in
		  case (getc' cs')
		   of (#")", cs'') => (R.Group re, cs'')
		    | _ => raise Error
		  (* end case *)
		end
	  and scanClass cs = let
		fun scanClass' cs = let
		      fun scanRange1 (set, cs) = (case (getc' cs)
			     of (#"]", cs) => (set,cs)
			      | (#"\\", cs) => let
				  val (c, cs) = getEscapeChar cs
				  in
				    scanRange2 (set, c, cs)
				  end
			      | (c, cs) => scanRange2 (set, c, cs)
			    (* end case *))
		      and scanRange2 (set, c, cs) = (case (getc' cs)
			     of (#"]", cs) => (R.CharSet.add(set, c), cs)
			      | (#"\\", cs) => let
				  val (c', cs) = getEscapeChar cs
				  in
				    scanRange2 (R.CharSet.add(set, c), c', cs)
				  end
			      | (#"-", cs) => scanRange3 (set, c, cs)
			      | (c', cs) =>
				  scanRange2 (R.CharSet.add(set, c), c', cs)
			    (* end case *))
		      and scanRange3 (set, minC, cs) = (case (getc' cs)
			     of (#"]", cs) =>
				  (R.CharSet.add(R.CharSet.add(set, minC), #"-"), cs)
			      | (#"\\", cs) => let
				  val (c, cs) = getEscapeChar cs
				  in
				    chkRange(set, minC, c, cs)
				  end
			      | (c, cs) => chkRange(set, minC, c, cs)
			    (* end case *))
		      and chkRange (set, minC, maxC, cs) =
			    if (minC > maxC)
			      then scanRange1 (set,cs )  (*raise Error *) (* as per bwk test suite *)
			      else scanRange1 (R.addRange (set,minC,maxC),cs) (*R.CharSet.addList (set,List.tabulate (ord(maxC)-ord(minC)+1,fn v => chr (v+ord(minC)))), cs) *)
		      in
			case (getc' cs)
			 of (#"-", cs) =>
			      scanRange1(R.CharSet.add(R.CharSet.empty, #"-"), cs)
		          | (#"]", cs) => 
			      scanRange2(R.CharSet.empty, #"]", cs)  (* as per bwk test suite *)
			  | _ => scanRange1(R.CharSet.empty, cs)
			(* end case *)
		      end
		in
		  case (getc' cs)
		   of (#"^", cs) => let
			val (set, cs) = scanClass' cs
			in
			  (R.NonmatchSet set, cs)
			end
		    | _ => let
			val (set, cs) = scanClass' cs
			in
			  (R.MatchSet set, cs)
			end
		  (* end case *)
		end
	  and scanEscape cs = let val (c, cs) = getEscapeChar cs
		in
		  (R.Char c, cs)
		end
	  in
	    SOME(scanAlt([], cs)) handle Error => NONE
	  end


  end (* AWK_RE_Syntax *)

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0