Home My Page Projects Code Snippets Project Openings SML/NJ
Summary Activity Forums Tracker Lists Tasks Docs Surveys News SCM Files

SCM Repository

[smlnj] Annotation of /smlnj-lib/trunk/RegExp/FrontEnd/awk-syntax.sml
ViewVC logotype

Annotation of /smlnj-lib/trunk/RegExp/FrontEnd/awk-syntax.sml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3279 - (view) (download)

1 : monnier 104 (* awk-syntax.sml
2 :     *
3 : jhr 3079 * COPYRIGHT (c) 2008 The Fellowship of SML/NJ (http://www.smlnj.org)
4 :     * All rights reserved.
5 : monnier 104 *
6 :     * This module implements the AWK syntax for regular expressions. The
7 :     * syntax is defined on pp. 28-30 of "The AWK Programming Language,"
8 : jhr 3279 * by Aho, Kernighan and Weinberger. We have extended it with interval
9 :     * syntax, whcih was added as part of the POSIX standard.
10 : monnier 104 *
11 :     * The meta characters are:
12 :     * "\" "^" "$" "." "[" "]" "|" "(" ")" "*" "+" "?"
13 :     * Atomic REs:
14 :     * c matches the character c (for non-metacharacters c)
15 :     * "^" matches the empty string at the beginning of a line
16 :     * "$" matches the empty string at the end of a line
17 :     * "." matches any single character (except \000 and \n)
18 :     *
19 :     * Escape sequences:
20 :     * "\b" matches backspace
21 :     * "\f" matches formfeed
22 :     * "\n" matches newline (linefeed)
23 :     * "\r" matches carriage return
24 :     * "\t" matches tab
25 :     * "\"ddd matches the character with octal code ddd.
26 :     * "\"c matches the character c (e.g., \\ for \, \" for ")
27 :     * "\x"dd matches the character with hex code dd.
28 :     *
29 :     * Character classes:
30 : jhr 3279 * [...] matches any character in "..."
31 :     * [^...] a complemented character list, which matches any character not
32 :     * in the list "..."
33 : monnier 104 *
34 : jhr 3279 * Compound regular expressions, where A and B are REs:
35 :     * A|B matches A or B
36 : monnier 104 * AB matches A followed by B
37 : jhr 3279 * A? matches zero or one As
38 :     * A* matches zero or more As
39 :     * A+ matches one or more As
40 :     * A{n} matches n copies of A
41 :     * A{n,} matches n copies of A
42 :     * A{n,m} matches n copies of A
43 :     * (A) matches A
44 : monnier 104 *)
45 :    
46 :     structure AwkSyntax : REGEXP_PARSER =
47 :     struct
48 :    
49 :     structure R = RegExpSyntax
50 :    
51 :     structure SC = StringCvt
52 :     structure W8 = Word8
53 :     structure C = Char
54 :    
55 :     val isMeta = C.contains "\\^$.[]|()*+?"
56 :    
57 :     exception Error
58 :    
59 :     val dotMatch = R.NonmatchSet (R.CharSet.addList (R.CharSet.empty,explode "\000\n"))
60 :    
61 :     fun scan getc cs = let
62 :     fun getc' cs = (case (getc cs)
63 : jhr 3279 of NONE => raise Error
64 :     | (SOME arg) => arg)
65 :     (* end case *)
66 : monnier 104 fun isOctDigit c = (#"0" <= c) andalso (c <= #"7")
67 : jhr 3279 fun returnVal (v, cl, cs) = let
68 :     val (n, _) = valOf (Int.scan v List.getItem cl)
69 :     in
70 : monnier 104 (C.chr n, cs) handle _ => raise Error
71 :     (* SC.scanString (Int.scan SC.OCT) (implode [c1,c2,c3]) *)
72 : jhr 3279 end
73 :     fun getDecimal cs = let
74 :     fun lp (cs, digits) = (case getc cs
75 :     of NONE => (cs, List.rev digits)
76 :     | SOME(c, cs') =>
77 :     if (C.isDigit c) then lp(cs', c::digits)
78 :     else (cs, List.rev digits)
79 :     (* end case *))
80 :     in
81 :     case lp (cs, [])
82 :     of (_, []) => raise Error (* no digits *)
83 :     | (cs, digits) => let
84 :     val SOME(n, _) = (Int.scan SC.DEC List.getItem digits)
85 :     in
86 :     (n, cs)
87 :     end
88 :     (* end case *)
89 :     end
90 : monnier 104 fun getHexChar (c,cs) = (case (getc cs)
91 :     of NONE => returnVal (SC.HEX,[c],cs)
92 : jhr 3279 | SOME (c',cs') =>
93 :     if not (C.isHexDigit c') then returnVal (SC.HEX,[c],cs)
94 :     else returnVal (SC.HEX,[c,c'],cs')
95 :     (* end case *))
96 : monnier 104 fun getOctalChar (c,cs) = (case (getc cs)
97 :     of NONE => returnVal (SC.OCT,[c],cs)
98 : jhr 3279 | SOME(c',cs') =>
99 : monnier 104 if not (isOctDigit c') then returnVal (SC.OCT,[c],cs)
100 :     else (case (getc cs')
101 :     of NONE => returnVal (SC.OCT,[c,c'],cs')
102 :     | SOME (c'',cs'') =>
103 :     if not (isOctDigit c'') then returnVal (SC.OCT,[c,c'],cs')
104 :     else returnVal (SC.OCT,[c,c',c''],cs'')))
105 :     fun getEscapeChar cs = (case (getc' cs)
106 :     of (#"b", cs) => (#"\008", cs)
107 :     | (#"f", cs) => (#"\012", cs)
108 :     | (#"n", cs) => (#"\n", cs)
109 :     | (#"r", cs) => (#"\013", cs)
110 :     | (#"t", cs) => (#"\t", cs)
111 :     | (#"x", cs) => let val (c1,cs) = getc' cs
112 :     in
113 :     if (C.isHexDigit c1) then getHexChar (c1,cs) else raise Error
114 :     end
115 :     | (c1, cs) =>
116 :     if (isOctDigit c1) then getOctalChar (c1,cs) else (c1, cs))
117 :     fun scanAlt (stk, cs) = let
118 :     val (re, cs') = scanSeq([], cs)
119 :     in
120 :     case (stk, getc cs')
121 :     of ([], NONE) => (re, cs')
122 :     | (_, SOME(#"|", cs'')) => scanAlt(re::stk, cs'')
123 :     | _ => (R.Alt(rev(re::stk)), cs')
124 :     (* end case *)
125 :     end
126 :     and scanSeq (stk, cs) = let
127 :     fun continue (re, cs') = scanSeq(re::stk, cs')
128 :     fun done () = (R.Concat(rev stk), cs)
129 :     in
130 :     case (stk, getc cs)
131 :     of ([], NONE) => raise Error
132 :     | ([re], NONE) => (re, cs)
133 :     | (_, NONE) => done()
134 : jhr 3279 | (re::r, SOME(#"{", cs')) => let
135 :     val (n, m, cs'') = scanInterval cs'
136 :     in
137 :     scanSeq (R.Interval(re, n, m)::r, cs'')
138 :     end
139 : monnier 104 | (re::r, SOME(#"?", cs')) => scanSeq (R.Option re :: r, cs')
140 :     | (re::r, SOME(#"*", cs')) => scanSeq (R.Star re :: r, cs')
141 :     | (re::r, SOME(#"+", cs')) => scanSeq (R.Plus re :: r, cs')
142 :     | (_, SOME(#"|", _)) => done()
143 :     | (_, SOME(#")", _)) => done()
144 :     | (_, SOME(#"(", cs')) => continue(scanGrp cs')
145 :     | (_, SOME(#".", cs')) => continue(dotMatch, cs')
146 :     | (_, SOME(#"^", cs')) => continue(R.Begin, cs')
147 : jhr 3079 | (_, SOME(#"$",cs')) => continue(R.End, cs')
148 : monnier 104 | (_, SOME(#"[", cs')) => continue(scanClass cs')
149 :     | (_, SOME(#"\\", cs')) => continue(scanEscape cs')
150 :     | (_, SOME(c, cs')) => if (isMeta c)
151 :     then raise Error
152 :     else scanSeq((R.Char c)::stk, cs')
153 :     (* end case *)
154 :     end
155 : jhr 3279 (* scan the tail of "{n}", "{n,}", or "{n,m"}", where the leading "{" has already
156 :     * been scanned.
157 :     *)
158 :     and scanInterval cs = let
159 :     val (n, cs) = getDecimal cs
160 :     in
161 :     case getc cs
162 :     of SOME(#",", cs) => (case getc cs
163 :     of SOME(#"}", cs) => (n, NONE, cs)
164 :     | _ => let
165 :     val (m, cs) = getDecimal cs
166 :     in
167 :     case getc cs
168 :     of SOME(#"}", cs) => (n, SOME m, cs)
169 :     | _ => raise Error
170 :     (* end case *)
171 :     end
172 :     (* end case *))
173 :     | SOME(#"}", cs) => (n, SOME n, cs)
174 :     | _ => raise Error
175 :     (* end case *)
176 :     end
177 : monnier 104 and scanGrp cs = let
178 :     val (re, cs') = scanAlt ([], cs)
179 :     in
180 :     case (getc' cs')
181 :     of (#")", cs'') => (R.Group re, cs'')
182 :     | _ => raise Error
183 :     (* end case *)
184 :     end
185 :     and scanClass cs = let
186 :     fun scanClass' cs = let
187 :     fun scanRange1 (set, cs) = (case (getc' cs)
188 :     of (#"]", cs) => (set,cs)
189 :     | (#"\\", cs) => let
190 :     val (c, cs) = getEscapeChar cs
191 :     in
192 :     scanRange2 (set, c, cs)
193 :     end
194 :     | (c, cs) => scanRange2 (set, c, cs)
195 :     (* end case *))
196 :     and scanRange2 (set, c, cs) = (case (getc' cs)
197 :     of (#"]", cs) => (R.CharSet.add(set, c), cs)
198 :     | (#"\\", cs) => let
199 :     val (c', cs) = getEscapeChar cs
200 :     in
201 :     scanRange2 (R.CharSet.add(set, c), c', cs)
202 :     end
203 :     | (#"-", cs) => scanRange3 (set, c, cs)
204 :     | (c', cs) =>
205 :     scanRange2 (R.CharSet.add(set, c), c', cs)
206 :     (* end case *))
207 :     and scanRange3 (set, minC, cs) = (case (getc' cs)
208 :     of (#"]", cs) =>
209 :     (R.CharSet.add(R.CharSet.add(set, minC), #"-"), cs)
210 :     | (#"\\", cs) => let
211 :     val (c, cs) = getEscapeChar cs
212 :     in
213 :     chkRange(set, minC, c, cs)
214 :     end
215 :     | (c, cs) => chkRange(set, minC, c, cs)
216 :     (* end case *))
217 :     and chkRange (set, minC, maxC, cs) =
218 :     if (minC > maxC)
219 :     then scanRange1 (set,cs ) (*raise Error *) (* as per bwk test suite *)
220 :     else scanRange1 (R.addRange (set,minC,maxC),cs) (*R.CharSet.addList (set,List.tabulate (ord(maxC)-ord(minC)+1,fn v => chr (v+ord(minC)))), cs) *)
221 :     in
222 :     case (getc' cs)
223 :     of (#"-", cs) =>
224 :     scanRange1(R.CharSet.add(R.CharSet.empty, #"-"), cs)
225 :     | (#"]", cs) =>
226 :     scanRange2(R.CharSet.empty, #"]", cs) (* as per bwk test suite *)
227 :     | _ => scanRange1(R.CharSet.empty, cs)
228 :     (* end case *)
229 :     end
230 :     in
231 :     case (getc' cs)
232 :     of (#"^", cs) => let
233 :     val (set, cs) = scanClass' cs
234 :     in
235 :     (R.NonmatchSet set, cs)
236 :     end
237 :     | _ => let
238 :     val (set, cs) = scanClass' cs
239 :     in
240 :     (R.MatchSet set, cs)
241 :     end
242 :     (* end case *)
243 :     end
244 :     and scanEscape cs = let val (c, cs) = getEscapeChar cs
245 :     in
246 :     (R.Char c, cs)
247 :     end
248 :     in
249 :     SOME(scanAlt([], cs)) handle Error => NONE
250 :     end
251 :    
252 :    
253 :     end (* AWK_RE_Syntax *)

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0