Home My Page Projects Code Snippets Project Openings SML/NJ
Summary Activity Forums Tracker Lists Tasks Docs Surveys News SCM Files

SCM Repository

[smlnj] Annotation of /sml/trunk/src/compiler/MiscUtil/library/pickle-util.sml
ViewVC logotype

Annotation of /sml/trunk/src/compiler/MiscUtil/library/pickle-util.sml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 496 - (view) (download)

1 : monnier 496 (*
2 :     * This is the new "generic" pickle utility which replaces Andrew Appel's
3 :     * original "sharewrite" module. Aside from formal differences, this
4 :     * new module ended up not being any different from Andrew's. However,
5 :     * it ties in with its "unpickle" counterpart which is a *lot* better than
6 :     * its predecessor.
7 :     *
8 :     * Generated pickles tend to be a little bit smaller, which can
9 :     * probably be explained by the slightly more compact (in the common case,
10 :     * i.e. for small absolute values) integer representation.
11 :     *
12 :     * July 1999, Matthias Blume
13 :     *)
14 :    
15 :     (*
16 :     * By the way, there is no point in trying to internally use
17 :     * Word8Vector.vector instead of string for now.
18 :     * These strings participate in order comparisons (which makes
19 :     * Word8Vector.vector unsuitable). Moreover, conversion between
20 :     * string and Word8Vector.vector is currently just a cast, so it
21 :     * does not cost anything in the end.
22 :     *)
23 :     signature PICKLE_UTIL = sig
24 :    
25 :     type id
26 :    
27 :     (* Type info. Use a different number for each type constructor. *)
28 :     type tinfo = int (* negative numbers are reserved! *)
29 :    
30 :     type 'ahm pickle
31 :     type ('ahm, 'v) pickler = 'v -> 'ahm pickle
32 :    
33 :     (* Combining pickles into one. The resulting compound pickle will not
34 :     * automatically be subject to hash-consing. Wrap it with $ to get
35 :     * that effect. *)
36 :     val & : 'ahm pickle * 'ahm pickle -> 'ahm pickle
37 :    
38 :     (* $ produces the pickle for one case (constructor) of a datatype.
39 :     * The string must be one character long and the argument pickle
40 :     * should be the pickle for the constructor's arguments. If there
41 :     * are no arguments, then use % instead of $.
42 :     * Use the same tinfo for all constructors of the same datatype
43 :     * and different tinfos for constructors of different types.
44 :     *
45 :     * The latter is really only important if there are constructors
46 :     * of different type who have identical argument types and use the
47 :     * same $ identificaton string. In this case the pickler might
48 :     * identify two values of different types, and as a result the
49 :     * unpickler will be very unhappy.
50 :     *
51 :     * On the other hand, if you use different tinfos for the same type,
52 :     * then nothing terrible will happen. You might lose some sharing,
53 :     * though.
54 :     *
55 :     * The string argument could theoretically be more than one character
56 :     * long. In this case the corresponding unpickling function must
57 :     * be sure to get all those characters out of the input stream.
58 :     * We actually do exploit this "feature" internally. *)
59 :     val $ : tinfo -> string * 'ahm pickle -> 'ahm pickle
60 :     val % : tinfo -> string -> 'ahm pickle
61 :    
62 :     (* "ah_share" is used to specify potential for "ad-hoc" sharing
63 :     * using the user-supplied map. It is important that ah_share is
64 :     * applied to pickles constructed by $ or % but never to those
65 :     * constructed by &. Ad-hoc sharing is used to break structural
66 :     * cycles, to identify parts of the value that the hash-conser cannot
67 :     * automatically identify but which should be identified nevertheless,
68 :     * or to identify those parts that would be too expensive to be left
69 :     * to the hash-conser. *)
70 :     val ah_share : { find : 'ahm * 'v -> id option,
71 :     insert : 'ahm * 'v * id -> 'ahm } ->
72 :     ('ahm, 'v) pickler -> ('ahm, 'v) pickler
73 :    
74 :     (* generating pickles for values of some basic types *)
75 :     val w_bool : ('ahm, bool) pickler
76 :     val w_int : ('ahm, int) pickler
77 :     val w_word : ('ahm, word) pickler
78 :     val w_int32 : ('ahm, Int32.int) pickler
79 :     val w_word32 : ('ahm, Word32.word) pickler
80 :     val w_string : ('ahm, string) pickler
81 :    
82 :     (* generating pickles for some parameterized types (given a pickler
83 :     * for the parameter) *)
84 :     val w_list : ('ahm, 'a) pickler -> ('ahm, 'a list) pickler
85 :     val w_option : ('ahm, 'a) pickler -> ('ahm, 'a option) pickler
86 :    
87 :     (* this doesn't automatically identify (i.e., hash-cons) pairs *)
88 :     val w_pair :
89 :     ('ahm, 'a) pickler * ('ahm, 'b) pickler -> ('ahm, 'a * 'b) pickler
90 :    
91 :     (* Pickling a "lazy" value (i.e., a thunk); the thunk will be forced
92 :     * by the pickler. Unpickling is lazy again; but, of course, that
93 :     * laziness is unrelated to the laziness of the original value. *)
94 :     val w_lazy : ('ahm, 'a) pickler -> ('ahm, unit -> 'a) pickler
95 :    
96 :     (* run the pickle, i.e., turn it into a string *)
97 :     val pickle : 'ahm -> 'ahm pickle -> string
98 :    
99 :     (* The xxx_lifter stuff is here to allow picklers to be "patched
100 :     * together". If you already have a pickler that uses a sharing map
101 :     * of type B and you want to use it as part of a bigger pickler that
102 :     * uses a sharing map of type A, then you must write a (B, A) map_lifter
103 :     * which then lets you lift the existing pickler to one that uses
104 :     * type A maps instead of its own type B maps.
105 :     *
106 :     * The idea is that B maps are really part of A maps. They can be
107 :     * extracted for the duration of using the existing pickler. Then,
108 :     * when that pickler is done, we can patch the resulting new B map
109 :     * back into the original A map to obtain a new A map. *)
110 :     type ('b_ahm, 'a_ahm) map_lifter =
111 :     { extract: 'a_ahm -> 'b_ahm, patchback: 'a_ahm * 'b_ahm -> 'a_ahm }
112 :    
113 :     val lift_pickler: ('b_ahm, 'a_ahm) map_lifter ->
114 :     ('b_ahm, 'v) pickler -> ('a_ahm, 'v) pickler
115 :     end
116 :    
117 :     structure PickleUtil :> PICKLE_UTIL = struct
118 :    
119 :     type pos = int
120 :     type id = pos
121 :     type tinfo = int
122 :     type codes = id list
123 :    
124 :     structure HCM = RedBlackMapFn
125 :     (struct
126 :     type ord_key = string * tinfo * codes
127 :     fun compare ((c, t, l), (c', t', l')) = let
128 :     fun codesCmp ([], []) = EQUAL
129 :     | codesCmp (_ :: _, []) = GREATER
130 :     | codesCmp ([], _ :: _) = LESS
131 :     | codesCmp (h :: t, h' :: t') =
132 :     if h < h' then LESS else if h > h' then GREATER
133 :     else codesCmp (t, t')
134 :     in
135 :     if t < t' then LESS else if t > t' then GREATER
136 :     else case String.compare (c, c') of
137 :     EQUAL => codesCmp (l, l')
138 :     | unequal => unequal
139 :     end
140 :     end)
141 :    
142 :     datatype pre_result =
143 :     STRING of string
144 :     | CONCAT of pre_result * pre_result
145 :    
146 :     fun pre_size (STRING s) = size s
147 :     | pre_size (CONCAT (p, p')) = pre_size p + pre_size p'
148 :    
149 :     val backref = STRING "\255"
150 :     val size_backref = 1
151 :     val nullbytes = STRING ""
152 :    
153 :     type hcm = id HCM.map
154 :     type 'ahm state = hcm * 'ahm * pos
155 :    
156 :     type 'ahm pickle = 'ahm state -> codes * pre_result * 'ahm state
157 :     type ('ahm, 'v) pickler = 'v -> 'ahm pickle
158 :    
159 :     infix 3 $
160 :     infixr 4 &
161 :    
162 :     fun (f & g) state = let
163 :     val (fc, fpr, state') = f state
164 :     val (gc, gpr, state'') = g state'
165 :     in
166 :     (fc @ gc, CONCAT (fpr, gpr), state'')
167 :     end
168 :    
169 :     fun anyint_encode (n, negative) = let
170 :     (* this is essentially the same mechanism that's also used in
171 :     * TopLevel/batch/binfile.sml (maybe we should share it) *)
172 :     val // = LargeWord.div
173 :     val %% = LargeWord.mod
174 :     val !! = LargeWord.orb
175 :     infix // %% !!
176 :     val toW8 = Word8.fromLargeWord
177 :     fun r (0w0, l) = Word8Vector.fromList l
178 :     | r (n, l) =
179 :     r (n // 0w128, toW8 ((n %% 0w128) !! 0w128) :: l)
180 :     val lastDigit = n %% 0w64
181 :     val lastByte = if negative then lastDigit !! 0w64 else lastDigit
182 :     in
183 :     Byte.bytesToString (r (n // 0w64, [toW8 lastByte]))
184 :     end
185 :    
186 :     fun largeword_encode n = anyint_encode (n, false)
187 :     fun largeint_encode i =
188 :     if i >= 0 then anyint_encode (LargeWord.fromLargeInt i, false)
189 :     (* careful to do the negation in word domain... *)
190 :     else anyint_encode (0w0 - LargeWord.fromLargeInt i, true)
191 :    
192 :     val word32_encode = largeword_encode o Word32.toLargeWord
193 :     val word_encode = largeword_encode o Word.toLargeWord
194 :    
195 :     val int32_encode = largeint_encode o Int32.toLarge
196 :     val int_encode = largeint_encode o Int.toLarge
197 :    
198 :     fun % ti c (hcm, ahm, next) = let
199 :     val key = (c, ti, [])
200 :     in
201 :     case HCM.find (hcm, key) of
202 :     SOME i => ([i], STRING c, (hcm, ahm, next + size c))
203 :     | NONE => ([next], STRING c,
204 :     (HCM.insert (hcm, key, next), ahm, next + size c))
205 :     end
206 :    
207 :     fun dollar ti (c, p) (hcm, ahm, next) = let
208 :     val (codes, pr, (hcm', ahm', next')) = p (hcm, ahm, next + size c)
209 :     val key = (c, ti, codes)
210 :     in
211 :     case HCM.find (hcm, key) of
212 :     SOME i => let
213 :     val brnum = int_encode i
214 :     in
215 :     ([i], CONCAT (backref, STRING brnum),
216 :     (hcm, ahm, next + size_backref + size brnum))
217 :     end
218 :     | NONE =>
219 :     ([next], CONCAT (STRING c, pr),
220 :     (HCM.insert (hcm', key, next), ahm', next'))
221 :     end
222 :    
223 :     fun ah_share { find, insert } w v (hcm, ahm, next) =
224 :     case find (ahm, v) of
225 :     SOME i => let
226 :     val brnum = int_encode i
227 :     in
228 :     ([i], CONCAT (backref, STRING brnum),
229 :     (hcm, ahm, next + size_backref + size brnum))
230 :     end
231 :     | NONE => w v (hcm, insert (ahm, v, next), next)
232 :    
233 :     fun w_lazy w thunk (hcm, ahm, next) = let
234 :     val v = thunk ()
235 :     (* The larger the value of trialStart, the smaller the chance that
236 :     * the loop (see below) will run more than once. However, some
237 :     * space may be wasted. 2 sounds like a good compromise to me. *)
238 :     val trialStart = 2
239 :     (* This loop is ugly, but we don't expect it to run very often.
240 :     * It is needed because we must first write the length of the
241 :     * encoding of the thunk's value, but that encoding depends
242 :     * on the length (or rather: on the length of the length). *)
243 :     fun loop (nxt, ilen) = let
244 :     val (codes, pr, state) = w v (hcm, ahm, nxt)
245 :     val sz = pre_size pr
246 :     val ie = int_encode sz
247 :     val iesz = size ie
248 :     (* Padding in front is better because the unpickler can
249 :     * simply discard all leading 0s and does not need to know
250 :     * about the pickler's setting of "trialStart". *)
251 :     val null = STRING "\000"
252 :     fun pad (pr, n) =
253 :     if n = 0 then pr
254 :     else pad (CONCAT (null, pr), n - 1)
255 :     in
256 :     if ilen < iesz then loop (nxt + 1, ilen + 1)
257 :     else (codes, CONCAT (pad (STRING ie, ilen - iesz), pr), state)
258 :     end
259 :     in
260 :     loop (next + trialStart, trialStart)
261 :     end
262 :    
263 :     local
264 :     val I = ~1
265 :     val W = ~2
266 :     val I32 = ~3
267 :     val W32 = ~4
268 :     in
269 :     (* Even though the encoding could start with the
270 :     * backref character, we know that it isn't actually a backref
271 :     * because % suppresses back-references.
272 :     * Of course, this must be taken care of by unpickle-util! *)
273 :     fun w_int i = % I (int_encode i)
274 :     fun w_word w = % W (word_encode w)
275 :     fun w_int32 i32 = % I32 (int32_encode i32)
276 :     fun w_word32 w32 = % W32 (word32_encode w32)
277 :     end
278 :    
279 :     local
280 :     val L = ~5
281 :     fun chop5 l = let
282 :     fun ch (a :: b :: c :: d :: e :: r, cl) =
283 :     ch (r, (e, d, c, b, a) :: cl)
284 :     | ch (r, cl) = (rev r, cl)
285 :     in
286 :     ch (rev l, [])
287 :     end
288 :     in
289 :     fun w_list w l = let
290 :     val op $ = dollar L
291 :     fun wc [] = % L "N"
292 :     | wc ((a, b, c, d, e) :: r) =
293 :     "C" $ w a & w b & w c & w d & w e & wc r
294 :     in
295 :     case chop5 l of
296 :     ([], []) => % L "0"
297 :     | ([a], []) => "1" $ w a
298 :     | ([a, b], []) => "2" $ w a & w b
299 :     | ([a, b, c], []) => "3" $ w a & w b & w c
300 :     | ([a, b, c, d], []) => "4" $ w a & w b & w c & w d
301 :     | ([], r) => "5" $ wc r
302 :     | ([a], r) => "6" $ w a & wc r
303 :     | ([a, b], r) => "7" $ w a & w b & wc r
304 :     | ([a, b, c], r) => "8" $ w a & w b & w c & wc r
305 :     | ([a, b, c, d], r) => "9" $ w a & w b & w c & w d & wc r
306 :     | _ => raise Fail "PickleUtil.w_list: impossible chop"
307 :     end
308 :     end
309 :    
310 :     local
311 :     val O = ~6
312 :     in
313 :     fun w_option arg = let
314 :     val op $ = dollar O
315 :     fun wo w NONE = % O "n"
316 :     | wo w (SOME i) = "s" $ w i
317 :     in
318 :     wo arg
319 :     end
320 :     end
321 :    
322 :     fun w_pair (wa, wb) (a, b) = wa a & wb b
323 :    
324 :     local
325 :     val S = ~7
326 :     in
327 :     fun w_string s = let
328 :     val op $ = dollar S
329 :     (* The dummy_pickle is a hack to get strings to be identified
330 :     * automatically. They don't have "natural" children, so normally
331 :     * % would suppress the backref. The dummy pickle produces no
332 :     * codes and no output, but it is there to make $ believe that
333 :     * there are children. *)
334 :     fun dummy_pickle state = ([], nullbytes, state)
335 :     fun esc #"\\" = "\\\\"
336 :     | esc #"\"" = "\\\""
337 :     | esc #"\255" = "\\\255" (* need to escape backref char *)
338 :     | esc c = String.str c
339 :     in
340 :     (String.translate esc s ^ "\"") $ dummy_pickle
341 :     end
342 :     end
343 :    
344 :     local
345 :     val B = ~8
346 :     in
347 :     fun w_bool true = % B "t"
348 :     | w_bool false = % B "f"
349 :     end
350 :    
351 :     local
352 :     fun pr2s pr = let
353 :     fun flat (STRING s, l) = s :: l
354 :     | flat (CONCAT (x, STRING s), l) = flat (x, s :: l)
355 :     | flat (CONCAT (x, CONCAT (y, z)), l) =
356 :     flat (CONCAT (CONCAT (x, y), z), l)
357 :     in
358 :     concat (flat (pr, []))
359 :     end
360 :     in
361 :     fun pickle emptyMap p = let
362 :     val (_, pr, _) = p (HCM.empty, emptyMap, 0)
363 :     in
364 :     pr2s pr
365 :     end
366 :     end
367 :    
368 :     type ('b_ahm, 'a_ahm) map_lifter =
369 :     { extract: 'a_ahm -> 'b_ahm, patchback: 'a_ahm * 'b_ahm -> 'a_ahm }
370 :    
371 :     fun lift_pickler { extract, patchback } wb b (hcm, a_ahm, next) = let
372 :     val b_ahm = extract a_ahm
373 :     val (codes, pr, (hcm', b_ahm', next')) = wb b (hcm, b_ahm, next)
374 :     val a_ahm' = patchback (a_ahm, b_ahm')
375 :     in
376 :     (codes, pr, (hcm', a_ahm', next'))
377 :     end
378 :    
379 :     (* for export *)
380 :     nonfix $
381 :     val $ = dollar
382 :     end

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0