Home My Page Projects Code Snippets Project Openings SML/NJ
Summary Activity Forums Tracker Lists Tasks Docs Surveys News SCM Files

SCM Repository

[smlnj] Annotation of /sml/trunk/src/compiler/MiscUtil/library/pickle-util.sml
ViewVC logotype

Annotation of /sml/trunk/src/compiler/MiscUtil/library/pickle-util.sml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 710 - (view) (download)

1 : monnier 496 (*
2 :     * This is the new "generic" pickle utility which replaces Andrew Appel's
3 :     * original "sharewrite" module. Aside from formal differences, this
4 :     * new module ended up not being any different from Andrew's. However,
5 :     * it ties in with its "unpickle" counterpart which is a *lot* better than
6 :     * its predecessor.
7 :     *
8 :     * Generated pickles tend to be a little bit smaller, which can
9 :     * probably be explained by the slightly more compact (in the common case,
10 :     * i.e. for small absolute values) integer representation.
11 :     *
12 :     * July 1999, Matthias Blume
13 : blume 710 *
14 :     * Addendum: This module now also marks as "actually being shared" those
15 :     * nodes where actual sharing has been detected. Marking is done by
16 :     * setting the high bit in the char code of the node. This means that
17 :     * char codes must be in the range [0,126] to avoid conflicts. (127
18 :     * cannot be used because setting the high bit there results in 255 --
19 :     * which is the backref code.)
20 :     * This improves unpickling time by about 25% and also reduces memory
21 :     * usage because much fewer sharing map entries have to be made during
22 :     * unpickling.
23 :     *
24 :     * October 2000, Matthias Blume
25 : monnier 496 *)
26 :    
27 :     (*
28 :     * By the way, there is no point in trying to internally use
29 :     * Word8Vector.vector instead of string for now.
30 :     * These strings participate in order comparisons (which makes
31 :     * Word8Vector.vector unsuitable). Moreover, conversion between
32 :     * string and Word8Vector.vector is currently just a cast, so it
33 :     * does not cost anything in the end.
34 :     *)
35 :     signature PICKLE_UTIL = sig
36 :    
37 :     type id
38 :    
39 :     (* Type info. Use a different number for each type constructor. *)
40 :     type tinfo = int (* negative numbers are reserved! *)
41 :    
42 :     type 'ahm pickle
43 :     type ('ahm, 'v) pickler = 'v -> 'ahm pickle
44 :    
45 :     (* $ produces the pickle for one case (constructor) of a datatype.
46 :     * The string must be one character long and the argument pickle
47 : blume 515 * should be the pickle for the constructor's arguments.
48 : monnier 496 * Use the same tinfo for all constructors of the same datatype
49 :     * and different tinfos for constructors of different types.
50 :     *
51 :     * The latter is really only important if there are constructors
52 :     * of different type who have identical argument types and use the
53 :     * same $ identificaton string. In this case the pickler might
54 :     * identify two values of different types, and as a result the
55 :     * unpickler will be very unhappy.
56 :     *
57 :     * On the other hand, if you use different tinfos for the same type,
58 :     * then nothing terrible will happen. You might lose some sharing,
59 :     * though.
60 :     *
61 :     * The string argument could theoretically be more than one character
62 :     * long. In this case the corresponding unpickling function must
63 :     * be sure to get all those characters out of the input stream.
64 :     * We actually do exploit this "feature" internally. *)
65 : blume 515 val $ : tinfo -> string * 'ahm pickle list -> 'ahm pickle
66 : monnier 496
67 :     (* "ah_share" is used to specify potential for "ad-hoc" sharing
68 : blume 515 * using the user-supplied map.
69 : blume 710 * Ad-hoc sharing is used to identify parts of the value that the
70 :     * hash-conser cannot automatically identify but which should be
71 :     * identified nevertheless, or to identify those parts that would be
72 :     * too expensive to be left to the hash-conser. *)
73 : monnier 496 val ah_share : { find : 'ahm * 'v -> id option,
74 :     insert : 'ahm * 'v * id -> 'ahm } ->
75 :     ('ahm, 'v) pickler -> ('ahm, 'v) pickler
76 :    
77 :     (* generating pickles for values of some basic types *)
78 :     val w_bool : ('ahm, bool) pickler
79 :     val w_int : ('ahm, int) pickler
80 :     val w_word : ('ahm, word) pickler
81 :     val w_int32 : ('ahm, Int32.int) pickler
82 :     val w_word32 : ('ahm, Word32.word) pickler
83 :     val w_string : ('ahm, string) pickler
84 :    
85 :     (* generating pickles for some parameterized types (given a pickler
86 :     * for the parameter) *)
87 :     val w_list : ('ahm, 'a) pickler -> ('ahm, 'a list) pickler
88 :     val w_option : ('ahm, 'a) pickler -> ('ahm, 'a option) pickler
89 :     val w_pair :
90 :     ('ahm, 'a) pickler * ('ahm, 'b) pickler -> ('ahm, 'a * 'b) pickler
91 :    
92 :     (* Pickling a "lazy" value (i.e., a thunk); the thunk will be forced
93 :     * by the pickler. Unpickling is lazy again; but, of course, that
94 :     * laziness is unrelated to the laziness of the original value. *)
95 :     val w_lazy : ('ahm, 'a) pickler -> ('ahm, unit -> 'a) pickler
96 :    
97 :     (* run the pickle, i.e., turn it into a string *)
98 :     val pickle : 'ahm -> 'ahm pickle -> string
99 :    
100 :     (* The xxx_lifter stuff is here to allow picklers to be "patched
101 :     * together". If you already have a pickler that uses a sharing map
102 :     * of type B and you want to use it as part of a bigger pickler that
103 : blume 710 * uses a sharing map of type A, you must write a (B, A) map_lifter
104 : monnier 496 * which then lets you lift the existing pickler to one that uses
105 :     * type A maps instead of its own type B maps.
106 :     *
107 :     * The idea is that B maps are really part of A maps. They can be
108 :     * extracted for the duration of using the existing pickler. Then,
109 :     * when that pickler is done, we can patch the resulting new B map
110 :     * back into the original A map to obtain a new A map. *)
111 :     type ('b_ahm, 'a_ahm) map_lifter =
112 :     { extract: 'a_ahm -> 'b_ahm, patchback: 'a_ahm * 'b_ahm -> 'a_ahm }
113 :    
114 :     val lift_pickler: ('b_ahm, 'a_ahm) map_lifter ->
115 :     ('b_ahm, 'v) pickler -> ('a_ahm, 'v) pickler
116 :     end
117 :    
118 :     structure PickleUtil :> PICKLE_UTIL = struct
119 :    
120 :     type pos = int
121 :     type id = pos
122 : blume 710 type codes = id list
123 : monnier 496 type tinfo = int
124 :    
125 : blume 710 type shareinfo = IntRedBlackSet.set
126 :     val si_empty = IntRedBlackSet.empty
127 :     val si_add = IntRedBlackSet.add
128 :     val si_list = IntRedBlackSet.listItems
129 :    
130 : monnier 496 structure HCM = RedBlackMapFn
131 :     (struct
132 :     type ord_key = string * tinfo * codes
133 :     fun compare ((c, t, l), (c', t', l')) = let
134 :     fun codesCmp ([], []) = EQUAL
135 :     | codesCmp (_ :: _, []) = GREATER
136 :     | codesCmp ([], _ :: _) = LESS
137 :     | codesCmp (h :: t, h' :: t') =
138 :     if h < h' then LESS else if h > h' then GREATER
139 :     else codesCmp (t, t')
140 :     in
141 :     if t < t' then LESS else if t > t' then GREATER
142 :     else case String.compare (c, c') of
143 :     EQUAL => codesCmp (l, l')
144 :     | unequal => unequal
145 :     end
146 :     end)
147 :    
148 : blume 569 structure PM = IntRedBlackMap
149 :    
150 : monnier 496 datatype pre_result =
151 :     STRING of string
152 :     | CONCAT of pre_result * pre_result
153 :    
154 :     fun pre_size (STRING s) = size s
155 :     | pre_size (CONCAT (p, p')) = pre_size p + pre_size p'
156 :    
157 :     val backref = STRING "\255"
158 :     val size_backref = 1
159 :     val nullbytes = STRING ""
160 :    
161 :     type hcm = id HCM.map
162 : blume 569 type fwdm = id PM.map (* forwarding map *)
163 : blume 710 type 'ahm state = hcm * fwdm * 'ahm * pos * shareinfo
164 : monnier 496
165 :     type 'ahm pickle = 'ahm state -> codes * pre_result * 'ahm state
166 :     type ('ahm, 'v) pickler = 'v -> 'ahm pickle
167 :    
168 :     infix 3 $
169 :     infixr 4 &
170 :    
171 :     fun (f & g) state = let
172 :     val (fc, fpr, state') = f state
173 :     val (gc, gpr, state'') = g state'
174 :     in
175 :     (fc @ gc, CONCAT (fpr, gpr), state'')
176 :     end
177 :    
178 : blume 515 (* collapse a non-empty list of pickles into one *)
179 :     fun collapse (h, []) = h
180 :     | collapse (h, ht :: tt) = h & collapse (ht, tt)
181 :    
182 : monnier 496 fun anyint_encode (n, negative) = let
183 :     (* this is essentially the same mechanism that's also used in
184 :     * TopLevel/batch/binfile.sml (maybe we should share it) *)
185 :     val // = LargeWord.div
186 :     val %% = LargeWord.mod
187 :     val !! = LargeWord.orb
188 :     infix // %% !!
189 :     val toW8 = Word8.fromLargeWord
190 :     fun r (0w0, l) = Word8Vector.fromList l
191 :     | r (n, l) =
192 :     r (n // 0w128, toW8 ((n %% 0w128) !! 0w128) :: l)
193 :     val lastDigit = n %% 0w64
194 :     val lastByte = if negative then lastDigit !! 0w64 else lastDigit
195 :     in
196 :     Byte.bytesToString (r (n // 0w64, [toW8 lastByte]))
197 :     end
198 :    
199 :     fun largeword_encode n = anyint_encode (n, false)
200 :     fun largeint_encode i =
201 :     if i >= 0 then anyint_encode (LargeWord.fromLargeInt i, false)
202 :     (* careful to do the negation in word domain... *)
203 :     else anyint_encode (0w0 - LargeWord.fromLargeInt i, true)
204 :    
205 :     val word32_encode = largeword_encode o Word32.toLargeWord
206 :     val word_encode = largeword_encode o Word.toLargeWord
207 :    
208 :     val int32_encode = largeint_encode o Int32.toLarge
209 :     val int_encode = largeint_encode o Int.toLarge
210 :    
211 : blume 710 fun % ti c (hcm, fwdm, ahm, next, si) = let
212 : monnier 496 val key = (c, ti, [])
213 :     in
214 :     case HCM.find (hcm, key) of
215 : blume 710 SOME i =>
216 :     ([i], STRING c, (hcm, PM.insert (fwdm, next, i),
217 :     ahm, next + size c, si))
218 :     | NONE =>
219 :     ([next], STRING c, (HCM.insert (hcm, key, next), fwdm,
220 :     ahm, next + size c, si))
221 : monnier 496 end
222 :    
223 : blume 515 fun dollar ti (c, []) state = % ti c state
224 : blume 710 | dollar ti (c, plh :: plt) (hcm, fwdm, ahm, next, si) = let
225 : blume 515 val p = collapse (plh, plt)
226 : blume 710 val (codes, pr, (hcm', fwdm', ahm', next', si')) =
227 :     p (hcm, fwdm, ahm, next + size c, si)
228 : blume 515 val key = (c, ti, codes)
229 :     in
230 :     case HCM.find (hcm, key) of
231 :     SOME i => let
232 :     val brnum = int_encode i
233 :     in
234 :     ([i], CONCAT (backref, STRING brnum),
235 : blume 569 (hcm, PM.insert (fwdm, next, i),
236 : blume 710 ahm, next + size_backref + size brnum,
237 :     si_add (si', i)))
238 : blume 515 end
239 :     | NONE =>
240 : monnier 496 ([next], CONCAT (STRING c, pr),
241 : blume 710 (HCM.insert (hcm', key, next), fwdm', ahm', next', si'))
242 : blume 515 end
243 : monnier 496
244 : blume 710 fun ah_share { find, insert } w v (hcm, fwdm, ahm, next, si) =
245 : monnier 496 case find (ahm, v) of
246 : blume 569 SOME i0 => let
247 :     val i = getOpt (PM.find (fwdm, i0), i0)
248 : monnier 496 val brnum = int_encode i
249 :     in
250 :     ([i], CONCAT (backref, STRING brnum),
251 : blume 710 (hcm, fwdm, ahm, next + size_backref + size brnum,
252 :     si_add (si, i)))
253 : monnier 496 end
254 : blume 710 | NONE => w v (hcm, fwdm, insert (ahm, v, next), next, si)
255 : monnier 496
256 : blume 710 fun w_lazy w thunk (hcm, fwdm, ahm, next, si) = let
257 : monnier 496 val v = thunk ()
258 :     (* The larger the value of trialStart, the smaller the chance that
259 :     * the loop (see below) will run more than once. However, some
260 : blume 652 * space may be wasted. 3 should avoid this most of the time.
261 :     * (Experience shows: 2 doesn't.) *)
262 :     val trialStart = 3
263 : monnier 496 (* This loop is ugly, but we don't expect it to run very often.
264 :     * It is needed because we must first write the length of the
265 :     * encoding of the thunk's value, but that encoding depends
266 :     * on the length (or rather: on the length of the length). *)
267 :     fun loop (nxt, ilen) = let
268 : blume 710 val (codes, pr, state) = w v (hcm, fwdm, ahm, nxt, si)
269 : monnier 496 val sz = pre_size pr
270 :     val ie = int_encode sz
271 :     val iesz = size ie
272 : blume 653
273 : monnier 496 (* Padding in front is better because the unpickler can
274 :     * simply discard all leading 0s and does not need to know
275 :     * about the pickler's setting of "trialStart". *)
276 :     val null = STRING "\000"
277 :     fun pad (pr, n) =
278 :     if n = 0 then pr
279 :     else pad (CONCAT (null, pr), n - 1)
280 :     in
281 : blume 653 if ilen < iesz then loop (nxt + 1, ilen + 1)
282 : monnier 496 else (codes, CONCAT (pad (STRING ie, ilen - iesz), pr), state)
283 :     end
284 :     in
285 :     loop (next + trialStart, trialStart)
286 :     end
287 :    
288 :     local
289 :     val I = ~1
290 :     val W = ~2
291 :     val I32 = ~3
292 :     val W32 = ~4
293 :     in
294 :     (* Even though the encoding could start with the
295 :     * backref character, we know that it isn't actually a backref
296 :     * because % suppresses back-references.
297 :     * Of course, this must be taken care of by unpickle-util! *)
298 :     fun w_int i = % I (int_encode i)
299 :     fun w_word w = % W (word_encode w)
300 :     fun w_int32 i32 = % I32 (int32_encode i32)
301 :     fun w_word32 w32 = % W32 (word32_encode w32)
302 :     end
303 :    
304 :     local
305 :     val L = ~5
306 :     fun chop5 l = let
307 :     fun ch (a :: b :: c :: d :: e :: r, cl) =
308 :     ch (r, (e, d, c, b, a) :: cl)
309 :     | ch (r, cl) = (rev r, cl)
310 :     in
311 :     ch (rev l, [])
312 :     end
313 :     in
314 :     fun w_list w l = let
315 :     val op $ = dollar L
316 :     fun wc [] = % L "N"
317 :     | wc ((a, b, c, d, e) :: r) =
318 : blume 515 "C" $ [w a, w b, w c, w d, w e, wc r]
319 : monnier 496 in
320 :     case chop5 l of
321 :     ([], []) => % L "0"
322 : blume 515 | ([a], []) => "1" $ [w a]
323 :     | ([a, b], []) => "2" $ [w a, w b]
324 :     | ([a, b, c], []) => "3" $ [w a, w b, w c]
325 :     | ([a, b, c, d], []) => "4" $ [w a, w b, w c, w d]
326 :     | ([], r) => "5" $ [wc r]
327 :     | ([a], r) => "6" $ [w a, wc r]
328 :     | ([a, b], r) => "7" $ [w a, w b, wc r]
329 :     | ([a, b, c], r) => "8" $ [w a, w b, w c, wc r]
330 :     | ([a, b, c, d], r) => "9" $ [w a, w b, w c, w d, wc r]
331 : monnier 496 | _ => raise Fail "PickleUtil.w_list: impossible chop"
332 :     end
333 :     end
334 :    
335 :     local
336 :     val O = ~6
337 :     in
338 :     fun w_option arg = let
339 :     val op $ = dollar O
340 :     fun wo w NONE = % O "n"
341 : blume 515 | wo w (SOME i) = "s" $ [w i]
342 : monnier 496 in
343 :     wo arg
344 :     end
345 :     end
346 :    
347 : blume 515 local
348 :     val P = ~7
349 :     in
350 :     fun w_pair (wa, wb) (a, b) = let
351 :     val op $ = dollar P
352 :     in
353 :     "p" $ [wa a, wb b]
354 :     end
355 :     end
356 : monnier 496
357 :     local
358 : blume 515 val S = ~8
359 : monnier 496 in
360 :     fun w_string s = let
361 :     val op $ = dollar S
362 :     (* The dummy_pickle is a hack to get strings to be identified
363 :     * automatically. They don't have "natural" children, so normally
364 :     * % would suppress the backref. The dummy pickle produces no
365 :     * codes and no output, but it is there to make $ believe that
366 :     * there are children. *)
367 :     fun dummy_pickle state = ([], nullbytes, state)
368 :     fun esc #"\\" = "\\\\"
369 :     | esc #"\"" = "\\\""
370 :     | esc #"\255" = "\\\255" (* need to escape backref char *)
371 :     | esc c = String.str c
372 :     in
373 : blume 710 (concat ["\"", String.translate esc s, "\""]) $ [dummy_pickle]
374 : monnier 496 end
375 :     end
376 :    
377 :     local
378 : blume 515 val B = ~9
379 : monnier 496 in
380 :     fun w_bool true = % B "t"
381 :     | w_bool false = % B "f"
382 :     end
383 :    
384 :     local
385 : blume 710 fun pr2s (pr, next, si) = let
386 :    
387 :     (* This puts a code string in front of the list of
388 :     * code strings that follow. It also takes care of
389 :     * setting the high bit where necessary (see below),
390 :     * updates the current position and the list of remaining
391 :     * shared positions. *)
392 :     fun add ("", p, h, t, l) = (p, h :: t, l)
393 :     | add (s, p, h, t, l) = let
394 :     val len = size s
395 :     val p' = p - len
396 :     in
397 :     if p' = h then let
398 :     val fst =
399 :     String.str
400 :     (Char.chr
401 :     (Char.ord (String.sub (s, 0)) + 128))
402 :     fun ret x = (p', t, x)
403 :     in
404 :     if len > 1 then
405 :     ret (fst :: String.extract (s, 1, NONE) :: l)
406 :     else ret (fst :: l)
407 :     end
408 :     else (p', h :: t, s :: l)
409 :     end
410 :    
411 :     (* Fast flattening -- when we are out of shared codes. *)
412 :     fun fflat (STRING s, l) = s :: l
413 :     | fflat (CONCAT (x, STRING s), l) = fflat (x, s :: l)
414 :     | fflat (CONCAT (x, CONCAT (y, z)), l) =
415 :     fflat (CONCAT (CONCAT (x, y), z), l)
416 :    
417 :     (* Flattening runs in linear time.
418 :     * We simultaneously use this loop to set the high bits in
419 :     * codes that correspond to shared nodes. The positions of
420 :     * these codes are given by high-to-low sorted list of
421 :     * integers. *)
422 :     fun flat (x, (_, [], l)) = fflat (x, l)
423 :     | flat (STRING s, (p, h :: t, l)) =
424 :     #3 (add (s, p, h, t, l))
425 :     | flat (CONCAT (x, STRING s), (p, h :: t, l)) =
426 :     flat (x, add (s, p, h, t, l))
427 :     | flat (CONCAT (x, CONCAT (y, z)), phtl) =
428 :     flat (CONCAT (CONCAT (x, y), z), phtl)
429 : monnier 496 in
430 : blume 710 concat (flat (pr, (next, rev (si_list si), [])))
431 : monnier 496 end
432 :     in
433 :     fun pickle emptyMap p = let
434 : blume 710 val (_, pr, (_, _, _, next, si)) =
435 :     p (HCM.empty, PM.empty, emptyMap, 0, si_empty)
436 : monnier 496 in
437 : blume 710 pr2s (pr, next, si)
438 : monnier 496 end
439 :     end
440 :    
441 :     type ('b_ahm, 'a_ahm) map_lifter =
442 :     { extract: 'a_ahm -> 'b_ahm, patchback: 'a_ahm * 'b_ahm -> 'a_ahm }
443 :    
444 : blume 710 fun lift_pickler { extract, patchback } wb b (hcm, fwdm, a_ahm, next, si) =
445 :     let val b_ahm = extract a_ahm
446 :     val (codes, pr, (hcm', fwdm', b_ahm', next', si')) =
447 :     wb b (hcm, fwdm, b_ahm, next, si)
448 :     val a_ahm' = patchback (a_ahm, b_ahm')
449 :     in
450 :     (codes, pr, (hcm', fwdm', a_ahm', next', si'))
451 :     end
452 : monnier 496
453 :     (* for export *)
454 :     nonfix $
455 :     val $ = dollar
456 :     end

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0