SCM Repository
Annotation of /sml/branches/SMLNJ/src/comp-lib/pickle-util.sml
Parent Directory
|
Revision Log
Revision 427 -
(view)
(download)
Original Path: sml/trunk/src/comp-lib/pickle-util.sml
1 : | monnier | 427 | (* |
2 : | * This is the new "generic" pickle utility which replaces Andrew Appel's | ||
3 : | * original "sharewrite" module. Aside from formal differences, this | ||
4 : | * new module ended up not being any different from Andrew's. However, | ||
5 : | * it ties in with its "unpickle" counterpart which is a *lot* better than | ||
6 : | * its predecessor. | ||
7 : | * | ||
8 : | * Generated pickles tend to be a little bit smaller, which can | ||
9 : | * probably be explained by the slightly more compact (in the common case, | ||
10 : | * i.e. for small absolute values) integer representation. | ||
11 : | * | ||
12 : | * July 1999, Matthias Blume | ||
13 : | *) | ||
14 : | |||
15 : | (* | ||
16 : | * By the way, there is no point in trying to internally use | ||
17 : | * Word8Vector.vector instead of string for now. | ||
18 : | * These strings participate in order comparisons (which makes | ||
19 : | * Word8Vector.vector unsuitable). Moreover, conversion between | ||
20 : | * string and Word8Vector.vector is currently just a cast, so it | ||
21 : | * does not cost anything in the end. | ||
22 : | *) | ||
23 : | signature PICKLE_UTIL = sig | ||
24 : | |||
25 : | type id | ||
26 : | |||
27 : | (* Type info. Use a different number for each type constructor. *) | ||
28 : | type tinfo = int (* negative numbers are reserved! *) | ||
29 : | |||
30 : | type 'ahm pickle | ||
31 : | type ('ahm, 'v) pickler = 'v -> 'ahm pickle | ||
32 : | |||
33 : | (* Combining pickles into one. The resulting compound pickle will not | ||
34 : | * automatically be subject to hash-consing. Wrap it with $ to get | ||
35 : | * that effect. *) | ||
36 : | val & : 'ahm pickle * 'ahm pickle -> 'ahm pickle | ||
37 : | |||
38 : | (* $ produces the pickle for one case (constructor) of a datatype. | ||
39 : | * The string must be one character long and the argument pickle | ||
40 : | * should be the pickle for the constructor's arguments. If there | ||
41 : | * are no arguments, then use % instead of $. | ||
42 : | * Use the same tinfo for all constructors of the same datatype | ||
43 : | * and different tinfos for constructors of different types. | ||
44 : | * | ||
45 : | * The latter is really only important if there are constructors | ||
46 : | * of different type who have identical argument types and use the | ||
47 : | * same $ identificaton string. In this case the pickler might | ||
48 : | * identify two values of different types, and as a result the | ||
49 : | * unpickler will be very unhappy. | ||
50 : | * | ||
51 : | * On the other hand, if you use different tinfos for the same type, | ||
52 : | * then nothing terrible will happen. You might lose some sharing, | ||
53 : | * though. | ||
54 : | * | ||
55 : | * The string argument could theoretically be more than one character | ||
56 : | * long. In this case the corresponding unpickling function must | ||
57 : | * be sure to get all those characters out of the input stream. | ||
58 : | * We actually do exploit this "feature" internally. *) | ||
59 : | val $ : tinfo -> string * 'ahm pickle -> 'ahm pickle | ||
60 : | val % : tinfo -> string -> 'ahm pickle | ||
61 : | |||
62 : | (* "ah_share" is used to specify potential for "ad-hoc" sharing | ||
63 : | * using the user-supplied map. It is important that ah_share is | ||
64 : | * applied to pickles constructed by $ or % but never to those | ||
65 : | * constructed by &. Ad-hoc sharing is used to break structural | ||
66 : | * cycles, to identify parts of the value that the hash-conser cannot | ||
67 : | * automatically identify but which should be identified nevertheless, | ||
68 : | * or to identify those parts that would be too expensive to be left | ||
69 : | * to the hash-conser. *) | ||
70 : | val ah_share : { find : 'ahm * 'v -> id option, | ||
71 : | insert : 'ahm * 'v * id -> 'ahm } -> | ||
72 : | ('ahm, 'v) pickler -> ('ahm, 'v) pickler | ||
73 : | |||
74 : | (* generating pickles for values of some basic types *) | ||
75 : | val w_bool : ('ahm, bool) pickler | ||
76 : | val w_int : ('ahm, int) pickler | ||
77 : | val w_word : ('ahm, word) pickler | ||
78 : | val w_int32 : ('ahm, Int32.int) pickler | ||
79 : | val w_word32 : ('ahm, Word32.word) pickler | ||
80 : | val w_string : ('ahm, string) pickler | ||
81 : | |||
82 : | (* generating pickles for some parameterized types (given a pickler | ||
83 : | * for the parameter) *) | ||
84 : | val w_list : ('ahm, 'a) pickler -> ('ahm, 'a list) pickler | ||
85 : | val w_option : ('ahm, 'a) pickler -> ('ahm, 'a option) pickler | ||
86 : | |||
87 : | (* this doesn't automatically identify (i.e., hash-cons) pairs *) | ||
88 : | val w_pair : | ||
89 : | ('ahm, 'a) pickler * ('ahm, 'b) pickler -> ('ahm, 'a * 'b) pickler | ||
90 : | |||
91 : | (* Pickling a "lazy" value (i.e., a thunk); the thunk will be forced | ||
92 : | * by the pickler. Unpickling is lazy again; but, of course, that | ||
93 : | * laziness is unrelated to the laziness of the original value. *) | ||
94 : | val w_lazy : ('ahm, 'a) pickler -> ('ahm, unit -> 'a) pickler | ||
95 : | |||
96 : | (* run the pickle, i.e., turn it into a string *) | ||
97 : | val pickle : 'ahm -> 'ahm pickle -> string | ||
98 : | |||
99 : | (* The xxx_lifter stuff is here to allow picklers to be "patched | ||
100 : | * together". If you already have a pickler that uses a sharing map | ||
101 : | * of type B and you want to use it as part of a bigger pickler that | ||
102 : | * uses a sharing map of type A, then you must write a (B, A) map_lifter | ||
103 : | * which then lets you lift the existing pickler to one that uses | ||
104 : | * type A maps instead of its own type B maps. | ||
105 : | * | ||
106 : | * The idea is that B maps are really part of A maps. They can be | ||
107 : | * extracted for the duration of using the existing pickler. Then, | ||
108 : | * when that pickler is done, we can patch the resulting new B map | ||
109 : | * back into the original A map to obtain a new A map. *) | ||
110 : | type ('b_ahm, 'a_ahm) map_lifter = | ||
111 : | { extract: 'a_ahm -> 'b_ahm, patchback: 'a_ahm * 'b_ahm -> 'a_ahm } | ||
112 : | |||
113 : | val lift_pickler: ('b_ahm, 'a_ahm) map_lifter -> | ||
114 : | ('b_ahm, 'v) pickler -> ('a_ahm, 'v) pickler | ||
115 : | end | ||
116 : | |||
117 : | structure PickleUtil :> PICKLE_UTIL = struct | ||
118 : | |||
119 : | type pos = int | ||
120 : | type id = pos | ||
121 : | type tinfo = int | ||
122 : | type codes = id list | ||
123 : | |||
124 : | structure HCM = BinaryMapFn | ||
125 : | (struct | ||
126 : | type ord_key = string * tinfo * codes | ||
127 : | fun compare ((c, t, l), (c', t', l')) = let | ||
128 : | val tinfoCmp = Int.compare | ||
129 : | fun codesCmp ([], []) = EQUAL | ||
130 : | | codesCmp (_ :: _, []) = GREATER | ||
131 : | | codesCmp ([], _ :: _) = LESS | ||
132 : | | codesCmp (h :: t, h' :: t') = | ||
133 : | case Int.compare (h, h') of | ||
134 : | EQUAL => codesCmp (t, t') | ||
135 : | | unequal => unequal | ||
136 : | in | ||
137 : | case String.compare (c, c') of | ||
138 : | EQUAL => (case tinfoCmp (t, t') of | ||
139 : | EQUAL => codesCmp (l, l') | ||
140 : | | unequal => unequal) | ||
141 : | | unequal => unequal | ||
142 : | end | ||
143 : | end) | ||
144 : | |||
145 : | datatype pre_result = | ||
146 : | STRING of string | ||
147 : | | CONCAT of pre_result * pre_result | ||
148 : | |||
149 : | fun pre_size (STRING s) = size s | ||
150 : | | pre_size (CONCAT (p, p')) = pre_size p + pre_size p' | ||
151 : | |||
152 : | val backref = STRING "\255" | ||
153 : | val size_backref = 1 | ||
154 : | val nullbytes = STRING "" | ||
155 : | |||
156 : | type hcm = id HCM.map | ||
157 : | type 'ahm state = hcm * 'ahm * pos | ||
158 : | |||
159 : | type 'ahm pickle = 'ahm state -> codes * pre_result * 'ahm state | ||
160 : | type ('ahm, 'v) pickler = 'v -> 'ahm pickle | ||
161 : | |||
162 : | infix 3 $ | ||
163 : | infixr 4 & | ||
164 : | |||
165 : | fun (f & g) state = let | ||
166 : | val (fc, fpr, state') = f state | ||
167 : | val (gc, gpr, state'') = g state' | ||
168 : | in | ||
169 : | (fc @ gc, CONCAT (fpr, gpr), state'') | ||
170 : | end | ||
171 : | |||
172 : | fun anyint_encode (n, negative) = let | ||
173 : | (* this is essentially the same mechanism that's also used in | ||
174 : | * TopLevel/batch/binfile.sml (maybe we should share it) *) | ||
175 : | val // = LargeWord.div | ||
176 : | val %% = LargeWord.mod | ||
177 : | val !! = LargeWord.orb | ||
178 : | infix // %% !! | ||
179 : | val toW8 = Word8.fromLargeWord | ||
180 : | fun r (0w0, l) = Word8Vector.fromList l | ||
181 : | | r (n, l) = | ||
182 : | r (n // 0w128, toW8 ((n %% 0w128) !! 0w128) :: l) | ||
183 : | val lastDigit = n %% 0w64 | ||
184 : | val lastByte = if negative then lastDigit !! 0w64 else lastDigit | ||
185 : | in | ||
186 : | Byte.bytesToString (r (n // 0w64, [toW8 lastByte])) | ||
187 : | end | ||
188 : | |||
189 : | fun largeword_encode n = anyint_encode (n, false) | ||
190 : | fun largeint_encode i = | ||
191 : | if i >= 0 then anyint_encode (LargeWord.fromLargeInt i, false) | ||
192 : | (* careful to do the negation in word domain... *) | ||
193 : | else anyint_encode (0w0 - LargeWord.fromLargeInt i, true) | ||
194 : | |||
195 : | val word32_encode = largeword_encode o Word32.toLargeWord | ||
196 : | val word_encode = largeword_encode o Word.toLargeWord | ||
197 : | |||
198 : | val int32_encode = largeint_encode o Int32.toLarge | ||
199 : | val int_encode = largeint_encode o Int.toLarge | ||
200 : | |||
201 : | fun % ti c (hcm, ahm, next) = let | ||
202 : | val key = (c, ti, []) | ||
203 : | in | ||
204 : | case HCM.find (hcm, key) of | ||
205 : | SOME i => ([i], STRING c, (hcm, ahm, next + size c)) | ||
206 : | | NONE => ([next], STRING c, | ||
207 : | (HCM.insert (hcm, key, next), ahm, next + size c)) | ||
208 : | end | ||
209 : | |||
210 : | fun dollar ti (c, p) (hcm, ahm, next) = let | ||
211 : | val (codes, pr, (hcm', ahm', next')) = p (hcm, ahm, next + size c) | ||
212 : | val key = (c, ti, codes) | ||
213 : | in | ||
214 : | case HCM.find (hcm, key) of | ||
215 : | SOME i => let | ||
216 : | val brnum = int_encode i | ||
217 : | in | ||
218 : | ([i], CONCAT (backref, STRING brnum), | ||
219 : | (hcm, ahm, next + size_backref + size brnum)) | ||
220 : | end | ||
221 : | | NONE => | ||
222 : | ([next], CONCAT (STRING c, pr), | ||
223 : | (HCM.insert (hcm', key, next), ahm', next')) | ||
224 : | end | ||
225 : | |||
226 : | fun ah_share { find, insert } w v (hcm, ahm, next) = | ||
227 : | case find (ahm, v) of | ||
228 : | SOME i => let | ||
229 : | val brnum = int_encode i | ||
230 : | in | ||
231 : | ([i], CONCAT (backref, STRING brnum), | ||
232 : | (hcm, ahm, next + size_backref + size brnum)) | ||
233 : | end | ||
234 : | | NONE => w v (hcm, insert (ahm, v, next), next) | ||
235 : | |||
236 : | fun w_lazy w thunk (hcm, ahm, next) = let | ||
237 : | val v = thunk () | ||
238 : | (* The larger the value of trialStart, the smaller the chance that | ||
239 : | * the loop (see below) will run more than once. However, some | ||
240 : | * space may be wasted. 2 sounds like a good compromise to me. *) | ||
241 : | val trialStart = 2 | ||
242 : | (* This loop is ugly, but we don't expect it to run very often. | ||
243 : | * It is needed because we must first write the length of the | ||
244 : | * encoding of the thunk's value, but that encoding depends | ||
245 : | * on the length (or rather: on the length of the length). *) | ||
246 : | fun loop (nxt, ilen) = let | ||
247 : | val (codes, pr, state) = w v (hcm, ahm, nxt) | ||
248 : | val sz = pre_size pr | ||
249 : | val ie = int_encode sz | ||
250 : | val iesz = size ie | ||
251 : | (* Padding in front is better because the unpickler can | ||
252 : | * simply discard all leading 0s and does not need to know | ||
253 : | * about the pickler's setting of "trialStart". *) | ||
254 : | val null = STRING "\000" | ||
255 : | fun pad (pr, n) = | ||
256 : | if n = 0 then pr | ||
257 : | else pad (CONCAT (null, pr), n - 1) | ||
258 : | in | ||
259 : | if ilen < iesz then loop (nxt + 1, ilen + 1) | ||
260 : | else (codes, CONCAT (pad (STRING ie, ilen - iesz), pr), state) | ||
261 : | end | ||
262 : | in | ||
263 : | loop (next + trialStart, trialStart) | ||
264 : | end | ||
265 : | |||
266 : | local | ||
267 : | val I = ~1 | ||
268 : | val W = ~2 | ||
269 : | val I32 = ~3 | ||
270 : | val W32 = ~4 | ||
271 : | in | ||
272 : | (* Even though the encoding could start with the | ||
273 : | * backref character, we know that it isn't actually a backref | ||
274 : | * because % suppresses back-references. | ||
275 : | * Of course, this must be taken care of by unpickle-util! *) | ||
276 : | fun w_int i = % I (int_encode i) | ||
277 : | fun w_word w = % W (word_encode w) | ||
278 : | fun w_int32 i32 = % I32 (int32_encode i32) | ||
279 : | fun w_word32 w32 = % W32 (word32_encode w32) | ||
280 : | end | ||
281 : | |||
282 : | local | ||
283 : | val L = ~5 | ||
284 : | fun chop5 l = let | ||
285 : | fun ch (a :: b :: c :: d :: e :: r, cl) = | ||
286 : | ch (r, (e, d, c, b, a) :: cl) | ||
287 : | | ch (r, cl) = (rev r, cl) | ||
288 : | in | ||
289 : | ch (rev l, []) | ||
290 : | end | ||
291 : | in | ||
292 : | fun w_list w l = let | ||
293 : | val op $ = dollar L | ||
294 : | fun wc [] = % L "N" | ||
295 : | | wc ((a, b, c, d, e) :: r) = | ||
296 : | "C" $ w a & w b & w c & w d & w e & wc r | ||
297 : | in | ||
298 : | case chop5 l of | ||
299 : | ([], []) => % L "0" | ||
300 : | | ([a], []) => "1" $ w a | ||
301 : | | ([a, b], []) => "2" $ w a & w b | ||
302 : | | ([a, b, c], []) => "3" $ w a & w b & w c | ||
303 : | | ([a, b, c, d], []) => "4" $ w a & w b & w c & w d | ||
304 : | | ([], r) => "5" $ wc r | ||
305 : | | ([a], r) => "6" $ w a & wc r | ||
306 : | | ([a, b], r) => "7" $ w a & w b & wc r | ||
307 : | | ([a, b, c], r) => "8" $ w a & w b & w c & wc r | ||
308 : | | ([a, b, c, d], r) => "9" $ w a & w b & w c & w d & wc r | ||
309 : | | _ => raise Fail "PickleUtil.w_list: impossible chop" | ||
310 : | end | ||
311 : | end | ||
312 : | |||
313 : | local | ||
314 : | val O = ~6 | ||
315 : | in | ||
316 : | fun w_option arg = let | ||
317 : | val op $ = dollar O | ||
318 : | fun wo w NONE = % O "n" | ||
319 : | | wo w (SOME i) = "s" $ w i | ||
320 : | in | ||
321 : | wo arg | ||
322 : | end | ||
323 : | end | ||
324 : | |||
325 : | fun w_pair (wa, wb) (a, b) = wa a & wb b | ||
326 : | |||
327 : | local | ||
328 : | val S = ~7 | ||
329 : | in | ||
330 : | fun w_string s = let | ||
331 : | val op $ = dollar S | ||
332 : | (* The dummy_pickle is a hack to get strings to be identified | ||
333 : | * automatically. They don't have "natural" children, so normally | ||
334 : | * % would suppress the backref. The dummy pickle produces no | ||
335 : | * codes and no output, but it is there to make $ believe that | ||
336 : | * there are children. *) | ||
337 : | fun dummy_pickle state = ([], nullbytes, state) | ||
338 : | fun esc #"\\" = "\\\\" | ||
339 : | | esc #"\"" = "\\\"" | ||
340 : | | esc #"\255" = "\\\255" (* need to escape backref char *) | ||
341 : | | esc c = String.str c | ||
342 : | in | ||
343 : | (String.translate esc s ^ "\"") $ dummy_pickle | ||
344 : | end | ||
345 : | end | ||
346 : | |||
347 : | local | ||
348 : | val B = ~8 | ||
349 : | in | ||
350 : | fun w_bool true = % B "t" | ||
351 : | | w_bool false = % B "f" | ||
352 : | end | ||
353 : | |||
354 : | local | ||
355 : | fun pr2s pr = let | ||
356 : | fun flat (STRING s, l) = s :: l | ||
357 : | | flat (CONCAT (x, STRING s), l) = flat (x, s :: l) | ||
358 : | | flat (CONCAT (x, CONCAT (y, z)), l) = | ||
359 : | flat (CONCAT (CONCAT (x, y), z), l) | ||
360 : | in | ||
361 : | concat (flat (pr, [])) | ||
362 : | end | ||
363 : | in | ||
364 : | fun pickle emptyMap p = let | ||
365 : | val (_, pr, _) = p (HCM.empty, emptyMap, 0) | ||
366 : | in | ||
367 : | pr2s pr | ||
368 : | end | ||
369 : | end | ||
370 : | |||
371 : | type ('b_ahm, 'a_ahm) map_lifter = | ||
372 : | { extract: 'a_ahm -> 'b_ahm, patchback: 'a_ahm * 'b_ahm -> 'a_ahm } | ||
373 : | |||
374 : | fun lift_pickler { extract, patchback } wb b (hcm, a_ahm, next) = let | ||
375 : | val b_ahm = extract a_ahm | ||
376 : | val (codes, pr, (hcm', b_ahm', next')) = wb b (hcm, b_ahm, next) | ||
377 : | val a_ahm' = patchback (a_ahm, b_ahm') | ||
378 : | in | ||
379 : | (codes, pr, (hcm', a_ahm', next')) | ||
380 : | end | ||
381 : | |||
382 : | (* for export *) | ||
383 : | nonfix $ | ||
384 : | val $ = dollar | ||
385 : | end |
root@smlnj-gforge.cs.uchicago.edu | ViewVC Help |
Powered by ViewVC 1.0.0 |