SCM Repository
Annotation of /sml/branches/SMLNJ/src/comp-lib/unpickle-util.sml
Parent Directory
|
Revision Log
Revision 428 - (view) (download)
1 : | monnier | 427 | (* |
2 : | * This is the new "generic" unpickle utility. It replaces Andrew Appel's | ||
3 : | * original "shareread" module. The main difference is that instead of | ||
4 : | * using a "universal" type together with numerous injections and projections | ||
5 : | * it uses separate maps. This approach proves to be a lot more light-weight. | ||
6 : | * | ||
7 : | * The benefits are: | ||
8 : | * - no projections, hence no "match nonexhaustive" warnings and... | ||
9 : | * - ...no additional run-time overhead (all checking is done during | ||
10 : | * the map membership test which is common to both implementations) | ||
11 : | * - no necessity for those many tiny "fn"-functions that litter the | ||
12 : | * original code, resulting in... | ||
13 : | * - ...a more "natural" style for writing the actual unpickling code | ||
14 : | * that makes for shorter source code | ||
15 : | * - ...a lot less generated machine code (less than 50% of the original | ||
16 : | * version) | ||
17 : | * - ...slightly faster operation (around 15% speedup) | ||
18 : | * (My guess is that it is a combination of fewer projections and | ||
19 : | * fewer generated thunks that make the code run faster.) | ||
20 : | * | ||
21 : | * July 1999, Matthias Blume | ||
22 : | *) | ||
23 : | signature UNPICKLE_UTIL = sig | ||
24 : | |||
25 : | exception Format | ||
26 : | |||
27 : | type 'v map (* one for each type *) | ||
28 : | type session (* encapsulates unpickling state *) | ||
29 : | |||
30 : | (* Make a type-specific sharing map using "mkMap". | ||
31 : | * | ||
32 : | * Be sure to create such maps only locally, otherwise you have a | ||
33 : | * space leak. | ||
34 : | * | ||
35 : | * The ML type system will prevent you from accidentially using the | ||
36 : | * same map for different types, so don't worry. But using TOO MANY | ||
37 : | * maps (i.e., more than one map for the same type) will likely | ||
38 : | * cause problems because the unpickler might try to look for a | ||
39 : | * back reference that is in a different map than the one where the | ||
40 : | * value is actually registered. | ||
41 : | * | ||
42 : | * By the way, this warning is not unique to the many-maps approach. | ||
43 : | * The same thing would happen with the original "universal domain" | ||
44 : | * unpickler if you declare two different constructors for the | ||
45 : | * same type. Given that there are about 100 types (and thus | ||
46 : | * 100 constructors or maps) in the SML/NJ environment pickler, | ||
47 : | * the possibility for such a mistake is not to be dismissed. *) | ||
48 : | val mkMap : unit -> 'v map | ||
49 : | |||
50 : | type 'v reader = unit -> 'v | ||
51 : | |||
52 : | (* A "charGetter" is the mechanism that gets actual characters from | ||
53 : | * the pickle. For ordinary pickles, the unpickler will never call | ||
54 : | * "seek". Moreover, the same is true if you read the pickles created | ||
55 : | * by pickleN sequentially from the first to the last (i.e., not | ||
56 : | * "out-of-order"). "cur" determines the current position and must be | ||
57 : | * implemented. *) | ||
58 : | type charGetter = | ||
59 : | { read: char reader, seek: int -> unit, cur: unit -> int } | ||
60 : | |||
61 : | (* the string is the pickle string *) | ||
62 : | val stringGetter : string -> charGetter | ||
63 : | |||
64 : | (* open the unpickling session - everything is parameterized by this; | ||
65 : | * the charGetter provides the bytes of the pickle *) | ||
66 : | val mkSession : charGetter -> session | ||
67 : | |||
68 : | (* The typical style is to write a "reader" function for each type | ||
69 : | * The reader function uses a local helper function which takes the | ||
70 : | * first character of a pickle (this is usually the discriminator that | ||
71 : | * was given to $ or % in the pickler) and returns the unpickled | ||
72 : | * value. The function recursively calls other "reader" functions. | ||
73 : | * To actually get the value from the pickle, pass the helper | ||
74 : | * to "share" -- together with the current session and the | ||
75 : | * type-specific map. "share" will take care of back-references | ||
76 : | * (using the map) and pass the first character to your helper | ||
77 : | * when necessary. The standard pattern for writing a "t reader" | ||
78 : | * therefore is: | ||
79 : | * | ||
80 : | * val session = UnpickleUtil.mkSession pickle | ||
81 : | * fun share m f = UnpickleUtil.share session m f | ||
82 : | * ... | ||
83 : | * val t_map = Unpickleutil.mkMap () | ||
84 : | * ... | ||
85 : | * fun r_t () = let | ||
86 : | * fun t #"a" = ... (* case a *) | ||
87 : | * | t #"b" = ... (* case b *) | ||
88 : | * ... | ||
89 : | * | _ = raise UnpickleUtil.Format | ||
90 : | * in | ||
91 : | * share t_map t | ||
92 : | * end | ||
93 : | *) | ||
94 : | val share : session -> 'v map -> (char -> 'v) -> 'v | ||
95 : | |||
96 : | (* If you know that you don't need a map because there can be no | ||
97 : | * sharing (typically if you didn't use any $ but only % for pickling | ||
98 : | * your type), then you can use "nonshare" instead of "share". *) | ||
99 : | val nonshare : session -> (char -> 'v) -> 'v | ||
100 : | |||
101 : | (* making readers for some common types *) | ||
102 : | val r_int : session -> int reader | ||
103 : | val r_int32 : session -> Int32.int reader | ||
104 : | val r_word : session -> word reader | ||
105 : | val r_word32 : session -> Word32.word reader | ||
106 : | val r_bool : session -> bool reader | ||
107 : | val r_string : session -> string reader | ||
108 : | |||
109 : | (* readers for parametric types need their own map *) | ||
110 : | val r_list : session -> 'v list map -> 'v reader -> 'v list reader | ||
111 : | val r_option : session -> 'v option map -> 'v reader -> 'v option reader | ||
112 : | |||
113 : | (* pairs are not shared, so we don't need a map here *) | ||
114 : | val r_pair : 'a reader * 'b reader -> ('a * 'b) reader | ||
115 : | |||
116 : | (* The laziness generated here is in the unpickling. In other words | ||
117 : | * unpickling state is not discarded until the last lazy value has been | ||
118 : | * forced. *) | ||
119 : | val r_lazy : session -> 'a reader -> (unit -> 'a) reader | ||
120 : | end | ||
121 : | |||
122 : | structure UnpickleUtil :> UNPICKLE_UTIL = struct | ||
123 : | |||
124 : | exception Format | ||
125 : | |||
126 : | type 'v map = ('v * int) IntBinaryMap.map ref | ||
127 : | type state = string map | ||
128 : | |||
129 : | type 'v reader = unit -> 'v | ||
130 : | |||
131 : | type charGetter = | ||
132 : | { read: char reader, seek: int -> unit, cur: unit -> int } | ||
133 : | |||
134 : | type session = { state: state, getter: charGetter } | ||
135 : | |||
136 : | fun mkMap () = ref IntBinaryMap.empty | ||
137 : | |||
138 : | fun stringGetter pstring = let | ||
139 : | val pos = ref 0 | ||
140 : | fun rd () = let | ||
141 : | val p = !pos | ||
142 : | in | ||
143 : | pos := p + 1; | ||
144 : | String.sub (pstring, p) handle Subscript => raise Format | ||
145 : | end | ||
146 : | fun sk p = pos := p | ||
147 : | fun cur () = !pos | ||
148 : | in | ||
149 : | { read = rd, seek = sk, cur = cur } | ||
150 : | end | ||
151 : | |||
152 : | local | ||
153 : | fun f_anyint rd () = let | ||
154 : | val & = Word8.andb | ||
155 : | infix & | ||
156 : | val large = Word8.toLargeWord | ||
157 : | fun loop n = let | ||
158 : | val w8 = Byte.charToByte (rd ()) | ||
159 : | in | ||
160 : | if (w8 & 0w128) = 0w0 then | ||
161 : | (n * 0w64 + large (w8 & 0w63), (w8 & 0w64) <> 0w0) | ||
162 : | else loop (n * 0w128 + large (w8 & 0w127)) | ||
163 : | end | ||
164 : | in | ||
165 : | loop 0w0 | ||
166 : | end | ||
167 : | |||
168 : | fun f_largeword cvt rd () = | ||
169 : | case f_anyint rd () of | ||
170 : | (w, false) => (cvt w handle _ => raise Format) | ||
171 : | | _ => raise Format | ||
172 : | |||
173 : | fun f_largeint cvt rd () = let | ||
174 : | val (w, negative) = f_anyint rd () | ||
175 : | val i = LargeWord.toLargeInt w handle _ => raise Format | ||
176 : | in | ||
177 : | (if negative then cvt (~i) else cvt i) | ||
178 : | handle _ => raise Format | ||
179 : | end | ||
180 : | in | ||
181 : | val f_int = f_largeint Int.fromLarge | ||
182 : | val f_int32 = f_largeint Int32.fromLarge | ||
183 : | val f_word = f_largeword Word.fromLargeWord | ||
184 : | val f_word32 = f_largeword Word32.fromLargeWord | ||
185 : | end | ||
186 : | |||
187 : | fun mkSession charGetter = | ||
188 : | ({ state = mkMap (), getter = charGetter }: session) | ||
189 : | |||
190 : | fun share { state, getter = { read, seek, cur } } m r = let | ||
191 : | fun firsttime (pos, c) = let | ||
192 : | val v = r c | ||
193 : | val pos' = cur () | ||
194 : | in | ||
195 : | m := IntBinaryMap.insert (!m, pos, (v, pos')); | ||
196 : | v | ||
197 : | end | ||
198 : | in | ||
199 : | case read () of | ||
200 : | #"\255" => let | ||
201 : | val pos = f_int read () | ||
202 : | in | ||
203 : | case IntBinaryMap.find (!m, pos) of | ||
204 : | SOME (v, _) => v | ||
205 : | | NONE => let | ||
206 : | val here = cur () | ||
207 : | in | ||
208 : | seek pos; | ||
209 : | (* It is ok to use "read" here because | ||
210 : | * there won't be back-references that jump | ||
211 : | * to other back-references. *) | ||
212 : | firsttime (pos, read()) | ||
213 : | before seek here | ||
214 : | end | ||
215 : | end | ||
216 : | | c => let | ||
217 : | (* Must subtract one to get back in front of c. *) | ||
218 : | val pos = cur () - 1 | ||
219 : | in | ||
220 : | case IntBinaryMap.find (!m, pos) of | ||
221 : | SOME (v, pos') => (seek pos'; v) | ||
222 : | | NONE => firsttime (pos, c) | ||
223 : | end | ||
224 : | end | ||
225 : | |||
226 : | (* "nonshare" gets around backref detection. Certain integer | ||
227 : | * encodings may otherwise be mis-identified as back references. | ||
228 : | * Moreover, unlike in the case of "share" we don't need a map | ||
229 : | * for "nonshare". This could be used as an optimization for | ||
230 : | * types that are known to never be shared anyway (bool, etc.). *) | ||
231 : | fun nonshare (s: session) f = f (#read (#getter s) ()) | ||
232 : | |||
233 : | local | ||
234 : | fun f2r f_x (s: session) = f_x (#read (#getter s)) | ||
235 : | in | ||
236 : | val r_int = f2r f_int | ||
237 : | val r_int32 = f2r f_int32 | ||
238 : | val r_word = f2r f_word | ||
239 : | val r_word32 = f2r f_word32 | ||
240 : | end | ||
241 : | |||
242 : | fun r_lazy session r () = let | ||
243 : | val memo = ref (fn () => raise Fail "UnpickleUtil.r_lazy") | ||
244 : | val { getter = { cur, seek, ... }, ... } = session | ||
245 : | (* the size may have leading 0s because of padding *) | ||
246 : | fun getSize () = let | ||
247 : | val sz = r_int session () | ||
248 : | in | ||
249 : | if sz = 0 then getSize () else sz | ||
250 : | end | ||
251 : | val sz = getSize () (* size of v *) | ||
252 : | val start = cur () (* start of v *) | ||
253 : | fun thunk () = let | ||
254 : | val wherever = cur () (* remember where we are now *) | ||
255 : | val _ = seek start (* go to start of v *) | ||
256 : | val v = r () (* read v *) | ||
257 : | in | ||
258 : | seek wherever; (* go back to where we were *) | ||
259 : | memo := (fn () => v); (* memoize *) | ||
260 : | v | ||
261 : | end | ||
262 : | in | ||
263 : | memo := thunk; | ||
264 : | seek (start + sz); (* as if we had read the value *) | ||
265 : | (fn () => !memo ()) | ||
266 : | end | ||
267 : | |||
268 : | fun r_list session m r () = let | ||
269 : | fun r_chops () = let | ||
270 : | fun rcl #"N" = [] | ||
271 : | | rcl #"C" = r () :: r () :: r () :: r () :: r () :: r_chops () | ||
272 : | | rcl _ = raise Format | ||
273 : | in | ||
274 : | share session m rcl | ||
275 : | end | ||
276 : | fun rl #"0" = [] | ||
277 : | | rl #"1" = [r ()] | ||
278 : | | rl #"2" = [r (), r ()] | ||
279 : | | rl #"3" = [r (), r (), r ()] | ||
280 : | | rl #"4" = [r (), r (), r (), r ()] | ||
281 : | | rl #"5" = r_chops () | ||
282 : | | rl #"6" = r () :: r_chops () | ||
283 : | | rl #"7" = r () :: r () :: r_chops () | ||
284 : | | rl #"8" = r () :: r () :: r () :: r_chops () | ||
285 : | | rl #"9" = r () :: r () :: r () :: r () :: r_chops () | ||
286 : | | rl _ = raise Format | ||
287 : | in | ||
288 : | share session m rl | ||
289 : | end | ||
290 : | |||
291 : | fun r_option session m r () = let | ||
292 : | fun ro #"n" = NONE | ||
293 : | | ro #"s" = SOME (r ()) | ||
294 : | | ro _ = raise Format | ||
295 : | in | ||
296 : | share session m ro | ||
297 : | end | ||
298 : | |||
299 : | fun r_pair (r_a, r_b) () = (r_a (), r_b ()) | ||
300 : | |||
301 : | fun r_bool session () = let | ||
302 : | fun rb #"t" = true | ||
303 : | | rb #"f" = false | ||
304 : | | rb _ = raise Format | ||
305 : | in | ||
306 : | nonshare session rb | ||
307 : | end | ||
308 : | |||
309 : | fun r_string session () = let | ||
310 : | val { state = m, getter } = session | ||
311 : | val { read, ... } = getter | ||
312 : | fun rs c = let | ||
313 : | fun loop (l, #"\"") = String.implode (rev l) | ||
314 : | | loop (l, #"\\") = loop (read () :: l, read ()) | ||
315 : | | loop (l, c) = loop (c :: l, read ()) | ||
316 : | in | ||
317 : | loop ([], c) | ||
318 : | end | ||
319 : | in | ||
320 : | share session m rs | ||
321 : | end | ||
322 : | end |
root@smlnj-gforge.cs.uchicago.edu | ViewVC Help |
Powered by ViewVC 1.0.0 |