Home My Page Projects Code Snippets Project Openings SML/NJ
Summary Activity Forums Tracker Lists Tasks Docs Surveys News SCM Files

SCM Repository

[smlnj] View of /sml/trunk/src/compiler/MiscUtil/library/pickle-util.sml
ViewVC logotype

View of /sml/trunk/src/compiler/MiscUtil/library/pickle-util.sml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 569 - (download) (annotate)
Tue Mar 7 04:01:07 2000 UTC (19 years, 4 months ago) by blume
File size: 13101 byte(s)
- size info in BOOTLIST
     * no fixed upper limits for number of bootfiles or length of
       bootfile names in runtime
     * falling back to old behavior if no BOOTLIST size info found
- allocation size heuristics in .run-sml
     * tries to read cache size from /proc/cpuinfo (this is important for
        small-cache Celeron systems!)
- install.sh robustified
- CM manual updates
- paranoid mode
     * no more CMB.deliver() (i.e., all done by CMB.make())
     * can re-use existing sml.boot.* files
     * init.cmi now treated as library
     * library stamps for consistency checks
- sml.boot.<arch>-<os>/PIDMAP file
     * This file is read by the CM startup code.  This is used to minimize
       the amount of dynamic state that needs to be stowed away for the
       purpose of sharing between interactive system and user code.
- CM.Anchor.anchor instead of CM.Anchor.{set,cancel}
     * Upon request by Elsa.  Anchors now controlled by get-set-pair
       like most other CM state variables.
- Compiler.CMSA eliminated
     * No longer supported by CM anyway.
- fixed bugs in pickler that kept biting Stefan
     * past refs to past refs (was caused by the possibility that
       ad-hoc sharing is more discriminating than hash-cons sharing)
     * integer overflow on LargeInt.minInt
- ml-{lex,yacc} build scripts now use new mechanism
  for building standalone programs
- fixed several gcc -Wall warnings that were caused by missing header
  files, missing initializations, etc., in runtime (not all warnings
  eliminated, though)
(*
 * This is the new "generic" pickle utility which replaces Andrew Appel's
 * original "sharewrite" module.  Aside from formal differences, this
 * new module ended up not being any different from Andrew's.  However,
 * it ties in with its "unpickle" counterpart which is a *lot* better than
 * its predecessor.
 *
 * Generated pickles tend to be a little bit smaller, which can
 * probably be explained by the slightly more compact (in the common case,
 * i.e. for small absolute values) integer representation.
 *
 * July 1999, Matthias Blume
 *)

(*
 * By the way, there is no point in trying to internally use
 * Word8Vector.vector instead of string for now.
 * These strings participate in order comparisons (which makes
 * Word8Vector.vector unsuitable).  Moreover, conversion between
 * string and Word8Vector.vector is currently just a cast, so it
 * does not cost anything in the end.
 *)
signature PICKLE_UTIL = sig

    type id

    (* Type info.  Use a different number for each type constructor. *)
    type tinfo = int			(* negative numbers are reserved! *)

    type 'ahm pickle
    type ('ahm, 'v) pickler = 'v -> 'ahm pickle

    (* $ produces the pickle for one case (constructor) of a datatype.
     * The string must be one character long and the argument pickle
     * should be the pickle for the constructor's arguments.
     * Use the same tinfo for all constructors of the same datatype
     * and different tinfos for constructors of different types.
     *
     * The latter is really only important if there are constructors
     * of different type who have identical argument types and use the
     * same $ identificaton string.  In this case the pickler might
     * identify two values of different types, and as a result the
     * unpickler will be very unhappy.
     *
     * On the other hand, if you use different tinfos for the same type,
     * then nothing terrible will happen.  You might lose some sharing,
     * though.
     *
     * The string argument could theoretically be more than one character
     * long.  In this case the corresponding unpickling function must
     * be sure to get all those characters out of the input stream.
     * We actually do exploit this "feature" internally. *)
    val $ : tinfo -> string * 'ahm pickle list -> 'ahm pickle

    (* "ah_share" is used to specify potential for "ad-hoc" sharing
     * using the user-supplied map.
     * Ad-hoc sharing is used to break structural
     * cycles, to identify parts of the value that the hash-conser cannot
     * automatically identify but which should be identified nevertheless,
     * or to identify those parts that would be too expensive to be left
     * to the hash-conser. *)
    val ah_share : { find : 'ahm * 'v -> id option,
		     insert : 'ahm * 'v * id -> 'ahm } ->
        ('ahm, 'v) pickler -> ('ahm, 'v) pickler

    (* generating pickles for values of some basic types *)
    val w_bool : ('ahm, bool) pickler
    val w_int : ('ahm, int) pickler
    val w_word : ('ahm, word) pickler
    val w_int32 : ('ahm, Int32.int) pickler
    val w_word32 : ('ahm, Word32.word) pickler
    val w_string : ('ahm, string) pickler

    (* generating pickles for some parameterized types (given a pickler
     * for the parameter) *)
    val w_list : ('ahm, 'a) pickler -> ('ahm, 'a list) pickler
    val w_option : ('ahm, 'a) pickler -> ('ahm, 'a option) pickler

    (* this doesn't automatically identify (i.e., hash-cons) pairs *)
    val w_pair :
	('ahm, 'a) pickler * ('ahm, 'b) pickler -> ('ahm, 'a * 'b) pickler

    (* Pickling a "lazy" value (i.e., a thunk);  the thunk will be forced
     * by the pickler. Unpickling is lazy again; but, of course, that
     * laziness is unrelated to the laziness of the original value. *)
    val w_lazy : ('ahm, 'a) pickler -> ('ahm, unit -> 'a) pickler

    (* run the pickle, i.e., turn it into a string *)
    val pickle : 'ahm -> 'ahm pickle -> string

    (* The xxx_lifter stuff is here to allow picklers to be "patched
     * together".  If you already have a pickler that uses a sharing map
     * of type B and you want to use it as part of a bigger pickler that
     * uses a sharing map of type A, then you must write a (B, A) map_lifter
     * which then lets you lift the existing pickler to one that uses
     * type A maps instead of its own type B maps.
     *
     * The idea is that B maps are really part of A maps. They can be
     * extracted for the duration of using the existing pickler.  Then,
     * when that pickler is done, we can patch the resulting new B map
     * back into the original A map to obtain a new A map. *)
    type ('b_ahm, 'a_ahm) map_lifter =
	{ extract: 'a_ahm -> 'b_ahm, patchback: 'a_ahm * 'b_ahm -> 'a_ahm }

    val lift_pickler: ('b_ahm, 'a_ahm) map_lifter ->
	('b_ahm, 'v) pickler -> ('a_ahm, 'v) pickler
end

structure PickleUtil :> PICKLE_UTIL = struct

    type pos = int
    type id = pos
    type tinfo = int
    type codes = id list

    structure HCM = RedBlackMapFn
  	(struct
  	    type ord_key = string * tinfo * codes
  	    fun compare ((c, t, l), (c', t', l')) = let
  		fun codesCmp ([], []) = EQUAL
  		  | codesCmp (_ :: _, []) = GREATER
  		  | codesCmp ([], _ :: _) = LESS
  		  | codesCmp (h :: t, h' :: t') =
		    if h < h' then LESS else if h > h' then GREATER
		    else codesCmp (t, t')
  	    in
		if t < t' then LESS else if t > t' then GREATER
		else case String.compare (c, c') of
		    EQUAL => codesCmp (l, l')
  		  | unequal => unequal
  	    end
  	end)

    structure PM = IntRedBlackMap

    datatype pre_result =
	STRING of string
      | CONCAT of pre_result * pre_result

    fun pre_size (STRING s) = size s
      | pre_size (CONCAT (p, p')) = pre_size p + pre_size p'

    val backref = STRING "\255"
    val size_backref = 1
    val nullbytes = STRING ""

    type hcm = id HCM.map
    type fwdm = id PM.map		(* forwarding map *)
    type 'ahm state = hcm * fwdm * 'ahm * pos

    type 'ahm pickle = 'ahm state -> codes * pre_result * 'ahm state
    type ('ahm, 'v) pickler = 'v -> 'ahm pickle

    infix 3 $
    infixr 4 &

    fun (f & g) state = let
	val (fc, fpr, state') = f state
	val (gc, gpr, state'') = g state'
    in
	(fc @ gc, CONCAT (fpr, gpr), state'')
    end

    (* collapse a non-empty list of pickles into one *)
    fun collapse (h, []) = h
      | collapse (h, ht :: tt) = h & collapse (ht, tt)

    fun anyint_encode (n, negative) = let
	(* this is essentially the same mechanism that's also used in
	 * TopLevel/batch/binfile.sml (maybe we should share it) *)
	val // = LargeWord.div
	val %% = LargeWord.mod
	val !! = LargeWord.orb
	infix // %% !!
	val toW8 = Word8.fromLargeWord
	fun r (0w0, l) = Word8Vector.fromList l
	  | r (n, l) =
	    r (n // 0w128, toW8 ((n %% 0w128) !! 0w128) :: l)
	val lastDigit = n %% 0w64
	val lastByte = if negative then lastDigit !! 0w64 else lastDigit
    in
	Byte.bytesToString (r (n // 0w64, [toW8 lastByte]))
    end

    fun largeword_encode n = anyint_encode (n, false)
    fun largeint_encode i =
	if i >= 0 then anyint_encode (LargeWord.fromLargeInt i, false)
	(* careful to do the negation in word domain... *)
	else anyint_encode (0w0 - LargeWord.fromLargeInt i, true)

    val word32_encode = largeword_encode o Word32.toLargeWord
    val word_encode = largeword_encode o Word.toLargeWord

    val int32_encode = largeint_encode o Int32.toLarge
    val int_encode = largeint_encode o Int.toLarge

    fun % ti c (hcm, fwdm, ahm, next) = let
	val key = (c, ti, [])
    in
	case HCM.find (hcm, key) of
	    SOME i => ([i], STRING c, (hcm, PM.insert (fwdm, next, i),
				       ahm, next + size c))
	  | NONE => ([next], STRING c,
		     (HCM.insert (hcm, key, next), fwdm, ahm, next + size c))
    end

    fun dollar ti (c, []) state = % ti c state
      | dollar ti (c, plh :: plt) (hcm, fwdm, ahm, next) = let
	    val p = collapse (plh, plt)
	    val (codes, pr, (hcm', fwdm', ahm', next')) =
		p (hcm, fwdm, ahm, next + size c)
	    val key = (c, ti, codes)
	in
	    case HCM.find (hcm, key) of
		SOME i => let
		    val brnum = int_encode i
		in
		    ([i], CONCAT (backref, STRING brnum),
		     (hcm, PM.insert (fwdm, next, i),
		      ahm, next + size_backref + size brnum))
		end
	      | NONE =>
		([next], CONCAT (STRING c, pr),
		 (HCM.insert (hcm', key, next), fwdm', ahm', next'))
	end

    fun ah_share { find, insert } w v (hcm, fwdm, ahm, next) =
	case find (ahm, v) of
	    SOME i0 => let
		val i = getOpt (PM.find (fwdm, i0), i0)
		val brnum = int_encode i
	    in
		([i], CONCAT (backref, STRING brnum),
		 (hcm, fwdm, ahm, next + size_backref + size brnum))
	    end
	  | NONE => w v (hcm, fwdm, insert (ahm, v, next), next)

    fun w_lazy w thunk (hcm, fwdm, ahm, next) = let
	val v = thunk ()
	(* The larger the value of trialStart, the smaller the chance that
	 * the loop (see below) will run more than once.  However, some
	 * space may be wasted.  2 sounds like a good compromise to me. *)
	val trialStart = 2
	(* This loop is ugly, but we don't expect it to run very often.
	 * It is needed because we must first write the length of the
	 * encoding of the thunk's value, but that encoding depends
	 * on the length (or rather: on the length of the length). *)
	fun loop (nxt, ilen) = let
	    val (codes, pr, state) = w v (hcm, fwdm, ahm, nxt)
	    val sz = pre_size pr
	    val ie = int_encode sz
	    val iesz = size ie
	    (* Padding in front is better because the unpickler can
	     * simply discard all leading 0s and does not need to know
	     * about the pickler's setting of "trialStart". *)
	    val null = STRING "\000"
	    fun pad (pr, n) =
		if n = 0 then pr
		else pad (CONCAT (null, pr), n - 1)
	in
	    if ilen < iesz then loop (nxt + 1, ilen + 1)
	    else (codes, CONCAT (pad (STRING ie, ilen - iesz), pr), state)
	end
    in
	loop (next + trialStart, trialStart)
    end

    local
	val I = ~1
	val W = ~2
	val I32 = ~3
	val W32 = ~4
    in
	(* Even though the encoding could start with the
	 * backref character, we know that it isn't actually a backref
	 * because % suppresses back-references.
	 * Of course, this must be taken care of by unpickle-util! *)
	fun w_int i = % I (int_encode i)
	fun w_word w = % W (word_encode w)
	fun w_int32 i32 = % I32 (int32_encode i32)
	fun w_word32 w32 = % W32 (word32_encode w32)
    end

    local
	val L = ~5
	fun chop5 l = let
	    fun ch (a :: b :: c :: d :: e :: r, cl) =
		ch (r, (e, d, c, b, a) :: cl)
	      | ch (r, cl) = (rev r, cl)
	in
	    ch (rev l, [])
	end
    in
	fun w_list w l = let
	    val op $ = dollar L
	    fun wc [] = % L "N"
	      | wc ((a, b, c, d, e) :: r) =
		"C" $ [w a, w b, w c, w d, w e, wc r]
	in
	    case chop5 l of
		([], []) => % L "0"
	      | ([a], []) => "1" $ [w a]
	      | ([a, b], []) => "2" $ [w a, w b]
	      | ([a, b, c], []) => "3" $ [w a, w b, w c]
	      | ([a, b, c, d], []) => "4" $ [w a, w b, w c, w d]
	      | ([], r) => "5" $ [wc r]
	      | ([a], r) => "6" $ [w a, wc r]
	      | ([a, b], r) => "7" $ [w a, w b, wc r]
	      | ([a, b, c], r) => "8" $ [w a, w b, w c, wc r]
	      | ([a, b, c, d], r) => "9" $ [w a, w b, w c, w d, wc r]
	      | _ => raise Fail "PickleUtil.w_list: impossible chop"
	end
    end

    local
	val O = ~6
    in
	fun w_option arg = let
	    val op $ = dollar O
	    fun wo w NONE = % O "n"
	      | wo w (SOME i) = "s" $ [w i]
	in
	    wo arg
	end
    end

    local
	val P = ~7
    in
	fun w_pair (wa, wb) (a, b) = let
	    val op $ = dollar P
	in
	    "p" $ [wa a, wb b]
	end
    end

    local
	val S = ~8
    in
	fun w_string s = let
	    val op $ = dollar S
	    (* The dummy_pickle is a hack to get strings to be identified
	     * automatically. They don't have "natural" children, so normally
	     * % would suppress the backref.  The dummy pickle produces no
	     * codes and no output, but it is there to make $ believe that
	     * there are children. *)
	    fun dummy_pickle state = ([], nullbytes, state)
	    fun esc #"\\" = "\\\\"
	      | esc #"\"" = "\\\""
	      | esc #"\255" = "\\\255"	(* need to escape backref char *)
	      | esc c = String.str c
	in
	    (String.translate esc s ^ "\"") $ [dummy_pickle]
	end
    end

    local
	val B = ~9
    in
	fun w_bool true = % B "t"
	  | w_bool false = % B "f"
    end

    local
	fun pr2s pr = let
	    fun flat (STRING s, l) = s :: l
	      | flat (CONCAT (x, STRING s), l) = flat (x, s :: l)
	      | flat (CONCAT (x, CONCAT (y, z)), l) =
		flat (CONCAT (CONCAT (x, y), z), l)
	in
	     concat (flat (pr, []))
	end
    in
	fun pickle emptyMap p = let
	    val (_, pr, _) = p (HCM.empty, PM.empty, emptyMap, 0)
	in
	     pr2s pr
	end
    end

    type ('b_ahm, 'a_ahm) map_lifter =
        { extract: 'a_ahm -> 'b_ahm, patchback: 'a_ahm * 'b_ahm -> 'a_ahm }

    fun lift_pickler { extract, patchback } wb b (hcm, fwdm, a_ahm, next) = let
	val b_ahm = extract a_ahm
	val (codes, pr, (hcm', fwdm', b_ahm', next')) =
	    wb b (hcm, fwdm, b_ahm, next)
	val a_ahm' = patchback (a_ahm, b_ahm')
    in
	(codes, pr, (hcm', fwdm', a_ahm', next'))
    end

    (* for export *)
    nonfix $
    val $ = dollar
end

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0