Home My Page Projects Code Snippets Project Openings diderot
Summary Activity Tracker Tasks SCM

SCM Repository

[diderot] View of /branches/vis12-cl/src/compiler/cl-target/gen-output.sml
ViewVC logotype

View of /branches/vis12-cl/src/compiler/cl-target/gen-output.sml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3095 - (download) (annotate)
Wed Mar 18 13:12:57 2015 UTC (4 years, 4 months ago) by jhr
File size: 7437 byte(s)
  working on reorg of names
(* gen-output.sml
 *
 * COPYRIGHT (c) 2014 The Diderot Project (http://diderot-language.cs.uchicago.edu)
 * All rights reserved.
 *
 * Generate strand output functions for the OpenCL target.  The output formats always have
 * a single axis for the data elements followed by one, or more, axes for the output structure.
 * There are two cases that we handle:
 *
 *	grid, fixed-size elements:
 *		nrrd has object axis followed by grid axes
 *
 *	collection, fixed-size elements
 *		nrrd has object axis followed by a single axis
 *
 * NOTE: the C target also supports dynamic-sized elements (i.e., dynamic sequences), but the
 * OpenCL target does not support these yet.
 *
 * The object axis kind depends on the output type, but it will either be one of the tensor types
 * that Teem knows about or else nrrdKindList.  In any case, the data elements are written as a
 * flat vector following the in-memory layout.  The other axes in the file will have nrrdKindSpace
 * as their kind.
 *
 * TODO: some of this code is common with c-target/gen-output.sml (e.g., writing outputs to
 * files), so we should refactor it.
 *
 * TODO: for sequences of tensors (e.g., tensor[3]{2}), we should use a separate axis for the
 * sequence dimension with kind nrrdKindList.
 *)

structure GenOutput : sig

  (* gen (props, nAxes) outputs
   *	returns code for getting the output/snapshot nrrds from the program state.
   *    The arguments are:
   *	    props	- the target information
   *	    nAxes	- the number of axes in the grid of strands (NONE for a collection)
   *	    outputs	- the list of output state variables paired with their TreeIL types
   *    The return value is the list of function declarations that implement the
   *    public output and snapshot queries.  The OpenCL kernels that copy the output
   *    variables into the output buffers are generated separately by genKernels.
   *)
    val gen : Properties.props * int option -> (TreeIL.Ty.ty * string) list -> CLang.decl list

    val genKernels : Properties.props * CLang.ty * int option
	  -> (TreeIL.Ty.ty * string) list
	    -> (string * CLang.decl) list

  end = struct

    structure IL = TreeIL
    structure V = IL.Var
    structure Ty = IL.Ty
    structure CL = CLang
    structure Nrrd = NrrdEnums
    structure U = CLUtil
    structure CN = CNames

    (* mapi f xs -- map f over xs, passing each element together with its
     * zero-based position; f is applied to the elements in left-to-right order.
     *)
    fun mapi f xs = let
	  fun step (x, (i, acc)) = (i+1, f(i, x) :: acc)
	  val (_, revResult) = List.foldl step (0, []) xs
	  in
	    List.rev revResult
	  end

    val sizeTy = CL.T_Named "size_t"
    val nrrdPtrTy = CL.T_Ptr(CL.T_Named "Nrrd")

  (* mkInt i -- wrapper around CL.mkInt that accepts an SML int *)
    val mkInt = CL.mkInt o IntInf.fromInt

  (* expressions for variables that are referenced by name in the generated code *)
    val wrldV = CL.mkVar "wrld"
    val sizesV = CL.mkVar "sizes"
    val nDataV = CL.mkVar "nData"

  (* sizes i -- the expression that subscripts the "sizes" variable with i *)
    fun sizes i = CL.mkSubscript(sizesV, mkInt i)

  (* setSizes (i, v) -- the statement that assigns v to element i of "sizes" *)
    fun setSizes (i, v) = CL.mkAssign(sizes i, v)

  (* mkCopyKernel tgt strandTy (ty, name)
   *	creates the kernel that copies the output state variable "name" (of TreeIL
   *	type ty) into an output buffer.  The result is the pair (kName, kern), where
   *	kName is name ^ "Kern" and kern is the kernel declaration.
   *
   * The kernel takes the parameters "sched", "outBuf", and "nStrands".  Its body has
   * the form
   *
   *	idx = get_group_id(0) * BLK_SZ + get_local_id(0)
   *	offset = get_num_groups(0) * BLK_SZ
   *	state = sched->state
   *	while (idx < nStrands)
   *	    dst = outBuf + nElems * idx
   *	    copy state[idx].name to dst (code from CLTyTranslate.copyToOutput)
   *	    idx += offset
   *
   * where nElems and the type that dst points to come from CLTyTranslate.toOutputType.
   *
   * NOTES: if the output is a grid, then we want to use the grid indices as a guide
   * for processing the output.  Otherwise, the order does not matter, but we do need
   * to worry about synchronizing writes to the output buffer.
   *)
    fun mkCopyKernel tgt strandTy (ty : TreeIL.Ty.ty, name) = let
	  val (outTy, nElems) = CLTyTranslate.toOutputType ty
	(* expressions that occur more than once in the kernel body *)
	  val idxV = CL.mkVar "idx"
	  val blkSzV = CL.mkVar "BLK_SZ"
	(* apply one of the OpenCL work-item query functions to dimension 0 *)
	  fun queryDim0 f = CL.mkApply(f, [CL.mkInt 0])
	(* declarations at the start of the kernel body *)
	  val declIdx = CL.mkDeclInit(CL.uint32, "idx",
		CL.mkBinOp(
		  CL.mkBinOp(queryDim0 "get_group_id", CL.#*, blkSzV),
		  CL.#+,
		  queryDim0 "get_local_id"))
	  val declOffset = CL.mkDeclInit(CL.uint32, "offset",
		CL.mkBinOp(queryDim0 "get_num_groups", CL.#*, blkSzV))
	  val declState = CL.S_Decl(["__global"], strandTy, "state",
		SOME(CL.I_Exp(CL.mkIndirect(CL.mkVar "sched", "state"))))
	(* the statements that make up one iteration of the copy loop *)
	  val declDst = CL.S_Decl(["__global"], CL.T_Ptr outTy, "dst",
		SOME(CL.I_Exp(CL.mkBinOp(CL.mkVar "outBuf", CL.#+,
		  CL.mkBinOp(CL.mkInt(IntInf.fromInt nElems), CL.#*, idxV)))))
	  val copyStms = CLTyTranslate.copyToOutput{
		  ty = ty,
		  dst = CL.mkVar "dst",
		  src = CL.mkSelect(CL.mkSubscript(CL.mkVar "state", idxV), name)
		}
	  val advanceIdx = CL.mkAssign' (idxV, CL.+=, CL.mkVar "offset")
	  val copyLoop = CL.mkWhile (
		CL.mkBinOp(idxV, CL.#<, CL.mkVar "nStrands"),
		CL.mkBlock(declDst :: copyStms @ [advanceIdx]))
	(* assemble the kernel *)
	  val kName = name ^ "Kern"
	  val params = [
		  U.globalParam(CN.schedPtrTy tgt, "sched"),
		  U.globalParam(CL.T_Ptr outTy, "outBuf"),
		  U.clParam(CL.uint32, "nStrands")
		]
	  val body = CL.mkBlock[declIdx, declOffset, declState, copyLoop]
	  in
	    (kName, U.mkKernel(kName, params, body))
	  end

  (* genFixedOutput (tgt, snapshot, nAxes, ty, name)
   *	creates the extra parameters and the body of a get function for the output
   *	variable "name", whose elements have a fixed size.  The result is the pair
   *	(params, body), where params is the single "nData" nrrd-pointer parameter.
   *
   * The generated body has the structure:
   *
   *	declare the "sizes" array
   *	set sizes[0] to the number of data elements (omitted for scalar outputs)
   *	set the remaining entries from wrld->size, in reverse order
   *	return the result of calling OutputGridFixed on the world, the number of
   *	  axes, the sizes array, the nrrd type, wrld-><name>Kern, and nData
   *
   * When nAxes is NONE, a single domain axis is used.  Fail is raised when the
   * isArray property of tgt is false, since output for collections is unimplemented.
   *
   * NOTE: the snapshot argument is not used yet.
   * FIXME: the code to copy the data from the GPU has not been written.
   *)
    fun genFixedOutput (tgt, snapshot, nAxes, ty, name) = let
	  val (_, nrrdType, axisKind, nElems) = OutputUtil.infoOf (tgt, ty)
	(* number of domain axes (the kind of these axes is not needed here) *)
	  val nDomAxes = Option.getOpt (nAxes, 1)
	(* scalar outputs do not have a data axis *)
	  val hasDataAxis = (axisKind <> Nrrd.KindScalar)
	  val nDataAxes = if hasDataAxis then 1 else 0
	  val sizesDim = nDomAxes + nDataAxes
	(* generate the sizes initialization code *)
	  val declSizes = CL.mkDecl(CL.T_Array(sizeTy, SOME sizesDim), "sizes", NONE)
	  val initDataAxis = if hasDataAxis then [setSizes(0, mkInt nElems)] else []
	  fun initDomAxis i = setSizes(i + nDataAxes,
		CL.mkSubscript(CL.mkIndirect(wrldV, "size"), mkInt(nDomAxes - i - 1)))
	  val initSizes = if #isArray tgt
		then declSizes :: initDataAxis @ List.tabulate (nDomAxes, initDomAxis)
		else raise Fail "output for collection is unimplemented"
	(* the call that produces the function's result *)
	  val outputCall = CL.mkApply("OutputGridFixed", [
		  wrldV, mkInt sizesDim, CL.mkVar "sizes",
		  CL.mkVar(Nrrd.tyToEnum nrrdType),
		  CL.mkIndirect(wrldV, name ^ "Kern"), CL.mkVar "nData"
		])
	(* the function body *)
	  val stms = CL.mkComment["Compute sizes of nrrd file"]
		:: initSizes @ [CL.mkReturn(SOME outputCall)]
	  in
	    ([CL.PARAM([], nrrdPtrTy, "nData")], CL.mkBlock stms)
	  end

    (* gen (tgt, nAxes) outputs
     *	returns the function declarations that implement the output queries for the
     *	given output variables.  One output get function is generated per variable.
     *	If the exec property of tgt is set, the declarations from OutputUtil.genOutput
     *	are appended; otherwise, if the snapshot property is set, the snapshot get
     *	functions are placed in front of the output get functions.
     *
     * Raises Fail if any output variable has a dynamic-sequence type.
     *)
    fun gen (tgt : Properties.props, nAxes) outputs = let
	(* getFn snapshot (ty, name) -- the get function for one output variable;
	 * the snapshot flag selects between the snapshot and output function names.
	 *)
	  fun getFn snapshot (ty, name) = let
		val mkName = if snapshot then CN.snapshotGet else CN.outputGet
		val funcName = mkName (tgt, name)
		val (params, body) = (case ty
		       of Ty.DynSeqTy _ => raise Fail "dynamic sequences not supported for OpenCL"
			| _ => genFixedOutput(tgt, snapshot, nAxes, ty, name)
		      (* end case *))
		val wrldParam = CL.PARAM([], CN.worldPtrTy tgt, "wrld")
		in
		  CL.D_Func([], CL.boolTy, funcName, wrldParam :: params, body)
		end
	  val getFns = List.map (getFn false) outputs
	  in
	    if (#exec tgt)
	      then getFns @ OutputUtil.genOutput(tgt, outputs)
	    else if (#snapshot tgt)
	      then List.map (getFn true) outputs @ getFns
	      else getFns
	  end

(* genKernels (tgt, strandTy, nAxes) outputs
 *	returns one (kernel name, kernel declaration) pair per output variable, as
 *	built by mkCopyKernel.  The nAxes argument is not used yet.
 *
 * TODO: we should provide a command-line option to batch output; i.e., to deal with
 * all output variables in one kernel call.
 *)
    fun genKernels (tgt : Properties.props, strandTy, nAxes) outputs =
	  List.map (fn output => mkCopyKernel tgt strandTy output) outputs
		      
  end

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0