(* cl-target.sml * * COPYRIGHT (c) 2011 The Diderot Project (http://diderot-language.cs.uchicago.edu) * All rights reserved. *) structure CLTarget : TARGET = struct structure IL = TreeIL structure V = IL.Var structure Ty = IL.Ty structure CL = CLang structure RN = RuntimeNames structure ToCL = TreeToCL structure N = CNames (* revmap f l == List.rev(List.map f l) *) fun revmap f = let fun rmap ([], l) = l | rmap (x::r, l) = rmap (r, f x :: l) in fn l => rmap (l, []) end (* common arithmetic *) fun #+# (a, b) = CL.mkBinOp(a, CL.#+, b) fun #*# (a, b) = CL.mkBinOp(a, CL.#*, b) infix 5 #+# infix 6 #*# (* translate TreeIL types to shadow types *) fun shadowTy ty = (case ty of Ty.BoolTy => CL.T_Named "cl_bool" | Ty.StringTy => raise Fail "unexpected string type" | Ty.IVecTy 1 => CL.T_Named(RN.shadowIntTy ()) | Ty.IVecTy n => raise Fail "unexpected int vector type" | Ty.TensorTy[] => CL.T_Named(RN.shadowRealTy ()) | Ty.TensorTy[n] => CL.T_Named(RN.shadowVecTy n) | Ty.TensorTy[n, m] => CL.T_Named(RN.shadowMatTy(n,m)) | Ty.ImageTy(ImageInfo.ImgInfo{dim, ...}) => CL.T_Named(RN.shadowImageTy dim) | _ => raise Fail(concat["TreeToC.trType(", Ty.toString ty, ")"]) (* end case *)) (* FIXME: add comments that more clearly explain the difference between convertToShadow and * convertStrandToShadow *) (* translate TreeIL types to shadow types *) fun convertToShadow (ty, name) = (case ty of Ty.IVecTy 1 => CL.mkAssign( CL.mkSelect(CL.mkVar RN.shadowGlaobalsName, name), CL.mkIndirect(CL.mkVar RN.globalsVarName, name)) | Ty.TensorTy[n]=> CL.mkCall(RN.convertToShadowVec n, [ CL.mkUnOp(CL.%&, CL.mkSelect(CL.mkVar RN.shadowGlaobalsName, name)), CL.mkIndirect(CL.mkVar RN.globalsVarName, name) ]) | Ty.ImageTy(ImageInfo.ImgInfo{dim, ...}) => CL.mkCall(RN.shadowImageFunc dim, [ CL.mkVar "context", CL.mkUnOp(CL.%&, CL.mkSelect(CL.mkVar RN.shadowGlaobalsName, name)), CL.mkIndirect(CL.mkVar RN.globalsVarName, name) ]) | Ty.TensorTy[n, m] => CL.mkCall(RN.convertToShadowMat(m,n), [ CL.mkSelect(CL.mkVar RN.shadowGlaobalsName, name), CL.mkIndirect(CL.mkVar RN.globalsVarName, name) ]) | _ => CL.mkAssign( CL.mkSelect(CL.mkVar RN.shadowGlaobalsName,name), CL.mkIndirect(CL.mkVar RN.globalsVarName, name)) (* end case *)) (* generate code to convert strand TreeIL types to shadow types *) fun convertStrandToShadow (ty, name, selfIn, selfOut) = (case ty of Ty.IVecTy 1 => CL.mkAssign( CL.mkIndirect(CL.mkVar selfIn, name), CL.mkIndirect(CL.mkVar selfOut, name)) | Ty.TensorTy[n]=> CL.mkCall(RN.convertToShadowVec n, [ CL.mkUnOp(CL.%&, CL.mkIndirect(CL.mkVar selfIn, name)), CL.mkIndirect(CL.mkVar selfOut, name) ]) | Ty.TensorTy[n, m] => CL.mkCall(RN.convertToShadowMat(m,n), [ CL.mkUnOp(CL.%&, CL.mkIndirect(CL.mkVar selfIn, name)), CL.mkIndirect(CL.mkVar selfOut, name) ]) | _ => CL.mkAssign( CL.mkIndirect(CL.mkVar selfIn, name), CL.mkIndirect(CL.mkVar selfOut, name)) (* end case *)) (* helper functions for specifying parameters in various address spaces *) fun clParam (spc, ty, x) = CL.PARAM([spc], ty, x) fun globalParam (ty, x) = CL.PARAM(["__global"], ty, x) fun constantParam (ty, x) = CL.PARAM(["__constant"], ty, x) fun localParam (ty, x) = CL.PARAM(["__local"], ty, x) fun privateParam (ty, x) = CL.PARAM(["__private"], ty, x) (* OpenCL global pointer type *) fun globalPtr ty = CL.T_Qual("__global", CL.T_Ptr ty) (* C variable translation *) structure TrCVar = struct type env = CL.typed_var TreeIL.Var.Map.map fun lookup (env, x) = (case V.Map.find (env, x) of SOME(CL.V(_, x')) => x' | NONE => raise Fail(concat["TrCVar.lookup(_, ", V.name x, ")"]) (* end case *)) (* translate a variable that occurs in an l-value context (i.e., as the target of an assignment) *) fun lvalueVar (env, x) = (case V.kind x of IL.VK_Global => CL.mkIndirect(CL.mkVar RN.globalsVarName, lookup(env, x)) | IL.VK_State strand => CL.mkIndirect(CL.mkVar "selfOut", lookup(env, x)) | IL.VK_Local => CL.mkVar(lookup(env, x)) (* end case *)) (* translate a variable that occurs in an r-value context *) fun rvalueVar (env, x) = (case V.kind x of IL.VK_Global => CL.mkIndirect(CL.mkVar RN.globalsVarName, lookup(env, x)) | IL.VK_State strand => CL.mkIndirect(CL.mkVar "selfIn", lookup(env, x)) | IL.VK_Local => CL.mkVar(lookup(env, x)) (* end case *)) end structure ToC = TreeToCFn (TrCVar) type var = CL.typed_var type exp = CL.exp type stm = CL.stm (* OpenCL specific types *) val clIntTy = CL.T_Named "cl_int" val clProgramTy = CL.T_Named "cl_program" val clKernelTy = CL.T_Named "cl_kernel" val clCmdQueueTy = CL.T_Named "cl_command_queue" val clContextTy = CL.T_Named "cl_context" val clDeviceIdTy = CL.T_Named "cl_device_id" val clPlatformIdTy = CL.T_Named "cl_platform_id" val clMemoryTy = CL.T_Named "cl_mem" val globPtrTy = CL.T_Ptr(CL.T_Named RN.globalsTy) val strandShadowEnv = 1 val globalShadowEnv = 2 (* variable or field that is mirrored between host and GPU *) type mirror_var = { (* FIXME: perhaps it would be cleaner to just track the TreeIL type of the variable? *) hostTy : CL.ty, (* variable type on Host (i.e., C type) *) shadowTy : CL.ty, (* host-side shadow type of GPU type *) gpuTy : CL.ty, (* variable's type on GPU (i.e., OpenCL type) *) hToS: stm, (* the statement that converts the variable to its *) (* shadow representation *) var : CL.var (* variable name *) } datatype strand = Strand of { name : string, tyName : string, state : mirror_var list ref, output : (Ty.ty * CL.var) option ref, (* the strand's output variable (only one for now) *) code : CL.decl list ref, init_code: CL.decl ref } datatype program = Prog of { name : string, (* stem of source file *) double : bool, (* true for double-precision support *) parallel : bool, (* true for multithreaded (or multi-GPU) target *) debug : bool, (* true for debug support in executable *) globals : mirror_var list ref, topDecls : CL.decl list ref, strands : strand AtomTable.hash_table, initially : CL.decl ref, numDims: int ref, (* number of dimensions in initially iteration *) imgGlobals: (string * int) list ref, prFn: CL.decl ref } datatype env = ENV of { info : env_info, vMap : var V.Map.map, scope : scope } and env_info = INFO of { prog : program } and scope = NoScope | GlobalScope | InitiallyScope | StrandScope of TreeIL.var list (* strand initialization *) | MethodScope of MethodName.name * TreeIL.var list (* method body; vars are state variables *) (* the supprted widths of vectors of reals on the target. *) (* FIXME: for OpenCL 1.1, 3 is also valid *) fun vectorWidths () = [2, 4, 8, 16] (* tests for whether various expression forms can appear inline *) fun inlineCons n = (n < 2) (* vectors are inline, but not matrices *) val inlineMatrixExp = false (* can matrix-valued expressions appear inline? *) (* TreeIL to target translations *) structure Tr = struct fun fragment (ENV{info, vMap, scope}, blk) = let val (vMap, stms) = (case scope of GlobalScope => ToC.trFragment (vMap, blk) (* NOTE: if we move strand initialization to the GPU, then we'll have to change the following code! *) | InitiallyScope => ToC.trFragment (vMap, blk) | _ => ToCL.trFragment (vMap, blk) (* end case *)) in (ENV{info=info, vMap=vMap, scope=scope}, stms) end fun block (ENV{vMap, scope, ...}, blk) = let fun saveState cxt stateVars trAssign (env, args, stm) = ( ListPair.foldrEq (fn (x, e, stms) => trAssign(env, x, e)@stms) [stm] (stateVars, args) ) handle ListPair.UnequalLengths => ( print(concat["saveState ", cxt, ": length mismatch; ", Int.toString(List.length args), " args\n"]); raise Fail(concat["saveState ", cxt, ": length mismatch"])) in case scope (* NOTE: if we move strand initialization to the GPU, then we'll have to change the following code! *) of StrandScope stateVars => ToC.trBlock (vMap, saveState "StrandScope" stateVars ToC.trAssign, blk) | MethodScope(name, stateVars) => ToCL.trBlock (vMap, saveState "MethodScope" stateVars ToCL.trAssign, blk) | InitiallyScope => ToCL.trBlock (vMap, fn (_, _, stm) => [stm], blk) | _ => ToC.trBlock (vMap, fn (_, _, stm) => [stm], blk) (* end case *) end fun exp (ENV{vMap, ...}, e) = ToCL.trExp(vMap, e) end (* variables *) structure Var = struct fun mirror (ty, name, shadowEnv ) = { hostTy = ToC.trType ty, shadowTy = shadowTy ty, gpuTy = ToCL.trType ty, hToS = if globalShadowEnv = shadowEnv then convertToShadow (ty, name) else convertStrandToShadow(ty, name, "selfIn", "selfOut"), var = name } fun name (ToCL.V(_, name)) = name fun global (Prog{globals, imgGlobals, ...}, name, ty) = let val x = mirror (ty, name, globalShadowEnv) fun isImgGlobal (Ty.ImageTy(ImageInfo.ImgInfo{dim, ...}), name) = imgGlobals := (name,dim) :: !imgGlobals | isImgGlobal _ = () in globals := x :: !globals; isImgGlobal (ty, name); ToCL.V(#gpuTy x, name) end fun param x = ToCL.V(ToCL.trType(V.ty x), V.name x) fun state (Strand{state, ...}, x) = let val ty = V.ty x val x' = mirror (ty, V.name x, strandShadowEnv) in state := x' :: !state; ToCL.V(#gpuTy x', #var x') end end (* environments *) structure Env = struct (* create a new environment *) fun new prog = ENV{ info=INFO{prog = prog}, vMap = V.Map.empty, scope = NoScope } (* define the current translation context *) fun setScope scope (ENV{info, vMap, ...}) = ENV{info=info, vMap=vMap, scope=scope} val scopeGlobal = setScope GlobalScope val scopeInitially = setScope InitiallyScope fun scopeStrand (env, svars) = setScope (StrandScope svars) env fun scopeMethod (env, name, svars) = setScope (MethodScope(name, svars)) env (* bind a TreeIL varaiable to a target variable *) fun bind (ENV{info, vMap, scope}, x, x') = ENV{ info = info, vMap = V.Map.insert(vMap, x, x'), scope = scope } end (* programs *) structure Program = struct fun new {name, double, parallel, debug} = ( RN.initTargetSpec double; CNames.initTargetSpec double; Prog{ name = name, double = double, parallel = parallel, debug = debug, globals = ref [], topDecls = ref [], strands = AtomTable.mkTable (16, Fail "strand table"), initially = ref(CL.D_Comment["missing initially"]), numDims = ref 0, imgGlobals = ref[], prFn = ref(CL.D_Comment(["No Print Function"])) }) (* register the code that is used to register command-line options for input variables *) fun inputs (Prog{topDecls, ...}, stm) = let val inputsFn = CL.D_Func( [], CL.voidTy, RN.registerOpts, [CL.PARAM([], CL.T_Ptr(CL.T_Named RN.optionsTy), "opts")], stm) in topDecls := inputsFn :: !topDecls end (* register the global initialization part of a program *) fun init (Prog{topDecls, ...}, init) = let val globalsDecl = CL.mkAssign(CL.mkVar RN.globalsVarName, CL.mkApply("malloc", [CL.mkSizeof(CL.T_Named RN.globalsTy)])) val initFn = CL.D_Func( [], CL.voidTy, RN.initGlobals, [], CL.mkBlock[ globalsDecl, CL.mkCall(RN.initGlobalsHelper, [CL.mkVar RN.globalsVarName]) ]) val initHelperFn = CL.D_Func( [], CL.voidTy, RN.initGlobalsHelper, [CL.PARAM([], globPtrTy, RN.globalsVarName)], init) val shutdownFn = CL.D_Func( [], CL.voidTy, RN.shutdown, [CL.PARAM([], CL.T_Ptr(CL.T_Named RN.worldTy), "wrld")], CL.S_Block[]) in topDecls := shutdownFn :: initFn :: initHelperFn :: !topDecls end (* create and register the initially function for a program *) fun initially { prog = Prog{name=progName, strands, initially,numDims, ...}, isArray : bool, iterPrefix : stm list, iters : (var * exp * exp) list, createPrefix : stm list, strand : Atom.atom, args : exp list } = let val name = Atom.toString strand val nDims = List.length iters val worldTy = CL.T_Ptr(CL.T_Named N.worldTy) fun mapi f xs = let fun mapf (_, []) = [] | mapf (i, x::xs) = f(i, x) :: mapf(i+1, xs) in mapf (0, xs) end val baseInit = mapi (fn (i, (_, e, _)) => (i, CL.I_Exp e)) iters val sizeInit = mapi (fn (i, (CL.V(ty, _), lo, hi)) => (i, CL.I_Exp(CL.mkBinOp(hi, CL.#-, lo) #+# CL.mkIntTy(1, ty))) ) iters (* code to allocate the world and initial strands *) val wrld = "wrld" val allocCode = [ CL.mkComment["allocate initial block of strands"], CL.mkDecl(CL.T_Array(CL.int32, SOME nDims), "base", SOME(CL.I_Array baseInit)), CL.mkDecl(CL.T_Array(CL.uint32, SOME nDims), "size", SOME(CL.I_Array sizeInit)), CL.mkDecl(worldTy, wrld, SOME(CL.I_Exp(CL.mkApply(N.allocInitially, [ CL.mkVar "ProgramName", CL.mkUnOp(CL.%&, CL.mkVar(N.strandDesc name)), CL.mkBool isArray, CL.mkIntTy(IntInf.fromInt nDims, CL.int32), CL.mkVar "base", CL.mkVar "size" ])))) ] (* create the loop nest for the initially iterations *) val indexVar = "ix" val strandTy = CL.T_Ptr(CL.T_Named(N.strandTy name)) fun mkLoopNest [] = CL.mkBlock(createPrefix @ [ CL.mkDecl(strandTy, "sp", SOME(CL.I_Exp( CL.mkCast(strandTy, CL.mkApply(N.inState, [ CL.mkVar "wrld", CL.mkVar indexVar #*# CL.mkSizeof(CL.T_Named (N.strandTy name)) ]))))), CL.mkCall(N.strandInit name, CL.mkVar "sp" :: args), CL.mkAssign(CL.mkVar indexVar, CL.mkVar indexVar #+# CL.mkIntTy(1, CL.uint32)) ]) | mkLoopNest ((CL.V(ty, param), lo, hi)::iters) = let val body = mkLoopNest iters in CL.mkFor( [(ty, param, lo)], CL.mkBinOp(CL.mkVar param, CL.#<=, hi), [CL.mkPostOp(CL.mkVar param, CL.^++)], body) end val iterCode = [ CL.mkComment["initially"], CL.mkDecl(CL.uint32, indexVar, SOME(CL.I_Exp(CL.mkIntTy(0, CL.uint32)))), mkLoopNest iters ] val body = CL.mkBlock( iterPrefix @ allocCode @ iterCode @ [CL.mkReturn(SOME(CL.mkVar "wrld"))]) val initFn = CL.D_Func([], worldTy, N.initially, [], body) in numDims := nDims; initially := initFn end (***** OUTPUT *****) (* FIXME: I think that the iteration and test for stable strands can be moved into the runtime, which * will make the print function compatible with the C target version. *) fun genStrandPrint (Strand{name, tyName, state, output, code, ...}) = let (* the print function *) val prFnName = concat[name, "Print"] val prFn = let val params = [ CL.PARAM([], CL.T_Ptr(CL.T_Named "FILE"), "outS"), CL.PARAM([], CL.T_Ptr(CL.T_Named(RN.strandShadowTy tyName)), "self") ] val SOME(ty, x) = !output val outState = CL.mkIndirect(CL.mkVar "self", x) val prArgs = (case ty of Ty.IVecTy 1 => [CL.mkStr(!N.gIntFormat ^ "\n"), outState] | Ty.IVecTy d => let fun sel i = CL.mkApply( "VSUB", [outState, CL.mkInt(IntInf.fromInt i)]) val fmt = CL.mkStr( String.concatWith " " (List.tabulate(d, fn _ => !N.gIntFormat)) ^ "\n") val args = List.tabulate (d, sel) in fmt :: args end | Ty.TensorTy[] => [CL.mkStr "%f\n", outState] | Ty.TensorTy[d] => let fun sel i = CL.mkApply( "VSUB", [outState, CL.mkInt(IntInf.fromInt i)]) val fmt = CL.mkStr( String.concatWith " " (List.tabulate(d, fn _ => "%f")) ^ "\n") val args = List.tabulate (d, sel) in fmt :: args end | _ => raise Fail("genStrand: unsupported output type " ^ Ty.toString ty) (* end case *)) in CL.D_Func(["static"], CL.voidTy, prFnName, params, CL.mkCall("fprintf", CL.mkVar "outS" :: prArgs)) end in prFn end fun genStrandTyDef (targetTy, Strand{state,...},tyName) = (* the type declaration for the strand's state struct *) CL.D_StructDef( revmap (fn x => (targetTy x, #var x)) (!state), tyName) (* generates the globals buffers and arguments function *) fun genConvertShadowTypes (Strand{name, tyName, state,...}) = let (* Delcare opencl setup objects *) val errVar = "err" val params = [ CL.PARAM([], CL.T_Ptr(CL.T_Named tyName), "selfOut"), CL.PARAM([], CL.T_Ptr(CL.T_Named(RN.strandShadowTy tyName)), "selfIn") ] val body = List.map (fn (x:mirror_var) => #hToS x) (!state) in CL.D_Func([], CL.voidTy, RN.strandConvertName name, params, CL.mkBlock body) end (* generates the opencl buffers for the image data *) fun getGlobalDataBuffers (globals, imgGlobals, contextVar, errVar) = let val globalBuffErr = "error creating OpenCL global buffer\n" fun errorFn msg = CL.mkIfThen(CL.mkBinOp(CL.mkVar errVar, CL.#!=, CL.mkVar "CL_SUCCESS"), CL.mkBlock([CL.mkCall("fprintf",[CL.mkVar "stderr", CL.mkStr msg]), CL.mkCall("exit",[CL.mkInt 1])])) val shadowTypeDecl = CL.mkDecl(CL.T_Named(RN.shadowGlobalsTy), RN.shadowGlaobalsName, NONE) val globalToShadowStms = List.map (fn (x:mirror_var) => #hToS x ) globals val globalBufferDecl = CL.mkDecl(clMemoryTy,concat[RN.globalsVarName,"_cl"],NONE) val globalBuffer = CL.mkAssign(CL.mkVar(concat[RN.globalsVarName,"_cl"]), CL.mkApply("clCreateBuffer", [ CL.mkVar contextVar, CL.mkBinOp(CL.mkVar "CL_MEM_READ_ONLY", CL.#|, CL.mkVar "CL_MEM_COPY_HOST_PTR"), CL.mkSizeof(CL.T_Named RN.shadowGlobalsTy), CL.mkUnOp(CL.%&,CL.mkVar RN.shadowGlaobalsName), CL.mkUnOp(CL.%&,CL.mkVar errVar) ])) fun genDataBuffers ([],_,_,_) = [] | genDataBuffers ((var,nDims)::globals, contextVar, errVar,errFn) = let val hostVar = CL.mkIndirect(CL.mkVar RN.globalsVarName, var) val size = CL.mkIndirect(hostVar, "dataSzb") in CL.mkDecl(clMemoryTy, RN.addBufferSuffixData var ,NONE) :: CL.mkAssign(CL.mkVar(RN.addBufferSuffixData var), CL.mkApply("clCreateBuffer", [ CL.mkVar contextVar, CL.mkVar "CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR", size, CL.mkIndirect(hostVar, "data"), CL.mkUnOp(CL.%&,CL.mkVar errVar) ])) :: errFn(concat["error in creating ",RN.addBufferSuffixData var, " global buffer\n"]) :: genDataBuffers(globals,contextVar,errVar,errFn) end in [shadowTypeDecl] @ globalToShadowStms @ [globalBufferDecl, globalBuffer,errorFn(globalBuffErr)] @ genDataBuffers(imgGlobals,contextVar,errVar,errorFn) end (* generates the kernel arguments for the image data *) fun genGlobalArguments (globals, count, kernelVar, errVar) = let val globalArgErr = "error creating OpenCL global argument\n" fun errorFn msg = CL.mkIfThen(CL.mkBinOp(CL.mkVar errVar, CL.#!=, CL.mkVar "CL_SUCCESS"), CL.mkBlock([CL.mkCall("fprintf",[CL.mkVar "stderr", CL.mkStr msg]), CL.mkCall("exit",[CL.mkInt 1])])) val globalArgument = CL.mkExpStm(CL.mkAssignOp(CL.mkVar errVar,CL.&=, CL.mkApply("clSetKernelArg", [CL.mkVar kernelVar, CL.mkPostOp(CL.mkVar count, CL.^++), CL.mkApply("sizeof",[CL.mkVar "cl_mem"]), CL.mkUnOp(CL.%&,CL.mkVar(concat[RN.globalsVarName,"_cl"]))]))) fun genDataArguments ([],_,_,_,_) = [] | genDataArguments ((var,nDims)::globals,count,kernelVar,errVar,errFn) = CL.mkExpStm(CL.mkAssignOp(CL.mkVar errVar,CL.$=, CL.mkApply("clSetKernelArg", [CL.mkVar kernelVar, CL.mkPostOp(CL.mkVar count, CL.^++), CL.mkApply("sizeof",[CL.mkVar "cl_mem"]), CL.mkUnOp(CL.%&,CL.mkVar(RN.addBufferSuffixData var))]))) :: errFn(concat["error in creating ",RN.addBufferSuffixData var, " argument\n"]) :: genDataArguments (globals,count,kernelVar,errVar,errFn) in globalArgument :: errorFn globalArgErr :: genDataArguments(globals, count, kernelVar, errVar,errorFn) end (* generates the globals buffers and arguments function *) fun genGlobalBuffersArgs (globals,imgGlobals) = let (* Delcare opencl setup objects *) val errVar = "err" val params = [ CL.PARAM([],CL.T_Named("cl_context"), "context"), CL.PARAM([],CL.T_Named("cl_kernel"), "kernel"), CL.PARAM([],CL.T_Named("cl_command_queue"), "cmdQ"), CL.PARAM([],CL.T_Named("int"), "argStart") ] val clGlobalBuffers = getGlobalDataBuffers(globals,!imgGlobals, "context", errVar) val clGlobalArguments = genGlobalArguments(!imgGlobals, "argStart", "kernel", errVar) (* Body put all the statments together *) val body = CL.mkDecl(clIntTy, errVar, SOME(CL.I_Exp(CL.mkInt 0))) :: clGlobalBuffers @ clGlobalArguments in CL.D_Func([],CL.voidTy,RN.globalsSetupName,params,CL.mkBlock(body)) end (* generate the data and global parameters *) fun genKeneralGlobalParams ((name,tyname)::rest) = globalParam (CL.T_Ptr(CL.voidTy), RN.addBufferSuffixData name) :: genKeneralGlobalParams rest | genKeneralGlobalParams [] = [] (* generate the main kernel function for the .cl file *) fun genKernelFun (strand, nDims, globals, imgGlobals) = let val Strand{name, tyName, state, output, code,...} = strand val fName = RN.kernelFuncName; val inState = "selfIn" val outState = "selfOut" val tempVar = "tmp" (* Each work-item is assigned a certain amount strands per iteration (determined by the variable limit). Each workgroup gets assigned a contiguous strand section, the starting strand index is placed in the workgroup's index within the WorkQueue data structure. By using the work-item's local id, it first determines its workgroup's starting strand (within the WorkQueue data structure) and then uses its local-id index to determine which strand it should work on for its group. *) val workerOffset = CL.mkApply(RN.getGroupId, [CL.mkInt 0]) val localOffset = CL.mkApply(RN.getLocalThreadId, [CL.mkInt 0]) val params = [ globalParam(CL.T_Ptr(CL.T_Named tyName), "strands"), globalParam(CL.T_Ptr(CL.intTy), "strandStatus"), globalParam(CL.T_Ptr(CL.intTy), "workQueue"), globalParam(CL.T_Ptr(CL.intTy),"numAvail"), clParam("",CL.intTy,"numStrands"), clParam("",CL.intTy,"limit") ] @ [globalParam(globPtrTy, RN.globalsVarName)] @ genKeneralGlobalParams(!imgGlobals) val index_ids = [ CL.mkDecl(CL.intTy, "workerIndex", SOME(CL.I_Exp(workerOffset))), CL.mkDecl(CL.intTy, "strandIndex", SOME(CL.I_Exp( CL.mkSubscript(CL.mkVar "workQueue",CL.mkVar "workerIndex") #+# localOffset #*# CL.mkVar "limit"))) ] val strandInit = [ CL.mkAssign(CL.mkVar "selfIn", CL.mkVar "strands" #+# CL.mkVar "strandIndex"), CL.mkAssign(CL.mkVar "selfOut",CL.mkVar "selfIn") ] val strandDecl = [ CL.mkAttrDecl(["__global"],CL.T_Ptr(CL.T_Named tyName), "selfIn", NONE), CL.mkAttrDecl(["__global"],CL.T_Ptr(CL.T_Named tyName), "selfOut", NONE) ] val imageDataDecl = CL.mkDecl(CL.T_Named(RN.imageDataType),RN.globalImageDataName,NONE) val imageDataStms = List.map (fn (x,_) => CL.mkAssign(CL.mkSelect(CL.mkVar(RN.globalImageDataName),RN.imageDataName x), CL.mkVar(RN.addBufferSuffixData x))) (!imgGlobals) val status = [CL.mkDecl(CL.intTy, "status", NONE)] val updateStm = CL.mkAssign(CL.mkVar "status", CL.mkApply(RN.strandUpdate name, [ CL.mkVar inState, CL.mkVar outState, CL.mkVar RN.globalsVarName, CL.mkVar RN.globalImageDataName ])) val statusIf = CL.mkIfThenElse( CL.mkBinOp(CL.mkBinOp(CL.mkVar "status", CL.#==, CL.mkVar RN.kStabilize), CL.#&&, CL.mkBinOp(CL.mkSubscript(CL.mkVar "strandStatus", CL.mkVar "strandIndex"), CL.#!=, CL.mkVar RN.kStable)), (* then *) CL.mkBlock([ CL.mkAssign(CL.mkSubscript(CL.mkVar "strandStatus",CL.mkVar "strandIndex"),CL.mkVar RN.kStable), CL.mkCall(RN.atom_dec,[CL.mkUnOp(CL.%&,CL.mkSubscript(CL.mkVar "numAvail",CL.mkInt 0))])]), (* else *) CL.mkBlock[ CL.mkIfThen(CL.mkBinOp(CL.mkBinOp(CL.mkVar "status", CL.#==, CL.mkVar RN.kDie), CL.#&&,CL.mkBinOp(CL.mkSubscript(CL.mkVar "strandStatus",CL.mkVar "strandIndex"), CL.#!=, CL.mkVar RN.kDie)), CL.mkBlock([ CL.mkAssign(CL.mkSubscript(CL.mkVar "strandStatus",CL.mkVar "strandIndex"),CL.mkVar RN.kDie), CL.mkCall(RN.atom_dec,[CL.mkUnOp(CL.%&,CL.mkSubscript(CL.mkVar "numAvail",CL.mkInt 0))])]))]) val incStrand = CL.mkExpStm(CL.mkPostOp(CL.mkVar "strandIndex",CL.^++)) val forStablize = CL.mkFor( [(CL.intTy,"idx",CL.mkInt 0)], CL.mkBinOp( CL.mkBinOp(CL.mkVar "idx", CL.#<, CL.mkVar "limit"), CL.#&&, CL.mkBinOp(CL.mkVar "strandIndex", CL.#<, CL.mkVar "numStrands")), [CL.mkPostOp(CL.mkVar "idx", CL.^++)], CL.mkBlock(strandInit @ [updateStm, statusIf] @ [incStrand])) val local_vars = index_ids @ [imageDataDecl] @ imageDataStms @ strandDecl @ status val body = CL.mkBlock(local_vars @ [forStablize]) in CL.D_Func(["__kernel"], CL.voidTy, fName, params, body) end (* generate a global structure type definition from the list of globals *) fun genGlobalStruct (targetTy, globals, tyName) = let val globs = List.map (fn (x : mirror_var) => (targetTy x, #var x)) globals in CL.D_StructDef(globs, tyName) end (* generate a global structure type definition from the image data of the image globals *) fun genImageDataStruct (imgGlobals, tyName) = let val globs = List.map (fn (x, _) => (globalPtr CL.voidTy, RN.imageDataName x)) imgGlobals in CL.D_StructDef(globs, tyName) end fun genGlobals (declFn, targetTy, globals) = let fun doVar (x : mirror_var) = declFn (CL.D_Var([], targetTy x, #var x, NONE)) in List.app doVar globals end fun genStrandDesc (Strand{name, output, ...}) = let (* the strand's descriptor object *) val descI = let fun fnPtr (ty, f) = CL.I_Exp(CL.mkCast(CL.T_Named ty, CL.mkVar f)) val SOME(outTy, _) = !output in CL.I_Struct[ ("name", CL.I_Exp(CL.mkStr name)), ("stateSzb", CL.I_Exp(CL.mkSizeof(CL.T_Named(RN.strandTy name)))), ("shadowStrandSzb", CL.I_Exp(CL.mkSizeof(CL.T_Named(RN.strandShadowTy (RN.strandTy name))))), (* FIXME: we may need to add a shadowOutputSzb field too for OpenCL *) ("outputSzb", CL.I_Exp(CL.mkSizeof(shadowTy outTy))), ("update", fnPtr("update_method_t", "0")), ("strandCopy", fnPtr("convert_method_t", RN.strandConvertName name)), ("print", fnPtr("print_method_t", RN.strandPrintName name)), ("output", fnPtr("output_method_t", "0")) (* FIXME *) ] end val desc = CL.D_Var([], CL.T_Named N.strandDescTy, N.strandDesc name, SOME descI) in desc end (* generate the table of strand descriptors *) fun genStrandTable (declFn, strands) = let val nStrands = length strands fun genInit (Strand{name, ...}) = CL.I_Exp(CL.mkUnOp(CL.%&, CL.mkVar(N.strandDesc name))) fun genInits (_, []) = [] | genInits (i, s::ss) = (i, genInit s) :: genInits(i+1, ss) in declFn (CL.D_Var([], CL.int32, N.numStrands, SOME(CL.I_Exp(CL.mkIntTy(IntInf.fromInt nStrands, CL.int32))))); declFn (CL.D_Var([], CL.T_Array(CL.T_Ptr(CL.T_Named N.strandDescTy), SOME nStrands), N.strands, SOME(CL.I_Array(genInits (0, strands))))) end fun genSrc (baseName, prog) = let val Prog{name,double, globals, topDecls, strands, initially, imgGlobals, numDims, ...} = prog val clFileName = OS.Path.joinBaseExt{base=baseName, ext=SOME "cl"} val cFileName = OS.Path.joinBaseExt{base=baseName, ext=SOME "c"} val clOutS = TextIO.openOut clFileName val cOutS = TextIO.openOut cFileName val clppStrm = PrintAsCL.new clOutS val cppStrm = PrintAsC.new cOutS val progName = name fun cppDecl dcl = PrintAsC.output(cppStrm, dcl) fun clppDecl dcl = PrintAsCL.output(clppStrm, dcl) val strands = AtomTable.listItems strands val [strand as Strand{name, tyName, code, init_code, ...}] = strands in (* Generate the OpenCl file *) clppDecl (CL.D_Verbatim([ (* if the target supports OpenCL 1.1, then 32-bit atomics are part of the core. *) "#if (DIDEROT_CL_VERSION == 100)", "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable", "#endif", if double then "#define DIDEROT_DOUBLE_PRECISION" else "#define DIDEROT_SINGLE_PRECISION", "#define DIDEROT_TARGET_CL", "#include \"Diderot/cl-diderot.h\"" ])); clppDecl (genGlobalStruct (#gpuTy, !globals, RN.globalsTy)); clppDecl (genImageDataStruct(!imgGlobals,RN.imageDataType)); clppDecl (genStrandTyDef(#gpuTy, strand,tyName)); List.app clppDecl (!code); clppDecl (genKernelFun (strand, !numDims, globals, imgGlobals)); (* Generate the Host C file *) cppDecl (CL.D_Verbatim([ if double then "#define DIDEROT_DOUBLE_PRECISION" else "#define DIDEROT_SINGLE_PRECISION", "#define DIDEROT_TARGET_CL", "#include \"Diderot/diderot.h\"" ])); cppDecl (CL.D_Var(["static"], CL.charPtr, "ProgramName", SOME(CL.I_Exp(CL.mkStr progName)))); cppDecl (genGlobalStruct (#hostTy, !globals, RN.globalsTy)); cppDecl (genGlobalStruct (#shadowTy, !globals, RN.shadowGlobalsTy)); (* FIXME: does this really need to be a global? *) cppDecl (CL.D_Var(["static"], globPtrTy, RN.globalsVarName, NONE)); cppDecl (genStrandTyDef (#hostTy, strand, tyName)); cppDecl (genStrandTyDef (#shadowTy, strand, RN.strandShadowTy tyName)); cppDecl (genConvertShadowTypes strand); cppDecl (!init_code); cppDecl (genStrandPrint strand); List.app cppDecl (List.rev (!topDecls)); cppDecl (genGlobalBuffersArgs (!globals,imgGlobals)); List.app (fn strand => cppDecl (genStrandDesc strand)) strands; genStrandTable (cppDecl, strands); cppDecl (!initially); PrintAsC.close cppStrm; PrintAsCL.close clppStrm; TextIO.closeOut cOutS; TextIO.closeOut clOutS end (* output the code to the filesystem. The string is the basename of the source file *) fun generate (basename, prog as Prog{double, parallel, debug, ...}) = let fun condCons (true, x, xs) = x::xs | condCons (false, _, xs) = xs (* generate the C compiler flags *) val cflags = ["-I" ^ Paths.diderotInclude, "-I" ^ Paths.teemInclude] val cflags = condCons (parallel, #pthread Paths.cflags, cflags) val cflags = if debug then #debug Paths.cflags :: cflags else #ndebug Paths.cflags :: cflags val cflags = #base Paths.cflags :: cflags (* generate the loader flags *) val extraLibs = condCons (parallel, #pthread Paths.extraLibs, []) val extraLibs = Paths.teemLinkFlags @ #base Paths.extraLibs :: extraLibs val extraLibs = #cl Paths.extraLibs :: extraLibs val rtLib = TargetUtil.runtimeName { target = TargetUtil.TARGET_CL, parallel = parallel, double = double, debug = debug } val ldOpts = rtLib :: extraLibs in genSrc (basename, prog); RunCC.compile (basename, cflags); RunCC.link (basename, ldOpts) end end (* Program *) (* strands *) structure Strand = struct fun define (Prog{strands, ...}, strandId) = let val name = Atom.toString strandId val strand = Strand{ name = name, tyName = RN.strandTy name, state = ref [], output = ref NONE, code = ref [], init_code = ref (CL.D_Comment(["no init code"])) } in AtomTable.insert strands (strandId, strand); strand end (* return the strand with the given name *) fun lookup (Prog{strands, ...}, strandId) = AtomTable.lookup strands strandId (* register the strand-state initialization code. The variables are the strand * parameters. *) fun init (Strand{name, tyName, code, init_code, ...}, params, init) = let val fName = RN.strandInit name val params = clParam ("",CL.T_Ptr(CL.T_Named tyName), "selfOut") :: List.map (fn (ToCL.V(ty, x)) => CL.PARAM([], ty, x)) params val initFn = CL.D_Func([], CL.voidTy, fName, params, init) in init_code := initFn end (* register a strand method *) fun method (Strand{name, tyName, code,...}, methName, body) = let val fName = name ^ MethodName.toString methName val params = [ globalParam (CL.T_Ptr(CL.T_Named tyName), "selfIn"), globalParam (CL.T_Ptr(CL.T_Named tyName), "selfOut"), globalParam (CL.T_Ptr(CL.T_Named (RN.globalsTy)), RN.globalsVarName), CL.PARAM([],CL.T_Named(RN.imageDataType),RN.globalImageDataName) ] val resTy = (case methName of MethodName.Update => CL.T_Named "StrandStatus_t" | MethodName.Stabilize => CL.voidTy (* end case *)) val methFn = CL.D_Func([], resTy, fName, params, body) in code := methFn :: !code end fun output (Strand{output, ...}, ty, ToCL.V(_, x)) = output := SOME(ty, x) end end structure CLBackEnd = CodeGenFn(CLTarget)
Click to toggle
does not end with </html> tag
does not end with </body> tag
The output has ended thus: , ToCL.V(_, x)) = output := SOME(ty, x) end end structure CLBackEnd = CodeGenFn(CLTarget)