Home My Page Projects Code Snippets Project Openings diderot
Summary Activity Tracker Tasks SCM

SCM Repository

[diderot] Annotation of /branches/pure-cfg/src/lib/cl-target/main.c
ViewVC logotype

Annotation of /branches/pure-cfg/src/lib/cl-target/main.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1517 - (view) (download) (as text)

1 : jhr 1267 /*! \file main.c
2 :     *
3 :     * \author John Reppy
4 :     */
5 :    
6 :     /*
7 :     * COPYRIGHT (c) 2011 The Diderot Project (http://diderot-language.cs.uchicago.edu)
8 :     * All rights reserved.
9 :     */
10 :    
11 : jhr 1275 #include <Diderot/diderot.h>
12 : jhr 1411 #include "clinfo.h"
13 : jhr 1267 #include <string.h>
14 :     #include <stdio.h>
15 :     #include <assert.h>
16 : lamonts 1271 #include <sys/sysctl.h>
17 :     #include <sys/stat.h>
18 : jhr 1267
19 : jhr 1492 // FIXME: the kernels should really be in the lib subdirectory
20 :     // path to kernels
21 :     #define KERNELS_FILE_PATH DIDEROT_INCLUDE_PATH "/cl_kernels/kernels.cl"
22 : lamonts 1488
23 : lamonts 1513
24 : lamonts 1488 typedef struct {
25 : jhr 1492 cl_int blkIdx; //!< the id of this block
26 :     cl_int nActive; //!< number of active (status != DIE or STABLE) strands
27 :     cl_int nDead; //!< number of strands in the DIE state
28 :     cl_int nStabilizing; //!< number of new strands in the STABILIZE state
29 :     cl_int nDying; //!< number of new strands in the DIE state
30 : lamonts 1488 } StrandBlock_t;
31 :    
32 :     typedef struct {
33 : jhr 1492 cl_int numStrands; //!< number of strands
34 :     cl_int sId; //!< the index into the todo list or queue
35 : lamonts 1501 cl_int nextStrand; //!< index of the next strand to retrieve from the pool
36 :     cl_int clearQueueSz; //!< an indicator on whether the queue size should be cleared
37 : jhr 1492 cl_int queueSize; //!< number of blocks on the scheduler's queue
38 :     cl_int todoSize; //!< number of blocks on the scheduler's todo list
39 : lamonts 1500 cl_int numAvailable; //!< number of active strands left to process
40 : lamonts 1488 } SchedState_t;
41 :    
42 :    
43 : jhr 1267 // NOTE: we probably should put this in a file that supports runtime printing
44 :     static bool VerboseFlg = false;
45 : jhr 1287 static bool TimingFlg = false;
46 : lamonts 1513 static bool NrrdOutputFlg = false;
47 : lamonts 1470 #define WARP_SIZE 32
48 : lamonts 1459
49 : jhr 1473 typedef struct {
50 :     cl_kernel kern; //!< OpenCL kernel that implements the program
51 :     size_t workGrpSize; //!< size of workgroup for this kernel
52 :     cl_ulong localSzb; //!< size of local memory used by kernel
53 :     } GPUKernel_t;
54 :    
55 : jhr 1267 struct struct_world {
56 : jhr 1471 const char *name; //!< the program name
57 :     bool isArray; //!< is the initialization an array or collection?
58 :     uint32_t nDims; //!< depth of iteration nesting
59 :     int32_t *base; //!< nDims array of base indices
60 :     uint32_t *size; //!< nDims array of iteration sizes
61 :     uint32_t numStrands; //!< number of strands in the world
62 :     uint32_t strandSize; //!< the sizeof of the strand buffers
63 : lamonts 1341 void *inState;
64 :     void *outState;
65 : jhr 1471 int *status; //!< array of strand status flags
66 :     cl_device_id device; //!< OpenCL device
67 :     cl_context context; //!< OpenCL execution context
68 :     cl_command_queue cmdQ; //!< OpenCL command queue
69 : lamonts 1500 GPUKernel_t kernel[3]; //!< OpenCL kernel that implements the program
70 : jhr 1267 };
71 : jhr 1471
72 : lamonts 1459 typedef struct {
73 : lamonts 1488 cl_mem schedMem;
74 :     cl_mem outMem;
75 :     cl_mem inMem;
76 :     cl_mem blocksMem;
77 :     cl_mem strandBlocksIdxsMem;
78 :     cl_mem queueMem;
79 :     cl_mem todoMem;
80 :     cl_mem statusMem;
81 :     } Kernel_Args_t;
82 : jhr 1267
83 : jhr 1291 static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld);
84 : jhr 1473 static void CheckErrorCode (cl_int sts, const char * msg);
85 : lamonts 1488 static void SetPhase1Args (cl_kernel kernel, int *argCount, Kernel_Args_t *args);
86 :     static void SetPhase2Args (cl_kernel kernel, int *argCount, Kernel_Args_t *args, int blk_sz);
87 : lamonts 1500 static void SetScheduleKernelArgs(cl_kernel kernel, int *argCount, Kernel_Args_t *args);
88 :    
89 : jhr 1473 static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern);
90 :    
91 : jhr 1358 extern void Diderot_LoadGlobals (cl_context cxt, cl_kernel kernel, cl_command_queue cmdQ, int argStart);
92 :    
93 : jhr 1267 int main (int argc, const char **argv)
94 :     {
95 :     // get information about OpenCL support
96 :     CLInfo_t *clInfo = GetCLInfo();
97 :     if (clInfo == 0) {
98 : jhr 1327 fprintf (stderr, "no OpenCL support\n");
99 :     exit (1);
100 : jhr 1267 }
101 :    
102 :     Diderot_Options_t *opts = Diderot_OptNew ();
103 :    
104 :     Diderot_OptAddFlag (opts, "verbose", "enable runtime-system messages", &VerboseFlg);
105 : jhr 1411 Diderot_OptAddFlag (opts, "timing", "enable execution timing", &TimingFlg);
106 : lamonts 1513 Diderot_OptAddFlag (opts, "nrrd", "enable nrrd output", &NrrdOutputFlg);
107 : jhr 1267 Diderot_RegisterGlobalOpts (opts);
108 :     Diderot_OptProcess (opts, argc, argv);
109 :     Diderot_OptFree (opts);
110 :    
111 : jhr 1379 if (VerboseFlg)
112 : jhr 1492 PrintCLInfo (stdout, clInfo);
113 : jhr 1379
114 : jhr 1267 // run the generated global initialization code
115 : jhr 1379 if (VerboseFlg)
116 : jhr 1411 fprintf (stderr, "initializing globals ...\n");
117 :     Diderot_InitGlobals ();
118 : lamonts 1316
119 : jhr 1291 Diderot_World_t *wrld = Diderot_Initially (); // this may not be right for OpenCL
120 : lamonts 1459
121 : jhr 1291 if (! InitCL(clInfo, wrld))
122 :     exit (1);
123 : jhr 1282
124 : lamonts 1488 // Conversion of strands from their host types to their shadow types
125 : jhr 1472 void *shadowInState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands);
126 : lamonts 1488 void *shadowOutState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands);
127 : lamonts 1462 uint8_t *strandPtr = (uint8_t *)wrld->inState;
128 :     uint8_t *strandShadowPtr = (uint8_t *)shadowInState;
129 : lamonts 1488 uint8_t *strandShadowOutPtr = (uint8_t *)shadowInState;
130 : lamonts 1462 size_t shadowSize = Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands;
131 :    
132 : jhr 1469 for (int i = 0; i < wrld->numStrands; i++) {
133 :     Diderot_Strands[0]->strandCopy (strandPtr, strandShadowPtr);
134 : lamonts 1488 Diderot_Strands[0]->strandCopy (strandPtr, strandShadowOutPtr);
135 : jhr 1469 strandPtr += Diderot_Strands[0]->stateSzb;
136 :     strandShadowPtr += Diderot_Strands[0]->shadowStrandSzb;
137 : lamonts 1488 strandShadowOutPtr += Diderot_Strands[0]->shadowStrandSzb;
138 : lamonts 1462 }
139 : jhr 1471 free (wrld->inState);
140 : lamonts 1488 free(wrld->outState);
141 : lamonts 1462 wrld->inState = shadowInState;
142 : lamonts 1488 wrld->outState = shadowOutState;
143 : lamonts 1462
144 : jhr 1469 // hack to make the invariant part of the state the same in both copies
145 : lamonts 1462 memcpy (wrld->outState, wrld->inState, shadowSize);
146 :    
147 : jhr 1327 int argCount = 0;
148 :     cl_int sts = CL_SUCCESS;
149 : lamonts 1316
150 : lamonts 1488 Kernel_Args_t kernelArgs;
151 :     SchedState_t scheduler;
152 : lamonts 1346
153 : lamonts 1488 scheduler.numStrands = wrld->numStrands;
154 : lamonts 1500 scheduler.nextStrand = 0;
155 : lamonts 1488 scheduler.todoSize = 0;
156 :     scheduler.sId = 0;
157 : lamonts 1501 scheduler.clearQueueSz= 1;
158 : lamonts 1500 scheduler.numAvailable = wrld->numStrands;
159 : lamonts 1488
160 : lamonts 1465 size_t globalWorkSize[1] = {0};
161 : lamonts 1488 size_t localWorkSize[1] = {0};
162 : jhr 1492 int dIdx = clInfo->mainDeviceIdx;
163 : lamonts 1488 int pIdx = clInfo->mainPlatformIdx;
164 :     globalWorkSize[0] = clInfo->platforms[pIdx].devices[dIdx].numCUs * clInfo->platforms[pIdx].devices[dIdx].cuWidth;
165 :     localWorkSize[0] = clInfo->platforms[pIdx].devices[dIdx].cuWidth;
166 : lamonts 1502 int numberOfBlocks = ceil((double)wrld->numStrands/localWorkSize[0]);
167 : lamonts 1488
168 :     size_t strandBlkMemSize = sizeof(int) * numberOfBlocks * clInfo->platforms[pIdx].devices[dIdx].cuWidth;;
169 :     size_t schedListMemSize = sizeof(int) * numberOfBlocks;
170 : jhr 1492 int *schedulerQueue = (int *)CheckedAlloc(schedListMemSize);
171 :     int *schedulerTodoList = (int *)CheckedAlloc(schedListMemSize);
172 :     int *strandBlocksIdxs = (int *)CheckedAlloc(strandBlkMemSize);
173 : lamonts 1488 scheduler.queueSize = numberOfBlocks;
174 : lamonts 1513
175 : lamonts 1488 size_t strandblkMemSize = sizeof(StrandBlock_t) * numberOfBlocks;
176 :     StrandBlock_t * strandBlks = (StrandBlock_t *)CheckedAlloc(strandblkMemSize);
177 : lamonts 1470
178 : jhr 1492 for (int i = 0; i < numberOfBlocks; i++) {
179 : lamonts 1488 schedulerQueue[i] = i;
180 :     strandBlks[i].blkIdx = i;
181 :     strandBlks[i].nActive = 0;
182 :     strandBlks[i].nDead = 0;
183 :     strandBlks[i].nStabilizing = 0;
184 : jhr 1492 strandBlks[i].nDying = 0;
185 : jhr 1432 }
186 : lamonts 1459
187 : lamonts 1488 kernelArgs.schedMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
188 :     sizeof(SchedState_t), &scheduler, &sts);
189 :     CheckErrorCode (sts, "error creating OpenCL scheduler buffer\n");
190 : lamonts 1459
191 : lamonts 1488 kernelArgs.blocksMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
192 :     strandblkMemSize, strandBlks , &sts);
193 :     CheckErrorCode (sts, "error creating OpenCL strand blocks buffer\n");
194 : lamonts 1465
195 : lamonts 1488 kernelArgs.strandBlocksIdxsMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
196 :     strandBlkMemSize, strandBlocksIdxs , &sts);
197 :     CheckErrorCode (sts, "error creating OpenCL strand blocks indices buffer\n");
198 : lamonts 1425
199 : lamonts 1488 kernelArgs.queueMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
200 :     schedListMemSize, schedulerQueue , &sts);
201 :     CheckErrorCode (sts, "error creating OpenCL scheduler queue buffer\n");
202 :    
203 :     kernelArgs.todoMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
204 :     schedListMemSize, schedulerTodoList , &sts);
205 :     CheckErrorCode (sts, "error creating OpenCL scheduler todo buffer\n");
206 :    
207 :     kernelArgs.inMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
208 :     shadowSize, wrld->inState, &sts);
209 :     CheckErrorCode (sts, "error creating OpenCL strand in-state buffer\n");
210 :    
211 :     kernelArgs.outMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
212 :     shadowSize, wrld->outState, &sts);
213 :     CheckErrorCode (sts, "error creating OpenCL strand out-state buffer\n");
214 :    
215 :     kernelArgs.statusMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
216 :     sizeof(int) * wrld->numStrands, wrld->status, &sts);
217 :     CheckErrorCode (sts, "error creating OpenCL world status buffer\n");
218 :    
219 : lamonts 1513 //Setup the Update Kernel's arguments
220 : lamonts 1488 SetPhase1Args(wrld->kernel[0].kern, &argCount, &kernelArgs);
221 :     Diderot_LoadGlobals(wrld->context, wrld->kernel[0].kern, wrld->cmdQ, argCount);
222 : lamonts 1459
223 : lamonts 1513 //Setup the Compaction Kernel's arguments
224 : lamonts 1488 argCount=0;
225 :     SetPhase2Args(wrld->kernel[1].kern, &argCount, &kernelArgs,clInfo->platforms[pIdx].devices[dIdx].cuWidth);
226 :    
227 : lamonts 1513 //Setup the Scheduler Kernel's arguments
228 : lamonts 1500 argCount = 0;
229 :     SetScheduleKernelArgs(wrld->kernel[2].kern, &argCount, &kernelArgs);
230 :    
231 : lamonts 1459 clFinish(wrld->cmdQ);
232 :    
233 :     double t0 = GetTime();
234 :    
235 : jhr 1469 clFinish(wrld->cmdQ);
236 : lamonts 1461
237 : lamonts 1500 while (scheduler.numAvailable > 0) {
238 :    
239 : jhr 1492 //Runs the update kernel on all strands
240 : lamonts 1488 sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[0].kern, 1, NULL,
241 : lamonts 1513 globalWorkSize, localWorkSize, 0, NULL, NULL);
242 : lamonts 1500 CheckErrorCode (sts, "error in executing update kernel\n");
243 : lamonts 1488
244 : lamonts 1500 sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern,0,NULL,NULL);
245 :     CheckErrorCode (sts, "error in executing scheduler update kernel before compaction\n");
246 : lamonts 1513
247 : lamonts 1488
248 : lamonts 1500 // Run the compaction kernel on all strands
249 :     sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[1].kern, 1, NULL,
250 : jhr 1492 globalWorkSize, localWorkSize, 0, NULL, NULL);
251 : lamonts 1513 CheckErrorCode (sts, "error ccccn executing compaction kernel\n");
252 : lamonts 1501
253 : lamonts 1513
254 : lamonts 1500 sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern,0,NULL,NULL);
255 :     CheckErrorCode (sts, "error in executing scheduler update kernel after compaction\n");
256 : lamonts 1459
257 : lamonts 1513
258 :     sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.schedMem, CL_TRUE, 0, sizeof(SchedState_t),
259 : lamonts 1500 &scheduler, 0, NULL, NULL);
260 : lamonts 1462
261 : lamonts 1513 CheckErrorCode (sts, "error reading back scheduler information\n");
262 :    
263 :    
264 : lamonts 1459 }
265 : lamonts 1470
266 : lamonts 1488 sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.outMem, CL_TRUE, 0,shadowSize,
267 : jhr 1469 wrld->outState, 0, NULL, NULL);
268 : jhr 1471 CheckErrorCode (sts, "error in reading back output\n");
269 : lamonts 1459
270 : jhr 1469 sts = clEnqueueReadBuffer(
271 : lamonts 1488 wrld->cmdQ, kernelArgs.statusMem, CL_TRUE, 0, sizeof(int) * wrld->numStrands,
272 : jhr 1469 wrld->status, 0, NULL, NULL);
273 : jhr 1471 CheckErrorCode (sts, "error in reading back status\n");
274 : jhr 1342
275 : lamonts 1513
276 : jhr 1469 clFinish (wrld->cmdQ);
277 : jhr 1473 clReleaseKernel(wrld->kernel[0].kern);
278 :     clReleaseKernel(wrld->kernel[1].kern);
279 : lamonts 1316 clReleaseCommandQueue(wrld->cmdQ);
280 :     clReleaseContext(wrld->context);
281 :    
282 : jhr 1287 double totalTime = GetTime() - t0;
283 :    
284 :     if (VerboseFlg)
285 :     fprintf (stderr, "done in %f seconds\n", totalTime);
286 :     else if (TimingFlg)
287 :     printf ("usr=%f\n", totalTime);
288 :    
289 : nseltzer 1453 // here we have the final state of all of the strands in the "in" buffer
290 : lamonts 1513 // output the final strand states
291 :     Output_Args_t outArgs;
292 :     outArgs.name = wrld->name;
293 :     outArgs.isArray = wrld->isArray;
294 :     outArgs.numStrands = wrld->numStrands;
295 : jhr 1517 outArgs.status = (uint8_t *)(wrld->status);
296 : lamonts 1513 outArgs.inState = wrld->outState;
297 :     outArgs.nDims = wrld->nDims;
298 :     outArgs.size = wrld->size;
299 :     outArgs.isStatusInt = true;
300 :     outArgs.outputSzb = Diderot_Strands[0]->shadowStrandSzb;
301 :     if (NrrdOutputFlg)
302 :     Diderot_Output (&outArgs);
303 :     else
304 :     Diderot_Print (&outArgs);
305 : jhr 1287
306 : lamonts 1488 Diderot_Shutdown (wrld);
307 : jhr 1287
308 :     return 0;
309 :    
310 : lamonts 1271 }
311 : jhr 1469
312 : lamonts 1488 static void SetPhase1Args (cl_kernel kernel, int *argCount, Kernel_Args_t *args)
313 : lamonts 1459 {
314 : jhr 1492 int count = *argCount;
315 : lamonts 1459 cl_int sts = CL_SUCCESS;
316 : jhr 1282
317 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->inMem);
318 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand in-state argument\n");
319 : lamonts 1488
320 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->outMem);
321 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand out-state argument\n");
322 : lamonts 1459
323 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem);
324 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL world status argument\n");
325 : lamonts 1459
326 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem);
327 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler argument\n");
328 : lamonts 1459
329 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem);
330 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks argument\n");
331 : lamonts 1459
332 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem);
333 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks' indices argument\n");
334 : lamonts 1459
335 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem);
336 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler queue argument\n");
337 : lamonts 1488
338 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem);
339 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler todo argument\n");
340 : lamonts 1488
341 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL);
342 : lamonts 1513 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL local strand blockargument\n");
343 : lamonts 1488
344 : jhr 1492 *argCount = count;
345 :    
346 : jhr 1469 }
347 :    
348 : lamonts 1488 static void SetPhase2Args (cl_kernel kernel, int *argCount, Kernel_Args_t *args, int blk_sz)
349 : lamonts 1459 {
350 : jhr 1492 int count = *argCount;
351 :     cl_int sts = CL_SUCCESS;
352 : lamonts 1459
353 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem);
354 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL world status argument\n");
355 : lamonts 1459
356 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem);
357 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler argument\n");
358 : lamonts 1488
359 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem);
360 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks argument\n");
361 : lamonts 1488
362 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem);
363 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks' indices argument\n");
364 : lamonts 1488
365 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem);
366 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler queue argument\n");
367 : lamonts 1488
368 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem);
369 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler todo argument\n");
370 : lamonts 1488
371 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL);
372 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local strand blockargument\n");
373 : lamonts 1488
374 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL);
375 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preStable argument\n");
376 : lamonts 1488
377 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL);
378 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preDead argument\n");
379 : lamonts 1488
380 : lamonts 1500 sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz * 2, NULL);
381 :     CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local temporary array for the prefix scan of preStable preDead argument\n");
382 : lamonts 1488
383 : lamonts 1513
384 : jhr 1492 *argCount = count;
385 : jhr 1469 }
386 : lamonts 1500 static void SetScheduleKernelArgs(cl_kernel kernel, int *argCount, Kernel_Args_t *args)
387 :     {
388 :     int count = *argCount;
389 :     cl_int sts = CL_SUCCESS;
390 :    
391 :     sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem);
392 :     CheckErrorCode (sts, "Scheduler Kernel: error setting OpenCL scheduler argument\n");
393 : jhr 1469
394 : lamonts 1500 *argCount = count;
395 :     }
396 :    
397 : jhr 1471 static void CheckErrorCode (cl_int sts, const char *msg)
398 : lamonts 1459 {
399 :     if (sts != CL_SUCCESS) {
400 : jhr 1464 fprintf (stderr, "%s", msg);
401 : lamonts 1459 exit(1);
402 :     }
403 : jhr 1469 }
404 :    
405 : jhr 1287 /*! \brief load OpenCL code from a file
406 :     */
407 : jhr 1291 static char *LoadSource (const char *filename)
408 : lamonts 1271 {
409 :     struct stat statbuf;
410 : jhr 1287 if (stat(filename, &statbuf) < 0) {
411 : jhr 1327 fprintf (stderr, "unable to stat OpenCL source file %s\n", filename);
412 :     exit (1);
413 : jhr 1287 }
414 :    
415 : jhr 1472 char *source = (char *)CheckedAlloc(statbuf.st_size + 1);
416 : jhr 1287 if (source == 0) {
417 : jhr 1327 fprintf (stderr, "unable to allocate memory for OpenCL source\n");
418 :     exit (1);
419 : jhr 1287 }
420 :    
421 :     FILE *fh = fopen(filename, "r");
422 :     if ((fh == 0)
423 :     || (fread(source, statbuf.st_size, 1, fh) != 1)) {
424 : jhr 1327 fprintf (stderr, "unable to read OpenCL source from %s\n", filename);
425 :     exit (1);
426 : jhr 1287 }
427 :     source[statbuf.st_size] = '\0';
428 :     fclose (fh);
429 : lamonts 1271
430 :     return source;
431 : jhr 1342 }
432 :    
433 : jhr 1473 //! create a kernel object from a program
434 :     static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern)
435 :     {
436 :     cl_int sts;
437 :    
438 :     kern->kern = clCreateKernel(prog, name, &sts);
439 :     if (sts != CL_SUCCESS) {
440 :     fprintf (stderr, "error getting %s from program\n", name);
441 :     return false;
442 :     }
443 :     sts = clGetKernelWorkGroupInfo (
444 :     kern->kern, dev, CL_KERNEL_WORK_GROUP_SIZE,
445 :     sizeof(size_t), &(kern->workGrpSize), 0);
446 :     if (sts != CL_SUCCESS) {
447 :     fprintf (stderr, "error getting workgroup size for %s\n", name);
448 :     return false;
449 :     }
450 :     sts = clGetKernelWorkGroupInfo (
451 :     kern->kern, dev, CL_KERNEL_LOCAL_MEM_SIZE,
452 :     sizeof(cl_ulong), &(kern->localSzb), 0);
453 :     if (sts != CL_SUCCESS) {
454 :     fprintf (stderr, "error getting local memory size for %s\n", name);
455 :     return false;
456 :     }
457 :    
458 :     if (VerboseFlg) {
459 :     fprintf(stderr, "kernel %s: workgroup size = %d, local memory = %d bytes\n",
460 :     name, (int)kern->workGrpSize, (int)kern->localSzb);
461 :     }
462 :    
463 :     return true;
464 :     }
465 :    
466 : jhr 1352 static void LogMessagesToStderr (const char *errstr, const void *private_info, size_t cb, void *user_data)
467 : lamonts 1328 {
468 : jhr 1480 fprintf(stderr, "***** error log *****\n");
469 : jhr 1342 fprintf(stderr, "%s\n", errstr);
470 : jhr 1480 fprintf(stderr, "***** end error log *****\n");
471 : jhr 1342 }
472 : jhr 1267
473 : jhr 1417 /*! \brief initialize the OpenCL execution context, including loading and compiling the OpenCL
474 :     * program.
475 :     * \param clInfo points to the summary information about the available OpenCL devices.
476 :     * \param wrld the Diderot execution information.
477 :     * \return true if initialization is successful, otherwise false.
478 : jhr 1291 */
479 :     static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld)
480 :     {
481 : jhr 1327 cl_int sts;
482 : jhr 1291
483 :     // find a GPU on platform[0]
484 : jhr 1420 DeviceInfo_t *dev = 0;
485 : lamonts 1488 clInfo->mainPlatformIdx = 0;
486 : jhr 1291 int i;
487 :     for (i = 0; i < clInfo->platforms[0].numDevices; i++) {
488 : jhr 1417 if (isGPUDevice (&(clInfo->platforms[0].devices[i]))
489 : jhr 1472 && clInfo->platforms[0].devices[i].isAvail) {
490 : jhr 1420 dev = &(clInfo->platforms[0].devices[i]);
491 : lamonts 1488 clInfo->mainDeviceIdx = i;
492 : jhr 1291 break;
493 :     }
494 :     }
495 : lamonts 1488
496 : jhr 1420 if (dev == 0) {
497 : jhr 1291 fprintf (stderr, "unable to find GPU device\n");
498 :     return false;
499 :     }
500 :    
501 : jhr 1469 if (VerboseFlg) {
502 :     fprintf (stderr, "using platform 0, device %d: %s\n",
503 :     i, clInfo->platforms[0].devices[i].name);
504 :     }
505 :    
506 : jhr 1291 // create the context
507 : jhr 1420 cl_context cxt = clCreateContext(0, 1, &(dev->id), LogMessagesToStderr, 0, &sts);
508 : jhr 1291 if (sts != CL_SUCCESS) {
509 : jhr 1327 fprintf (stderr, "error creating OpenCL context\n");
510 : jhr 1291 return false;
511 :     }
512 :    
513 :     // create the program from the source
514 :     int fnameLen = strlen(wrld->name) + 4; // name + ".cl\0"
515 : jhr 1472 char *fname = (char *)CheckedAlloc(fnameLen);
516 : jhr 1291 snprintf(fname, fnameLen, "%s.cl", wrld->name);
517 : lamonts 1459 char *updateSource = LoadSource (fname);
518 : jhr 1464 free (fname);
519 : lamonts 1459
520 : jhr 1492 // load scheduler kernels
521 :     char *kernelsSource = LoadSource(KERNELS_FILE_PATH);
522 :    
523 : lamonts 1513 const char *src[2] = {kernelsSource,updateSource};
524 : jhr 1464 cl_program prog = clCreateProgramWithSource(cxt, 2, src, NULL, &sts);
525 : jhr 1492 free (updateSource);
526 :     free (kernelsSource);
527 : jhr 1291 if (sts != CL_SUCCESS) {
528 : jhr 1327 fprintf (stderr, "error creating program\n");
529 : jhr 1291 return false;
530 :     }
531 :    
532 :     // build the program
533 : jhr 1420 char options[1024];
534 :     snprintf (options, sizeof(options),
535 : jhr 1472 "-D DIDEROT_CL_VERSION=%d -D DIDEROT_CU_WIDTH=%d -I %s -w",
536 : jhr 1420 100*dev->majorVersion + dev->minorVersion,
537 : jhr 1472 dev->cuWidth,
538 : jhr 1420 DIDEROT_INCLUDE_PATH);
539 : jhr 1472 if (VerboseFlg) {
540 :     fprintf (stderr, "clBuildProgram options: %s\n", options);
541 :     }
542 : jhr 1420 sts = clBuildProgram (prog, 1, &(dev->id), options, 0, 0);
543 : jhr 1291 if (sts != CL_SUCCESS) {
544 : jhr 1352 size_t logSzb;
545 : jhr 1420 clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, 0, 0, &logSzb);
546 : jhr 1472 char *log = CheckedAlloc(logSzb+1);
547 : jhr 1420 clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, logSzb, log, &logSzb);
548 : jhr 1352 log[logSzb] = '\0';
549 :     fprintf (stderr, "error compiling program:\n%s\n", log);
550 :     free (log);
551 : jhr 1291 return false;
552 :     }
553 :    
554 : jhr 1473 // extract the kernels from the program
555 : lamonts 1488 if ((! CreateKernel (dev->id, prog, "Diderot_UpdateKernel", &(wrld->kernel[0])))
556 : lamonts 1500 || (! CreateKernel (dev->id, prog, "Diderot_CompactionKernel", &(wrld->kernel[1])))
557 :     || (! CreateKernel (dev->id, prog, "Diderot_SchedUpdateKernel", &(wrld->kernel[2]))))
558 : jhr 1291 return false;
559 :    
560 : jhr 1417 // create the command queue
561 : jhr 1420 cl_command_queue q = clCreateCommandQueue(cxt, dev->id, 0, &sts);
562 : jhr 1417 if (sts != CL_SUCCESS) {
563 :     fprintf (stderr, "error creating OpenCL command queue\n");
564 :     return false;
565 :     }
566 :    
567 : jhr 1291 // initialize world info
568 : jhr 1420 wrld->device = dev->id;
569 : jhr 1291 wrld->context = cxt;
570 :     wrld->cmdQ = q;
571 :    
572 :     return true;
573 :     }
574 : jhr 1327
575 : jhr 1267 // this should be the part of the scheduler
576 :     void *Diderot_AllocStrand (Strand_t *strand)
577 :     {
578 : jhr 1472 return CheckedAlloc(strand->stateSzb);
579 : jhr 1267 }
580 :    
581 :     // block allocation of an initial collection of strands
582 : lamonts 1346 Diderot_World_t *Diderot_AllocInitially(
583 : jhr 1287 const char *name, // the name of the program
584 : jhr 1267 Strand_t *strand, // the type of strands being allocated
585 :     bool isArray, // is the initialization an array or collection?
586 :     uint32_t nDims, // depth of iteration nesting
587 :     int32_t *base, // nDims array of base indices
588 :     uint32_t *size) // nDims array of iteration sizes
589 :     {
590 : jhr 1472 Diderot_World_t *wrld = NEW(Diderot_World_t);
591 : jhr 1267 if (wrld == 0) {
592 :     fprintf (stderr, "unable to allocate world\n");
593 :     exit (1);
594 :     }
595 :    
596 : jhr 1287 wrld->name = name; /* NOTE: we are assuming that name is statically allocated! */
597 : jhr 1267 wrld->isArray = isArray;
598 :     wrld->nDims = nDims;
599 : jhr 1472 wrld->base = NEWVEC(int32_t, nDims);
600 :     wrld->size = NEWVEC(uint32_t, nDims);
601 : jhr 1267 size_t numStrands = 1;
602 :     for (int i = 0; i < wrld->nDims; i++) {
603 :     numStrands *= size[i];
604 :     wrld->base[i] = base[i];
605 :     wrld->size[i] = size[i];
606 :     }
607 :    
608 : jhr 1378 if (VerboseFlg) {
609 :     printf("AllocInitially: %d", size[0]);
610 :     for (int i = 1; i < nDims; i++) printf(" x %d", size[i]);
611 :     printf("\n");
612 :     }
613 : jhr 1267
614 :     // allocate the strand state pointers
615 :     wrld->numStrands = numStrands;
616 : lamonts 1346 wrld->strandSize = strand->stateSzb * numStrands;
617 : jhr 1472 /*
618 :     wrld->inState = NEWVEC(void *, numStrands);
619 :     wrld->outState = NEWVEC(void *, numStrands);
620 :     */
621 :     wrld->inState = CheckedAlloc (wrld->strandSize);
622 :     wrld->outState = CheckedAlloc (strand->shadowStrandSzb * numStrands);
623 :     wrld->status = NEWVEC(int, numStrands);
624 : jhr 1267 if ((wrld->inState == 0) || (wrld->outState == 0) || (wrld->status == 0)) {
625 :     fprintf (stderr, "unable to allocate strand states\n");
626 :     exit (1);
627 :     }
628 :    
629 :     // initialize strand state pointers etc.
630 :     for (size_t i = 0; i < numStrands; i++) {
631 : lamonts 1341 // wrld->inState[i] = Diderot_AllocStrand (strand);
632 :     // wrld->outState[i] = Diderot_AllocStrand (strand);
633 : lamonts 1351 wrld->status[i] = DIDEROT_ACTIVE;
634 : jhr 1267 }
635 :    
636 :     return wrld;
637 :    
638 :     }
639 :    
640 :     // get strand state pointers
641 :     void *Diderot_InState (Diderot_World_t *wrld, uint32_t i)
642 :     {
643 :     assert (i < wrld->numStrands);
644 : lamonts 1461 return wrld->inState + i;
645 : jhr 1267 }
646 :    
647 :     void *Diderot_OutState (Diderot_World_t *wrld, uint32_t i)
648 :     {
649 :     assert (i < wrld->numStrands);
650 : lamonts 1341 return &wrld->outState[i];
651 : jhr 1267 }
652 :    
653 :     bool Diderot_IsActive (Diderot_World_t *wrld, uint32_t i)
654 :     {
655 :     assert (i < wrld->numStrands);
656 :     return !wrld->status[i];
657 : jhr 1287 }
658 : jhr 1435
659 : jhr 1354 /***** Support for shadow image values *****/
660 : jhr 1435
661 : jhr 1354 void ShadowImage1D (cl_context cxt, Shadow_image1D_t *dst, Diderot_image1D_t *src)
662 :     {
663 :     dst->size[0] = src->size[0];
664 :     dst->s = src->s;
665 :     dst->t = dst->t;
666 :     }
667 : jhr 1435
668 : jhr 1354 void ShadowImage2D (cl_context cxt, Shadow_image2D_t *dst, Diderot_image2D_t *src)
669 :     {
670 : lamonts 1461 dst->size[0] = src->size[0];
671 : jhr 1354 dst->size[1] = src->size[1];
672 : jhr 1355 ShadowMat2x2 (dst->w2i, src->w2i);
673 : lamonts 1461 ShadowVec2 (&dst->tVec, src->tVec);
674 : jhr 1355 ShadowMat2x2 (dst->w2iT, src->w2iT);
675 : jhr 1354 }
676 :    
677 :     void ShadowImage3D (cl_context cxt, Shadow_image3D_t *dst, Diderot_image3D_t *src)
678 :     {
679 :     dst->size[0] = src->size[0];
680 :     dst->size[1] = src->size[1];
681 :     dst->size[2] = src->size[2];
682 : jhr 1355 ShadowMat3x3 (dst->w2i, src->w2i);
683 : lamonts 1461 ShadowVec3 (&dst->tVec, src->tVec);
684 : jhr 1435 ShadowMat3x3 (dst->w2iT, src->w2iT);
685 : jhr 1354 }

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0