Home My Page Projects Code Snippets Project Openings diderot
Summary Activity Tracker Tasks SCM

SCM Repository

[diderot] Annotation of /branches/pure-cfg/src/lib/cl-target/main.c
ViewVC logotype

Annotation of /branches/pure-cfg/src/lib/cl-target/main.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1519 - (view) (download) (as text)

1 : jhr 1267 /*! \file main.c
2 :     *
3 :     * \author John Reppy
4 :     */
5 :    
6 :     /*
7 :     * COPYRIGHT (c) 2011 The Diderot Project (http://diderot-language.cs.uchicago.edu)
8 :     * All rights reserved.
9 :     */
10 :    
11 : jhr 1275 #include <Diderot/diderot.h>
12 : jhr 1411 #include "clinfo.h"
13 : lamonts 1271 #include <sys/sysctl.h>
14 :     #include <sys/stat.h>
15 : jhr 1267
16 : jhr 1492 // FIXME: the kernels should really be in the lib subdirectory
17 :     // path to kernels
18 :     #define KERNELS_FILE_PATH DIDEROT_INCLUDE_PATH "/cl_kernels/kernels.cl"
19 : lamonts 1488
20 : lamonts 1513
21 : lamonts 1488 typedef struct {
22 : jhr 1492 cl_int blkIdx; //!< the id of this block
23 :     cl_int nActive; //!< number of active (status != DIE or STABLE) strands
24 :     cl_int nDead; //!< number of strands in the DIE state
25 :     cl_int nStabilizing; //!< number of new strands in the STABILIZE state
26 :     cl_int nDying; //!< number of new strands in the DIE state
27 : lamonts 1488 } StrandBlock_t;
28 :    
29 :     typedef struct {
30 : jhr 1492 cl_int numStrands; //!< number of strands
31 :     cl_int sId; //!< the index into the todo list or queue
32 : lamonts 1501 cl_int nextStrand; //!< index of the next strand to retrieve from the pool
33 :     cl_int clearQueueSz; //!< an indicator on whether the queue size should be cleared
34 : jhr 1492 cl_int queueSize; //!< number of blocks on the scheduler's queue
35 :     cl_int todoSize; //!< number of blocks on the scheduler's todo list
36 : jhr 1519 cl_int numAvailable; //!< number of active strands left to process
37 : lamonts 1488 } SchedState_t;
38 :    
39 :    
40 : jhr 1267 // NOTE: we probably should put this in a file that supports runtime printing
41 :     static bool VerboseFlg = false;
42 : jhr 1287 static bool TimingFlg = false;
43 : lamonts 1513 static bool NrrdOutputFlg = false;
44 : lamonts 1470 #define WARP_SIZE 32
45 : lamonts 1459
46 : jhr 1473 typedef struct {
47 :     cl_kernel kern; //!< OpenCL kernel that implements the program
48 :     size_t workGrpSize; //!< size of workgroup for this kernel
49 :     cl_ulong localSzb; //!< size of local memory used by kernel
50 :     } GPUKernel_t;
51 :    
52 : jhr 1267 struct struct_world {
53 : jhr 1519 STRUCT_WORLD_PREFIX
54 :     // FIXME: document these fields!
55 : lamonts 1341 void *inState;
56 :     void *outState;
57 : jhr 1471 cl_device_id device; //!< OpenCL device
58 :     cl_context context; //!< OpenCL execution context
59 :     cl_command_queue cmdQ; //!< OpenCL command queue
60 : lamonts 1500 GPUKernel_t kernel[3]; //!< OpenCL kernel that implements the program
61 : jhr 1267 };
62 : jhr 1471
63 : jhr 1519 // FIXME: need documentation for this stuct!
64 : lamonts 1459 typedef struct {
65 : jhr 1519 cl_mem schedMem;
66 :     cl_mem outMem;
67 :     cl_mem inMem;
68 :     cl_mem blocksMem;
69 :     cl_mem strandBlocksIdxsMem;
70 :     cl_mem queueMem;
71 :     cl_mem todoMem;
72 :     cl_mem statusMem;
73 :     } KernelArgs_t;
74 : jhr 1267
75 : jhr 1291 static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld);
76 : jhr 1473 static void CheckErrorCode (cl_int sts, const char * msg);
77 : jhr 1519 static void SetPhase1Args (cl_kernel kernel, int *argCount, KernelArgs_t *args);
78 :     static void SetPhase2Args (cl_kernel kernel, int *argCount, KernelArgs_t *args, int blk_sz);
79 :     static void SetScheduleKernelArgs (cl_kernel kernel, int *argCount, KernelArgs_t *args);
80 : lamonts 1500
81 : jhr 1473 static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern);
82 :    
83 : jhr 1358 extern void Diderot_LoadGlobals (cl_context cxt, cl_kernel kernel, cl_command_queue cmdQ, int argStart);
84 :    
85 : jhr 1267 int main (int argc, const char **argv)
86 :     {
87 :     // get information about OpenCL support
88 :     CLInfo_t *clInfo = GetCLInfo();
89 :     if (clInfo == 0) {
90 : jhr 1327 fprintf (stderr, "no OpenCL support\n");
91 :     exit (1);
92 : jhr 1267 }
93 :    
94 :     Diderot_Options_t *opts = Diderot_OptNew ();
95 :    
96 :     Diderot_OptAddFlag (opts, "verbose", "enable runtime-system messages", &VerboseFlg);
97 : jhr 1411 Diderot_OptAddFlag (opts, "timing", "enable execution timing", &TimingFlg);
98 : lamonts 1513 Diderot_OptAddFlag (opts, "nrrd", "enable nrrd output", &NrrdOutputFlg);
99 : jhr 1267 Diderot_RegisterGlobalOpts (opts);
100 :     Diderot_OptProcess (opts, argc, argv);
101 :     Diderot_OptFree (opts);
102 :    
103 : jhr 1379 if (VerboseFlg)
104 : jhr 1492 PrintCLInfo (stdout, clInfo);
105 : jhr 1379
106 : jhr 1267 // run the generated global initialization code
107 : jhr 1379 if (VerboseFlg)
108 : jhr 1411 fprintf (stderr, "initializing globals ...\n");
109 :     Diderot_InitGlobals ();
110 : lamonts 1316
111 : jhr 1291 Diderot_World_t *wrld = Diderot_Initially (); // this may not be right for OpenCL
112 : lamonts 1459
113 : jhr 1291 if (! InitCL(clInfo, wrld))
114 :     exit (1);
115 : jhr 1282
116 : lamonts 1488 // Conversion of strands from their host types to their shadow types
117 : jhr 1472 void *shadowInState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands);
118 : lamonts 1488 void *shadowOutState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands);
119 : lamonts 1462 uint8_t *strandPtr = (uint8_t *)wrld->inState;
120 :     uint8_t *strandShadowPtr = (uint8_t *)shadowInState;
121 : lamonts 1488 uint8_t *strandShadowOutPtr = (uint8_t *)shadowInState;
122 : lamonts 1462 size_t shadowSize = Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands;
123 :    
124 : jhr 1469 for (int i = 0; i < wrld->numStrands; i++) {
125 :     Diderot_Strands[0]->strandCopy (strandPtr, strandShadowPtr);
126 : lamonts 1488 Diderot_Strands[0]->strandCopy (strandPtr, strandShadowOutPtr);
127 : jhr 1469 strandPtr += Diderot_Strands[0]->stateSzb;
128 :     strandShadowPtr += Diderot_Strands[0]->shadowStrandSzb;
129 : lamonts 1488 strandShadowOutPtr += Diderot_Strands[0]->shadowStrandSzb;
130 : lamonts 1462 }
131 : jhr 1519 // FIXME: it is confusing to use the inState/outState pointers for two different purposes.
132 : jhr 1471 free (wrld->inState);
133 : jhr 1519 free (wrld->outState);
134 : lamonts 1462 wrld->inState = shadowInState;
135 : lamonts 1488 wrld->outState = shadowOutState;
136 : lamonts 1462
137 : jhr 1469 // hack to make the invariant part of the state the same in both copies
138 : lamonts 1462 memcpy (wrld->outState, wrld->inState, shadowSize);
139 :    
140 : jhr 1327 int argCount = 0;
141 :     cl_int sts = CL_SUCCESS;
142 : lamonts 1316
143 : jhr 1519 KernelArgs_t kernelArgs;
144 : lamonts 1488 SchedState_t scheduler;
145 : lamonts 1346
146 : lamonts 1488 scheduler.numStrands = wrld->numStrands;
147 : lamonts 1500 scheduler.nextStrand = 0;
148 : lamonts 1488 scheduler.todoSize = 0;
149 :     scheduler.sId = 0;
150 : lamonts 1501 scheduler.clearQueueSz= 1;
151 : lamonts 1500 scheduler.numAvailable = wrld->numStrands;
152 : lamonts 1488
153 : lamonts 1465 size_t globalWorkSize[1] = {0};
154 : lamonts 1488 size_t localWorkSize[1] = {0};
155 : jhr 1492 int dIdx = clInfo->mainDeviceIdx;
156 : lamonts 1488 int pIdx = clInfo->mainPlatformIdx;
157 :     globalWorkSize[0] = clInfo->platforms[pIdx].devices[dIdx].numCUs * clInfo->platforms[pIdx].devices[dIdx].cuWidth;
158 :     localWorkSize[0] = clInfo->platforms[pIdx].devices[dIdx].cuWidth;
159 : lamonts 1502 int numberOfBlocks = ceil((double)wrld->numStrands/localWorkSize[0]);
160 : lamonts 1488
161 :     size_t strandBlkMemSize = sizeof(int) * numberOfBlocks * clInfo->platforms[pIdx].devices[dIdx].cuWidth;;
162 :     size_t schedListMemSize = sizeof(int) * numberOfBlocks;
163 : jhr 1492 int *schedulerQueue = (int *)CheckedAlloc(schedListMemSize);
164 :     int *schedulerTodoList = (int *)CheckedAlloc(schedListMemSize);
165 :     int *strandBlocksIdxs = (int *)CheckedAlloc(strandBlkMemSize);
166 : lamonts 1488 scheduler.queueSize = numberOfBlocks;
167 : lamonts 1513
168 : lamonts 1488 size_t strandblkMemSize = sizeof(StrandBlock_t) * numberOfBlocks;
169 :     StrandBlock_t * strandBlks = (StrandBlock_t *)CheckedAlloc(strandblkMemSize);
170 : lamonts 1470
171 : jhr 1492 for (int i = 0; i < numberOfBlocks; i++) {
172 : lamonts 1488 schedulerQueue[i] = i;
173 :     strandBlks[i].blkIdx = i;
174 :     strandBlks[i].nActive = 0;
175 :     strandBlks[i].nDead = 0;
176 :     strandBlks[i].nStabilizing = 0;
177 : jhr 1492 strandBlks[i].nDying = 0;
178 : jhr 1432 }
179 : lamonts 1459
180 : lamonts 1488 kernelArgs.schedMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
181 :     sizeof(SchedState_t), &scheduler, &sts);
182 :     CheckErrorCode (sts, "error creating OpenCL scheduler buffer\n");
183 : lamonts 1459
184 : lamonts 1488 kernelArgs.blocksMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
185 :     strandblkMemSize, strandBlks , &sts);
186 :     CheckErrorCode (sts, "error creating OpenCL strand blocks buffer\n");
187 : lamonts 1465
188 : lamonts 1488 kernelArgs.strandBlocksIdxsMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
189 :     strandBlkMemSize, strandBlocksIdxs , &sts);
190 :     CheckErrorCode (sts, "error creating OpenCL strand blocks indices buffer\n");
191 : lamonts 1425
192 : lamonts 1488 kernelArgs.queueMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
193 :     schedListMemSize, schedulerQueue , &sts);
194 :     CheckErrorCode (sts, "error creating OpenCL scheduler queue buffer\n");
195 :    
196 :     kernelArgs.todoMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
197 :     schedListMemSize, schedulerTodoList , &sts);
198 :     CheckErrorCode (sts, "error creating OpenCL scheduler todo buffer\n");
199 :    
200 :     kernelArgs.inMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
201 :     shadowSize, wrld->inState, &sts);
202 :     CheckErrorCode (sts, "error creating OpenCL strand in-state buffer\n");
203 :    
204 :     kernelArgs.outMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
205 :     shadowSize, wrld->outState, &sts);
206 :     CheckErrorCode (sts, "error creating OpenCL strand out-state buffer\n");
207 :    
208 :     kernelArgs.statusMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
209 :     sizeof(int) * wrld->numStrands, wrld->status, &sts);
210 :     CheckErrorCode (sts, "error creating OpenCL world status buffer\n");
211 :    
212 : lamonts 1513 //Setup the Update Kernel's arguments
213 : lamonts 1488 SetPhase1Args(wrld->kernel[0].kern, &argCount, &kernelArgs);
214 :     Diderot_LoadGlobals(wrld->context, wrld->kernel[0].kern, wrld->cmdQ, argCount);
215 : lamonts 1459
216 : lamonts 1513 //Setup the Compaction Kernel's arguments
217 : lamonts 1488 argCount=0;
218 :     SetPhase2Args(wrld->kernel[1].kern, &argCount, &kernelArgs,clInfo->platforms[pIdx].devices[dIdx].cuWidth);
219 :    
220 : lamonts 1513 //Setup the Scheduler Kernel's arguments
221 : lamonts 1500 argCount = 0;
222 :     SetScheduleKernelArgs(wrld->kernel[2].kern, &argCount, &kernelArgs);
223 :    
224 : lamonts 1459 clFinish(wrld->cmdQ);
225 :    
226 :     double t0 = GetTime();
227 :    
228 : jhr 1469 clFinish(wrld->cmdQ);
229 : lamonts 1461
230 : lamonts 1500 while (scheduler.numAvailable > 0) {
231 : jhr 1519 // Runs the update kernel on all strands
232 : lamonts 1488 sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[0].kern, 1, NULL,
233 : jhr 1519 globalWorkSize, localWorkSize, 0, NULL, NULL);
234 : lamonts 1500 CheckErrorCode (sts, "error in executing update kernel\n");
235 : lamonts 1488
236 : jhr 1519 sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern, 0, NULL, NULL);
237 : lamonts 1500 CheckErrorCode (sts, "error in executing scheduler update kernel before compaction\n");
238 : lamonts 1513
239 : lamonts 1500 // Run the compaction kernel on all strands
240 :     sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[1].kern, 1, NULL,
241 : jhr 1492 globalWorkSize, localWorkSize, 0, NULL, NULL);
242 : lamonts 1513 CheckErrorCode (sts, "error ccccn executing compaction kernel\n");
243 : lamonts 1501
244 : lamonts 1500 sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern,0,NULL,NULL);
245 :     CheckErrorCode (sts, "error in executing scheduler update kernel after compaction\n");
246 : lamonts 1459
247 : lamonts 1513 sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.schedMem, CL_TRUE, 0, sizeof(SchedState_t),
248 : lamonts 1500 &scheduler, 0, NULL, NULL);
249 : lamonts 1513 CheckErrorCode (sts, "error reading back scheduler information\n");
250 : lamonts 1459 }
251 : lamonts 1470
252 : lamonts 1488 sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.outMem, CL_TRUE, 0,shadowSize,
253 : jhr 1469 wrld->outState, 0, NULL, NULL);
254 : jhr 1471 CheckErrorCode (sts, "error in reading back output\n");
255 : lamonts 1459
256 : jhr 1469 sts = clEnqueueReadBuffer(
257 : lamonts 1488 wrld->cmdQ, kernelArgs.statusMem, CL_TRUE, 0, sizeof(int) * wrld->numStrands,
258 : jhr 1469 wrld->status, 0, NULL, NULL);
259 : jhr 1471 CheckErrorCode (sts, "error in reading back status\n");
260 : jhr 1342
261 : lamonts 1513
262 : jhr 1469 clFinish (wrld->cmdQ);
263 : jhr 1473 clReleaseKernel(wrld->kernel[0].kern);
264 :     clReleaseKernel(wrld->kernel[1].kern);
265 : lamonts 1316 clReleaseCommandQueue(wrld->cmdQ);
266 :     clReleaseContext(wrld->context);
267 :    
268 : jhr 1287 double totalTime = GetTime() - t0;
269 :    
270 :     if (VerboseFlg)
271 :     fprintf (stderr, "done in %f seconds\n", totalTime);
272 :     else if (TimingFlg)
273 :     printf ("usr=%f\n", totalTime);
274 :    
275 : nseltzer 1453 // here we have the final state of all of the strands in the "in" buffer
276 : lamonts 1513 // output the final strand states
277 :     if (NrrdOutputFlg)
278 : jhr 1519 Diderot_Output (wrld, Diderot_Strands[0]->shadowStrandSzb);
279 : lamonts 1513 else
280 : jhr 1519 Diderot_Print (wrld);
281 : jhr 1287
282 : lamonts 1488 Diderot_Shutdown (wrld);
283 : jhr 1287
284 :     return 0;
285 :    
286 : lamonts 1271 }
287 : jhr 1469
288 : jhr 1519 static void SetPhase1Args (cl_kernel kernel, int *argCount, KernelArgs_t *args)
289 : lamonts 1459 {
290 : jhr 1492 int count = *argCount;
291 : lamonts 1459 cl_int sts = CL_SUCCESS;
292 : jhr 1282
293 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->inMem);
294 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand in-state argument\n");
295 : lamonts 1488
296 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->outMem);
297 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand out-state argument\n");
298 : lamonts 1459
299 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem);
300 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL world status argument\n");
301 : lamonts 1459
302 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem);
303 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler argument\n");
304 : lamonts 1459
305 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem);
306 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks argument\n");
307 : lamonts 1459
308 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem);
309 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks' indices argument\n");
310 : lamonts 1459
311 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem);
312 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler queue argument\n");
313 : lamonts 1488
314 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem);
315 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler todo argument\n");
316 : lamonts 1488
317 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL);
318 : lamonts 1513 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL local strand blockargument\n");
319 : lamonts 1488
320 : jhr 1492 *argCount = count;
321 :    
322 : jhr 1469 }
323 :    
324 : jhr 1519 static void SetPhase2Args (cl_kernel kernel, int *argCount, KernelArgs_t *args, int blk_sz)
325 : lamonts 1459 {
326 : jhr 1492 int count = *argCount;
327 :     cl_int sts = CL_SUCCESS;
328 : lamonts 1459
329 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem);
330 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL world status argument\n");
331 : lamonts 1459
332 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem);
333 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler argument\n");
334 : lamonts 1488
335 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem);
336 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks argument\n");
337 : lamonts 1488
338 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem);
339 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks' indices argument\n");
340 : lamonts 1488
341 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem);
342 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler queue argument\n");
343 : lamonts 1488
344 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem);
345 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler todo argument\n");
346 : lamonts 1488
347 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL);
348 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local strand blockargument\n");
349 : lamonts 1488
350 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL);
351 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preStable argument\n");
352 : lamonts 1488
353 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL);
354 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preDead argument\n");
355 : lamonts 1488
356 : lamonts 1500 sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz * 2, NULL);
357 :     CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local temporary array for the prefix scan of preStable preDead argument\n");
358 : lamonts 1488
359 : lamonts 1513
360 : jhr 1492 *argCount = count;
361 : jhr 1469 }
362 : jhr 1519 static void SetScheduleKernelArgs(cl_kernel kernel, int *argCount, KernelArgs_t *args)
363 : lamonts 1500 {
364 :     int count = *argCount;
365 :     cl_int sts = CL_SUCCESS;
366 :    
367 :     sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem);
368 :     CheckErrorCode (sts, "Scheduler Kernel: error setting OpenCL scheduler argument\n");
369 : jhr 1469
370 : lamonts 1500 *argCount = count;
371 :     }
372 :    
373 : jhr 1471 static void CheckErrorCode (cl_int sts, const char *msg)
374 : lamonts 1459 {
375 :     if (sts != CL_SUCCESS) {
376 : jhr 1464 fprintf (stderr, "%s", msg);
377 : lamonts 1459 exit(1);
378 :     }
379 : jhr 1469 }
380 :    
381 : jhr 1287 /*! \brief load OpenCL code from a file
382 :     */
383 : jhr 1291 static char *LoadSource (const char *filename)
384 : lamonts 1271 {
385 :     struct stat statbuf;
386 : jhr 1287 if (stat(filename, &statbuf) < 0) {
387 : jhr 1327 fprintf (stderr, "unable to stat OpenCL source file %s\n", filename);
388 :     exit (1);
389 : jhr 1287 }
390 :    
391 : jhr 1472 char *source = (char *)CheckedAlloc(statbuf.st_size + 1);
392 : jhr 1287 if (source == 0) {
393 : jhr 1327 fprintf (stderr, "unable to allocate memory for OpenCL source\n");
394 :     exit (1);
395 : jhr 1287 }
396 :    
397 :     FILE *fh = fopen(filename, "r");
398 :     if ((fh == 0)
399 :     || (fread(source, statbuf.st_size, 1, fh) != 1)) {
400 : jhr 1327 fprintf (stderr, "unable to read OpenCL source from %s\n", filename);
401 :     exit (1);
402 : jhr 1287 }
403 :     source[statbuf.st_size] = '\0';
404 :     fclose (fh);
405 : lamonts 1271
406 :     return source;
407 : jhr 1342 }
408 :    
409 : jhr 1473 //! create a kernel object from a program
410 :     static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern)
411 :     {
412 :     cl_int sts;
413 :    
414 :     kern->kern = clCreateKernel(prog, name, &sts);
415 :     if (sts != CL_SUCCESS) {
416 :     fprintf (stderr, "error getting %s from program\n", name);
417 :     return false;
418 :     }
419 :     sts = clGetKernelWorkGroupInfo (
420 :     kern->kern, dev, CL_KERNEL_WORK_GROUP_SIZE,
421 :     sizeof(size_t), &(kern->workGrpSize), 0);
422 :     if (sts != CL_SUCCESS) {
423 :     fprintf (stderr, "error getting workgroup size for %s\n", name);
424 :     return false;
425 :     }
426 :     sts = clGetKernelWorkGroupInfo (
427 :     kern->kern, dev, CL_KERNEL_LOCAL_MEM_SIZE,
428 :     sizeof(cl_ulong), &(kern->localSzb), 0);
429 :     if (sts != CL_SUCCESS) {
430 :     fprintf (stderr, "error getting local memory size for %s\n", name);
431 :     return false;
432 :     }
433 :    
434 :     if (VerboseFlg) {
435 :     fprintf(stderr, "kernel %s: workgroup size = %d, local memory = %d bytes\n",
436 :     name, (int)kern->workGrpSize, (int)kern->localSzb);
437 :     }
438 :    
439 :     return true;
440 :     }
441 :    
442 : jhr 1352 static void LogMessagesToStderr (const char *errstr, const void *private_info, size_t cb, void *user_data)
443 : lamonts 1328 {
444 : jhr 1480 fprintf(stderr, "***** error log *****\n");
445 : jhr 1342 fprintf(stderr, "%s\n", errstr);
446 : jhr 1480 fprintf(stderr, "***** end error log *****\n");
447 : jhr 1342 }
448 : jhr 1267
449 : jhr 1417 /*! \brief initialize the OpenCL execution context, including loading and compiling the OpenCL
450 :     * program.
451 :     * \param clInfo points to the summary information about the available OpenCL devices.
452 :     * \param wrld the Diderot execution information.
453 :     * \return true if initialization is successful, otherwise false.
454 : jhr 1291 */
455 :     static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld)
456 :     {
457 : jhr 1327 cl_int sts;
458 : jhr 1291
459 :     // find a GPU on platform[0]
460 : jhr 1420 DeviceInfo_t *dev = 0;
461 : lamonts 1488 clInfo->mainPlatformIdx = 0;
462 : jhr 1291 int i;
463 :     for (i = 0; i < clInfo->platforms[0].numDevices; i++) {
464 : jhr 1417 if (isGPUDevice (&(clInfo->platforms[0].devices[i]))
465 : jhr 1472 && clInfo->platforms[0].devices[i].isAvail) {
466 : jhr 1420 dev = &(clInfo->platforms[0].devices[i]);
467 : lamonts 1488 clInfo->mainDeviceIdx = i;
468 : jhr 1291 break;
469 :     }
470 :     }
471 : lamonts 1488
472 : jhr 1420 if (dev == 0) {
473 : jhr 1291 fprintf (stderr, "unable to find GPU device\n");
474 :     return false;
475 :     }
476 :    
477 : jhr 1469 if (VerboseFlg) {
478 :     fprintf (stderr, "using platform 0, device %d: %s\n",
479 :     i, clInfo->platforms[0].devices[i].name);
480 :     }
481 :    
482 : jhr 1291 // create the context
483 : jhr 1420 cl_context cxt = clCreateContext(0, 1, &(dev->id), LogMessagesToStderr, 0, &sts);
484 : jhr 1291 if (sts != CL_SUCCESS) {
485 : jhr 1327 fprintf (stderr, "error creating OpenCL context\n");
486 : jhr 1291 return false;
487 :     }
488 :    
489 :     // create the program from the source
490 :     int fnameLen = strlen(wrld->name) + 4; // name + ".cl\0"
491 : jhr 1472 char *fname = (char *)CheckedAlloc(fnameLen);
492 : jhr 1291 snprintf(fname, fnameLen, "%s.cl", wrld->name);
493 : lamonts 1459 char *updateSource = LoadSource (fname);
494 : jhr 1464 free (fname);
495 : lamonts 1459
496 : jhr 1492 // load scheduler kernels
497 :     char *kernelsSource = LoadSource(KERNELS_FILE_PATH);
498 :    
499 : lamonts 1513 const char *src[2] = {kernelsSource,updateSource};
500 : jhr 1464 cl_program prog = clCreateProgramWithSource(cxt, 2, src, NULL, &sts);
501 : jhr 1492 free (updateSource);
502 :     free (kernelsSource);
503 : jhr 1291 if (sts != CL_SUCCESS) {
504 : jhr 1327 fprintf (stderr, "error creating program\n");
505 : jhr 1291 return false;
506 :     }
507 :    
508 :     // build the program
509 : jhr 1420 char options[1024];
510 :     snprintf (options, sizeof(options),
511 : jhr 1472 "-D DIDEROT_CL_VERSION=%d -D DIDEROT_CU_WIDTH=%d -I %s -w",
512 : jhr 1420 100*dev->majorVersion + dev->minorVersion,
513 : jhr 1472 dev->cuWidth,
514 : jhr 1420 DIDEROT_INCLUDE_PATH);
515 : jhr 1472 if (VerboseFlg) {
516 :     fprintf (stderr, "clBuildProgram options: %s\n", options);
517 :     }
518 : jhr 1420 sts = clBuildProgram (prog, 1, &(dev->id), options, 0, 0);
519 : jhr 1291 if (sts != CL_SUCCESS) {
520 : jhr 1352 size_t logSzb;
521 : jhr 1420 clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, 0, 0, &logSzb);
522 : jhr 1472 char *log = CheckedAlloc(logSzb+1);
523 : jhr 1420 clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, logSzb, log, &logSzb);
524 : jhr 1352 log[logSzb] = '\0';
525 :     fprintf (stderr, "error compiling program:\n%s\n", log);
526 :     free (log);
527 : jhr 1291 return false;
528 :     }
529 :    
530 : jhr 1473 // extract the kernels from the program
531 : lamonts 1488 if ((! CreateKernel (dev->id, prog, "Diderot_UpdateKernel", &(wrld->kernel[0])))
532 : lamonts 1500 || (! CreateKernel (dev->id, prog, "Diderot_CompactionKernel", &(wrld->kernel[1])))
533 :     || (! CreateKernel (dev->id, prog, "Diderot_SchedUpdateKernel", &(wrld->kernel[2]))))
534 : jhr 1291 return false;
535 :    
536 : jhr 1417 // create the command queue
537 : jhr 1420 cl_command_queue q = clCreateCommandQueue(cxt, dev->id, 0, &sts);
538 : jhr 1417 if (sts != CL_SUCCESS) {
539 :     fprintf (stderr, "error creating OpenCL command queue\n");
540 :     return false;
541 :     }
542 :    
543 : jhr 1291 // initialize world info
544 : jhr 1420 wrld->device = dev->id;
545 : jhr 1291 wrld->context = cxt;
546 :     wrld->cmdQ = q;
547 :    
548 :     return true;
549 :     }
550 : jhr 1327
551 : jhr 1267 // this should be the part of the scheduler
552 :     void *Diderot_AllocStrand (Strand_t *strand)
553 :     {
554 : jhr 1472 return CheckedAlloc(strand->stateSzb);
555 : jhr 1267 }
556 :    
557 :     // block allocation of an initial collection of strands
558 : jhr 1519 Diderot_World_t *Diderot_AllocInitially (
559 : jhr 1287 const char *name, // the name of the program
560 : jhr 1267 Strand_t *strand, // the type of strands being allocated
561 :     bool isArray, // is the initialization an array or collection?
562 :     uint32_t nDims, // depth of iteration nesting
563 :     int32_t *base, // nDims array of base indices
564 :     uint32_t *size) // nDims array of iteration sizes
565 :     {
566 : jhr 1472 Diderot_World_t *wrld = NEW(Diderot_World_t);
567 : jhr 1267 if (wrld == 0) {
568 :     fprintf (stderr, "unable to allocate world\n");
569 :     exit (1);
570 :     }
571 :    
572 : jhr 1287 wrld->name = name; /* NOTE: we are assuming that name is statically allocated! */
573 : jhr 1267 wrld->isArray = isArray;
574 :     wrld->nDims = nDims;
575 : jhr 1472 wrld->base = NEWVEC(int32_t, nDims);
576 :     wrld->size = NEWVEC(uint32_t, nDims);
577 : jhr 1267 size_t numStrands = 1;
578 :     for (int i = 0; i < wrld->nDims; i++) {
579 :     numStrands *= size[i];
580 :     wrld->base[i] = base[i];
581 :     wrld->size[i] = size[i];
582 :     }
583 :    
584 : jhr 1378 if (VerboseFlg) {
585 :     printf("AllocInitially: %d", size[0]);
586 :     for (int i = 1; i < nDims; i++) printf(" x %d", size[i]);
587 :     printf("\n");
588 :     }
589 : jhr 1267
590 :     // allocate the strand state pointers
591 :     wrld->numStrands = numStrands;
592 : jhr 1519 wrld->inState = CheckedAlloc (strand->stateSzb * numStrands);
593 : jhr 1472 wrld->outState = CheckedAlloc (strand->shadowStrandSzb * numStrands);
594 : jhr 1519 wrld->status = NEWVEC(StatusInt_t, numStrands);
595 : jhr 1267
596 :     // initialize strand state pointers etc.
597 :     for (size_t i = 0; i < numStrands; i++) {
598 : lamonts 1351 wrld->status[i] = DIDEROT_ACTIVE;
599 : jhr 1267 }
600 :    
601 :     return wrld;
602 :    
603 :     }
604 :    
605 :     // get strand state pointers
606 :     void *Diderot_InState (Diderot_World_t *wrld, uint32_t i)
607 :     {
608 :     assert (i < wrld->numStrands);
609 : jhr 1519 return &(wrld->inState[i]);
610 : jhr 1267 }
611 :    
612 :     void *Diderot_OutState (Diderot_World_t *wrld, uint32_t i)
613 :     {
614 :     assert (i < wrld->numStrands);
615 : jhr 1519 return &(wrld->outState[i]);
616 : jhr 1267 }
617 :    
618 : jhr 1354 /***** Support for shadow image values *****/
619 : jhr 1435
620 : jhr 1354 void ShadowImage1D (cl_context cxt, Shadow_image1D_t *dst, Diderot_image1D_t *src)
621 :     {
622 :     dst->size[0] = src->size[0];
623 :     dst->s = src->s;
624 :     dst->t = dst->t;
625 :     }
626 : jhr 1435
627 : jhr 1354 void ShadowImage2D (cl_context cxt, Shadow_image2D_t *dst, Diderot_image2D_t *src)
628 :     {
629 : lamonts 1461 dst->size[0] = src->size[0];
630 : jhr 1354 dst->size[1] = src->size[1];
631 : jhr 1355 ShadowMat2x2 (dst->w2i, src->w2i);
632 : lamonts 1461 ShadowVec2 (&dst->tVec, src->tVec);
633 : jhr 1355 ShadowMat2x2 (dst->w2iT, src->w2iT);
634 : jhr 1354 }
635 :    
636 :     void ShadowImage3D (cl_context cxt, Shadow_image3D_t *dst, Diderot_image3D_t *src)
637 :     {
638 :     dst->size[0] = src->size[0];
639 :     dst->size[1] = src->size[1];
640 :     dst->size[2] = src->size[2];
641 : jhr 1355 ShadowMat3x3 (dst->w2i, src->w2i);
642 : lamonts 1461 ShadowVec3 (&dst->tVec, src->tVec);
643 : jhr 1435 ShadowMat3x3 (dst->w2iT, src->w2iT);
644 : jhr 1354 }

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0