Home My Page Projects Code Snippets Project Openings diderot
Summary Activity Tracker Tasks SCM

SCM Repository

[diderot] Annotation of /branches/pure-cfg/src/lib/cl-target/main.c
ViewVC logotype

Annotation of /branches/pure-cfg/src/lib/cl-target/main.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1526 - (view) (download) (as text)

1 : jhr 1267 /*! \file main.c
2 :     *
3 :     * \author John Reppy
4 :     */
5 :    
6 :     /*
7 :     * COPYRIGHT (c) 2011 The Diderot Project (http://diderot-language.cs.uchicago.edu)
8 :     * All rights reserved.
9 :     */
10 :    
11 : jhr 1275 #include <Diderot/diderot.h>
12 : jhr 1411 #include "clinfo.h"
13 : lamonts 1271 #include <sys/sysctl.h>
14 :     #include <sys/stat.h>
15 : jhr 1267
16 : jhr 1492 // FIXME: the kernels should really be in the lib subdirectory
17 :     // path to kernels
18 :     #define KERNELS_FILE_PATH DIDEROT_INCLUDE_PATH "/cl_kernels/kernels.cl"
19 : lamonts 1488
20 : lamonts 1513
21 : lamonts 1488 typedef struct {
22 : jhr 1492 cl_int blkIdx; //!< the id of this block
23 :     cl_int nActive; //!< number of active (status != DIE or STABLE) strands
24 :     cl_int nDead; //!< number of strands in the DIE state
25 :     cl_int nStabilizing; //!< number of new strands in the STABILIZE state
26 :     cl_int nDying; //!< number of new strands in the DIE state
27 : lamonts 1488 } StrandBlock_t;
28 :    
29 :     typedef struct {
30 : jhr 1492 cl_int numStrands; //!< number of strands
31 :     cl_int sId; //!< the index into the todo list or queue
32 : lamonts 1501 cl_int nextStrand; //!< index of the next strand to retrieve from the pool
33 :     cl_int clearQueueSz; //!< an indicator on whether the queue size should be cleared
34 : jhr 1492 cl_int queueSize; //!< number of blocks on the scheduler's queue
35 :     cl_int todoSize; //!< number of blocks on the scheduler's todo list
36 : jhr 1519 cl_int numAvailable; //!< number of active strands left to process
37 : lamonts 1488 } SchedState_t;
38 :    
39 :    
40 : jhr 1267 // NOTE: we probably should put this in a file that supports runtime printing
41 :     static bool VerboseFlg = false;
42 : jhr 1287 static bool TimingFlg = false;
43 : lamonts 1513 static bool NrrdOutputFlg = false;
44 : lamonts 1470 #define WARP_SIZE 32
45 : lamonts 1459
46 : jhr 1473 typedef struct {
47 :     cl_kernel kern; //!< OpenCL kernel that implements the program
48 :     size_t workGrpSize; //!< size of workgroup for this kernel
49 :     cl_ulong localSzb; //!< size of local memory used by kernel
50 :     } GPUKernel_t;
51 :    
52 : jhr 1267 struct struct_world {
53 : jhr 1519 STRUCT_WORLD_PREFIX
54 :     // FIXME: document these fields!
55 : lamonts 1341 void *inState;
56 :     void *outState;
57 : jhr 1471 cl_device_id device; //!< OpenCL device
58 :     cl_context context; //!< OpenCL execution context
59 :     cl_command_queue cmdQ; //!< OpenCL command queue
60 : lamonts 1500 GPUKernel_t kernel[3]; //!< OpenCL kernel that implements the program
61 : jhr 1267 };
62 : jhr 1471
63 : jhr 1519 // FIXME: need documentation for this stuct!
64 : lamonts 1459 typedef struct {
65 : jhr 1519 cl_mem schedMem;
66 :     cl_mem outMem;
67 :     cl_mem inMem;
68 :     cl_mem blocksMem;
69 :     cl_mem strandBlocksIdxsMem;
70 :     cl_mem queueMem;
71 :     cl_mem todoMem;
72 :     cl_mem statusMem;
73 :     } KernelArgs_t;
74 : jhr 1267
75 : jhr 1291 static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld);
76 : jhr 1473 static void CheckErrorCode (cl_int sts, const char * msg);
77 : jhr 1519 static void SetPhase1Args (cl_kernel kernel, int *argCount, KernelArgs_t *args);
78 :     static void SetPhase2Args (cl_kernel kernel, int *argCount, KernelArgs_t *args, int blk_sz);
79 :     static void SetScheduleKernelArgs (cl_kernel kernel, int *argCount, KernelArgs_t *args);
80 : lamonts 1500
81 : jhr 1473 static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern);
82 :    
83 : jhr 1358 extern void Diderot_LoadGlobals (cl_context cxt, cl_kernel kernel, cl_command_queue cmdQ, int argStart);
84 :    
85 : jhr 1267 int main (int argc, const char **argv)
86 :     {
87 :     // get information about OpenCL support
88 :     CLInfo_t *clInfo = GetCLInfo();
89 :     if (clInfo == 0) {
90 : jhr 1327 fprintf (stderr, "no OpenCL support\n");
91 :     exit (1);
92 : jhr 1267 }
93 :    
94 :     Diderot_Options_t *opts = Diderot_OptNew ();
95 :    
96 :     Diderot_OptAddFlag (opts, "verbose", "enable runtime-system messages", &VerboseFlg);
97 : jhr 1411 Diderot_OptAddFlag (opts, "timing", "enable execution timing", &TimingFlg);
98 : lamonts 1513 Diderot_OptAddFlag (opts, "nrrd", "enable nrrd output", &NrrdOutputFlg);
99 : jhr 1267 Diderot_RegisterGlobalOpts (opts);
100 :     Diderot_OptProcess (opts, argc, argv);
101 :     Diderot_OptFree (opts);
102 :    
103 : jhr 1379 if (VerboseFlg)
104 : jhr 1492 PrintCLInfo (stdout, clInfo);
105 : jhr 1379
106 : jhr 1267 // run the generated global initialization code
107 : jhr 1379 if (VerboseFlg)
108 : jhr 1411 fprintf (stderr, "initializing globals ...\n");
109 :     Diderot_InitGlobals ();
110 : lamonts 1316
111 : jhr 1291 Diderot_World_t *wrld = Diderot_Initially (); // this may not be right for OpenCL
112 : lamonts 1459
113 : jhr 1291 if (! InitCL(clInfo, wrld))
114 :     exit (1);
115 : jhr 1282
116 : lamonts 1488 // Conversion of strands from their host types to their shadow types
117 : jhr 1472 void *shadowInState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands);
118 : lamonts 1488 void *shadowOutState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands);
119 : lamonts 1462 uint8_t *strandPtr = (uint8_t *)wrld->inState;
120 :     uint8_t *strandShadowPtr = (uint8_t *)shadowInState;
121 : lamonts 1488 uint8_t *strandShadowOutPtr = (uint8_t *)shadowInState;
122 : lamonts 1462 size_t shadowSize = Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands;
123 :    
124 : jhr 1469 for (int i = 0; i < wrld->numStrands; i++) {
125 :     Diderot_Strands[0]->strandCopy (strandPtr, strandShadowPtr);
126 : lamonts 1488 Diderot_Strands[0]->strandCopy (strandPtr, strandShadowOutPtr);
127 : jhr 1469 strandPtr += Diderot_Strands[0]->stateSzb;
128 :     strandShadowPtr += Diderot_Strands[0]->shadowStrandSzb;
129 : lamonts 1488 strandShadowOutPtr += Diderot_Strands[0]->shadowStrandSzb;
130 : lamonts 1462 }
131 : jhr 1519 // FIXME: it is confusing to use the inState/outState pointers for two different purposes.
132 : jhr 1471 free (wrld->inState);
133 : jhr 1519 free (wrld->outState);
134 : lamonts 1462 wrld->inState = shadowInState;
135 : lamonts 1488 wrld->outState = shadowOutState;
136 : lamonts 1462
137 : jhr 1469 // hack to make the invariant part of the state the same in both copies
138 : lamonts 1462 memcpy (wrld->outState, wrld->inState, shadowSize);
139 :    
140 : jhr 1327 int argCount = 0;
141 :     cl_int sts = CL_SUCCESS;
142 : lamonts 1316
143 : jhr 1519 KernelArgs_t kernelArgs;
144 : lamonts 1488 SchedState_t scheduler;
145 : lamonts 1346
146 : lamonts 1488 scheduler.numStrands = wrld->numStrands;
147 : lamonts 1500 scheduler.nextStrand = 0;
148 : lamonts 1488 scheduler.todoSize = 0;
149 :     scheduler.sId = 0;
150 : lamonts 1501 scheduler.clearQueueSz= 1;
151 : lamonts 1500 scheduler.numAvailable = wrld->numStrands;
152 : lamonts 1488
153 : lamonts 1465 size_t globalWorkSize[1] = {0};
154 : lamonts 1488 size_t localWorkSize[1] = {0};
155 : jhr 1492 int dIdx = clInfo->mainDeviceIdx;
156 : lamonts 1488 int pIdx = clInfo->mainPlatformIdx;
157 :     globalWorkSize[0] = clInfo->platforms[pIdx].devices[dIdx].numCUs * clInfo->platforms[pIdx].devices[dIdx].cuWidth;
158 :     localWorkSize[0] = clInfo->platforms[pIdx].devices[dIdx].cuWidth;
159 : lamonts 1502 int numberOfBlocks = ceil((double)wrld->numStrands/localWorkSize[0]);
160 : lamonts 1488
161 :     size_t strandBlkMemSize = sizeof(int) * numberOfBlocks * clInfo->platforms[pIdx].devices[dIdx].cuWidth;;
162 :     size_t schedListMemSize = sizeof(int) * numberOfBlocks;
163 : jhr 1492 int *schedulerQueue = (int *)CheckedAlloc(schedListMemSize);
164 :     int *schedulerTodoList = (int *)CheckedAlloc(schedListMemSize);
165 :     int *strandBlocksIdxs = (int *)CheckedAlloc(strandBlkMemSize);
166 : lamonts 1488 scheduler.queueSize = numberOfBlocks;
167 : lamonts 1513
168 : lamonts 1488 size_t strandblkMemSize = sizeof(StrandBlock_t) * numberOfBlocks;
169 :     StrandBlock_t * strandBlks = (StrandBlock_t *)CheckedAlloc(strandblkMemSize);
170 : lamonts 1470
171 : jhr 1492 for (int i = 0; i < numberOfBlocks; i++) {
172 : lamonts 1488 schedulerQueue[i] = i;
173 :     strandBlks[i].blkIdx = i;
174 :     strandBlks[i].nActive = 0;
175 :     strandBlks[i].nDead = 0;
176 :     strandBlks[i].nStabilizing = 0;
177 : jhr 1492 strandBlks[i].nDying = 0;
178 : jhr 1432 }
179 : lamonts 1459
180 : lamonts 1488 kernelArgs.schedMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
181 :     sizeof(SchedState_t), &scheduler, &sts);
182 :     CheckErrorCode (sts, "error creating OpenCL scheduler buffer\n");
183 : lamonts 1459
184 : lamonts 1488 kernelArgs.blocksMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
185 :     strandblkMemSize, strandBlks , &sts);
186 :     CheckErrorCode (sts, "error creating OpenCL strand blocks buffer\n");
187 : lamonts 1465
188 : lamonts 1488 kernelArgs.strandBlocksIdxsMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
189 :     strandBlkMemSize, strandBlocksIdxs , &sts);
190 :     CheckErrorCode (sts, "error creating OpenCL strand blocks indices buffer\n");
191 : lamonts 1425
192 : lamonts 1488 kernelArgs.queueMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
193 :     schedListMemSize, schedulerQueue , &sts);
194 :     CheckErrorCode (sts, "error creating OpenCL scheduler queue buffer\n");
195 :    
196 :     kernelArgs.todoMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
197 :     schedListMemSize, schedulerTodoList , &sts);
198 :     CheckErrorCode (sts, "error creating OpenCL scheduler todo buffer\n");
199 :    
200 :     kernelArgs.inMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
201 :     shadowSize, wrld->inState, &sts);
202 :     CheckErrorCode (sts, "error creating OpenCL strand in-state buffer\n");
203 :    
204 :     kernelArgs.outMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
205 :     shadowSize, wrld->outState, &sts);
206 :     CheckErrorCode (sts, "error creating OpenCL strand out-state buffer\n");
207 :    
208 :     kernelArgs.statusMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
209 :     sizeof(int) * wrld->numStrands, wrld->status, &sts);
210 :     CheckErrorCode (sts, "error creating OpenCL world status buffer\n");
211 :    
212 : lamonts 1513 //Setup the Update Kernel's arguments
213 : lamonts 1488 SetPhase1Args(wrld->kernel[0].kern, &argCount, &kernelArgs);
214 :     Diderot_LoadGlobals(wrld->context, wrld->kernel[0].kern, wrld->cmdQ, argCount);
215 : lamonts 1459
216 : lamonts 1513 //Setup the Compaction Kernel's arguments
217 : lamonts 1488 argCount=0;
218 :     SetPhase2Args(wrld->kernel[1].kern, &argCount, &kernelArgs,clInfo->platforms[pIdx].devices[dIdx].cuWidth);
219 :    
220 : lamonts 1513 //Setup the Scheduler Kernel's arguments
221 : lamonts 1500 argCount = 0;
222 :     SetScheduleKernelArgs(wrld->kernel[2].kern, &argCount, &kernelArgs);
223 :    
224 : lamonts 1459 clFinish(wrld->cmdQ);
225 :    
226 :     double t0 = GetTime();
227 :    
228 : jhr 1469 clFinish(wrld->cmdQ);
229 : lamonts 1461
230 : lamonts 1500 while (scheduler.numAvailable > 0) {
231 : jhr 1519 // Runs the update kernel on all strands
232 : lamonts 1488 sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[0].kern, 1, NULL,
233 : jhr 1519 globalWorkSize, localWorkSize, 0, NULL, NULL);
234 : lamonts 1500 CheckErrorCode (sts, "error in executing update kernel\n");
235 : lamonts 1488
236 : jhr 1519 sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern, 0, NULL, NULL);
237 : lamonts 1500 CheckErrorCode (sts, "error in executing scheduler update kernel before compaction\n");
238 : lamonts 1513
239 : lamonts 1500 // Run the compaction kernel on all strands
240 :     sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[1].kern, 1, NULL,
241 : jhr 1492 globalWorkSize, localWorkSize, 0, NULL, NULL);
242 : lamonts 1513 CheckErrorCode (sts, "error ccccn executing compaction kernel\n");
243 : lamonts 1501
244 : lamonts 1500 sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern,0,NULL,NULL);
245 :     CheckErrorCode (sts, "error in executing scheduler update kernel after compaction\n");
246 : lamonts 1459
247 : lamonts 1513 sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.schedMem, CL_TRUE, 0, sizeof(SchedState_t),
248 : lamonts 1500 &scheduler, 0, NULL, NULL);
249 : lamonts 1513 CheckErrorCode (sts, "error reading back scheduler information\n");
250 : lamonts 1459 }
251 : lamonts 1470
252 : lamonts 1488 sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.outMem, CL_TRUE, 0,shadowSize,
253 : jhr 1469 wrld->outState, 0, NULL, NULL);
254 : jhr 1471 CheckErrorCode (sts, "error in reading back output\n");
255 : lamonts 1459
256 : jhr 1469 sts = clEnqueueReadBuffer(
257 : lamonts 1488 wrld->cmdQ, kernelArgs.statusMem, CL_TRUE, 0, sizeof(int) * wrld->numStrands,
258 : jhr 1469 wrld->status, 0, NULL, NULL);
259 : jhr 1471 CheckErrorCode (sts, "error in reading back status\n");
260 : jhr 1342
261 : lamonts 1526 wrld->outputSzb = Diderot_Strands[0]->shadowStrandSzb;
262 : jhr 1469 clFinish (wrld->cmdQ);
263 : jhr 1473 clReleaseKernel(wrld->kernel[0].kern);
264 :     clReleaseKernel(wrld->kernel[1].kern);
265 : lamonts 1316 clReleaseCommandQueue(wrld->cmdQ);
266 :     clReleaseContext(wrld->context);
267 :    
268 : jhr 1287 double totalTime = GetTime() - t0;
269 :    
270 :     if (VerboseFlg)
271 :     fprintf (stderr, "done in %f seconds\n", totalTime);
272 :     else if (TimingFlg)
273 :     printf ("usr=%f\n", totalTime);
274 :    
275 : nseltzer 1453 // here we have the final state of all of the strands in the "in" buffer
276 : lamonts 1513 // output the final strand states
277 :     if (NrrdOutputFlg)
278 : jhr 1519 Diderot_Output (wrld, Diderot_Strands[0]->shadowStrandSzb);
279 : lamonts 1513 else
280 : jhr 1519 Diderot_Print (wrld);
281 : jhr 1287
282 : lamonts 1526
283 : lamonts 1488 Diderot_Shutdown (wrld);
284 : jhr 1287
285 :     return 0;
286 :    
287 : lamonts 1271 }
288 : jhr 1469
289 : jhr 1519 static void SetPhase1Args (cl_kernel kernel, int *argCount, KernelArgs_t *args)
290 : lamonts 1459 {
291 : jhr 1492 int count = *argCount;
292 : lamonts 1459 cl_int sts = CL_SUCCESS;
293 : jhr 1282
294 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->inMem);
295 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand in-state argument\n");
296 : lamonts 1488
297 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->outMem);
298 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand out-state argument\n");
299 : lamonts 1459
300 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem);
301 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL world status argument\n");
302 : lamonts 1459
303 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem);
304 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler argument\n");
305 : lamonts 1459
306 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem);
307 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks argument\n");
308 : lamonts 1459
309 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem);
310 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks' indices argument\n");
311 : lamonts 1459
312 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem);
313 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler queue argument\n");
314 : lamonts 1488
315 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem);
316 : lamonts 1500 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler todo argument\n");
317 : lamonts 1488
318 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL);
319 : lamonts 1513 CheckErrorCode (sts, "Update Kernel: error Setting OpenCL local strand blockargument\n");
320 : lamonts 1488
321 : jhr 1492 *argCount = count;
322 :    
323 : jhr 1469 }
324 :    
325 : jhr 1519 static void SetPhase2Args (cl_kernel kernel, int *argCount, KernelArgs_t *args, int blk_sz)
326 : lamonts 1459 {
327 : jhr 1492 int count = *argCount;
328 :     cl_int sts = CL_SUCCESS;
329 : lamonts 1459
330 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem);
331 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL world status argument\n");
332 : lamonts 1459
333 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem);
334 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler argument\n");
335 : lamonts 1488
336 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem);
337 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks argument\n");
338 : lamonts 1488
339 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem);
340 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks' indices argument\n");
341 : lamonts 1488
342 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem);
343 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler queue argument\n");
344 : lamonts 1488
345 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem);
346 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler todo argument\n");
347 : lamonts 1488
348 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL);
349 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local strand blockargument\n");
350 : lamonts 1488
351 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL);
352 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preStable argument\n");
353 : lamonts 1488
354 : jhr 1492 sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL);
355 : lamonts 1500 CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preDead argument\n");
356 : lamonts 1488
357 : lamonts 1500 sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz * 2, NULL);
358 :     CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local temporary array for the prefix scan of preStable preDead argument\n");
359 : lamonts 1488
360 : lamonts 1513
361 : jhr 1492 *argCount = count;
362 : jhr 1469 }
363 : jhr 1519 static void SetScheduleKernelArgs(cl_kernel kernel, int *argCount, KernelArgs_t *args)
364 : lamonts 1500 {
365 :     int count = *argCount;
366 :     cl_int sts = CL_SUCCESS;
367 :    
368 :     sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem);
369 :     CheckErrorCode (sts, "Scheduler Kernel: error setting OpenCL scheduler argument\n");
370 : jhr 1469
371 : lamonts 1500 *argCount = count;
372 :     }
373 :    
374 : jhr 1471 static void CheckErrorCode (cl_int sts, const char *msg)
375 : lamonts 1459 {
376 :     if (sts != CL_SUCCESS) {
377 : jhr 1464 fprintf (stderr, "%s", msg);
378 : lamonts 1459 exit(1);
379 :     }
380 : jhr 1469 }
381 :    
382 : jhr 1287 /*! \brief load OpenCL code from a file
383 :     */
384 : jhr 1291 static char *LoadSource (const char *filename)
385 : lamonts 1271 {
386 :     struct stat statbuf;
387 : jhr 1287 if (stat(filename, &statbuf) < 0) {
388 : jhr 1327 fprintf (stderr, "unable to stat OpenCL source file %s\n", filename);
389 :     exit (1);
390 : jhr 1287 }
391 :    
392 : jhr 1472 char *source = (char *)CheckedAlloc(statbuf.st_size + 1);
393 : jhr 1287 if (source == 0) {
394 : jhr 1327 fprintf (stderr, "unable to allocate memory for OpenCL source\n");
395 :     exit (1);
396 : jhr 1287 }
397 :    
398 :     FILE *fh = fopen(filename, "r");
399 :     if ((fh == 0)
400 :     || (fread(source, statbuf.st_size, 1, fh) != 1)) {
401 : jhr 1327 fprintf (stderr, "unable to read OpenCL source from %s\n", filename);
402 :     exit (1);
403 : jhr 1287 }
404 :     source[statbuf.st_size] = '\0';
405 :     fclose (fh);
406 : lamonts 1271
407 :     return source;
408 : jhr 1342 }
409 :    
410 : jhr 1473 //! create a kernel object from a program
411 :     static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern)
412 :     {
413 :     cl_int sts;
414 :    
415 :     kern->kern = clCreateKernel(prog, name, &sts);
416 :     if (sts != CL_SUCCESS) {
417 :     fprintf (stderr, "error getting %s from program\n", name);
418 :     return false;
419 :     }
420 :     sts = clGetKernelWorkGroupInfo (
421 :     kern->kern, dev, CL_KERNEL_WORK_GROUP_SIZE,
422 :     sizeof(size_t), &(kern->workGrpSize), 0);
423 :     if (sts != CL_SUCCESS) {
424 :     fprintf (stderr, "error getting workgroup size for %s\n", name);
425 :     return false;
426 :     }
427 :     sts = clGetKernelWorkGroupInfo (
428 :     kern->kern, dev, CL_KERNEL_LOCAL_MEM_SIZE,
429 :     sizeof(cl_ulong), &(kern->localSzb), 0);
430 :     if (sts != CL_SUCCESS) {
431 :     fprintf (stderr, "error getting local memory size for %s\n", name);
432 :     return false;
433 :     }
434 :    
435 :     if (VerboseFlg) {
436 :     fprintf(stderr, "kernel %s: workgroup size = %d, local memory = %d bytes\n",
437 :     name, (int)kern->workGrpSize, (int)kern->localSzb);
438 :     }
439 :    
440 :     return true;
441 :     }
442 :    
443 : jhr 1352 static void LogMessagesToStderr (const char *errstr, const void *private_info, size_t cb, void *user_data)
444 : lamonts 1328 {
445 : jhr 1480 fprintf(stderr, "***** error log *****\n");
446 : jhr 1342 fprintf(stderr, "%s\n", errstr);
447 : jhr 1480 fprintf(stderr, "***** end error log *****\n");
448 : jhr 1342 }
449 : jhr 1267
450 : jhr 1417 /*! \brief initialize the OpenCL execution context, including loading and compiling the OpenCL
451 :     * program.
452 :     * \param clInfo points to the summary information about the available OpenCL devices.
453 :     * \param wrld the Diderot execution information.
454 :     * \return true if initialization is successful, otherwise false.
455 : jhr 1291 */
456 :     static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld)
457 :     {
458 : jhr 1327 cl_int sts;
459 : jhr 1291
460 :     // find a GPU on platform[0]
461 : jhr 1420 DeviceInfo_t *dev = 0;
462 : lamonts 1488 clInfo->mainPlatformIdx = 0;
463 : jhr 1291 int i;
464 :     for (i = 0; i < clInfo->platforms[0].numDevices; i++) {
465 : jhr 1417 if (isGPUDevice (&(clInfo->platforms[0].devices[i]))
466 : jhr 1472 && clInfo->platforms[0].devices[i].isAvail) {
467 : jhr 1420 dev = &(clInfo->platforms[0].devices[i]);
468 : lamonts 1488 clInfo->mainDeviceIdx = i;
469 : jhr 1291 break;
470 :     }
471 :     }
472 : lamonts 1488
473 : jhr 1420 if (dev == 0) {
474 : jhr 1291 fprintf (stderr, "unable to find GPU device\n");
475 :     return false;
476 :     }
477 :    
478 : jhr 1469 if (VerboseFlg) {
479 :     fprintf (stderr, "using platform 0, device %d: %s\n",
480 :     i, clInfo->platforms[0].devices[i].name);
481 :     }
482 :    
483 : jhr 1291 // create the context
484 : jhr 1420 cl_context cxt = clCreateContext(0, 1, &(dev->id), LogMessagesToStderr, 0, &sts);
485 : jhr 1291 if (sts != CL_SUCCESS) {
486 : jhr 1327 fprintf (stderr, "error creating OpenCL context\n");
487 : jhr 1291 return false;
488 :     }
489 :    
490 :     // create the program from the source
491 :     int fnameLen = strlen(wrld->name) + 4; // name + ".cl\0"
492 : jhr 1472 char *fname = (char *)CheckedAlloc(fnameLen);
493 : jhr 1291 snprintf(fname, fnameLen, "%s.cl", wrld->name);
494 : lamonts 1459 char *updateSource = LoadSource (fname);
495 : jhr 1464 free (fname);
496 : lamonts 1459
497 : jhr 1492 // load scheduler kernels
498 :     char *kernelsSource = LoadSource(KERNELS_FILE_PATH);
499 :    
500 : lamonts 1513 const char *src[2] = {kernelsSource,updateSource};
501 : jhr 1464 cl_program prog = clCreateProgramWithSource(cxt, 2, src, NULL, &sts);
502 : jhr 1492 free (updateSource);
503 :     free (kernelsSource);
504 : jhr 1291 if (sts != CL_SUCCESS) {
505 : jhr 1327 fprintf (stderr, "error creating program\n");
506 : jhr 1291 return false;
507 :     }
508 :    
509 :     // build the program
510 : jhr 1420 char options[1024];
511 :     snprintf (options, sizeof(options),
512 : jhr 1472 "-D DIDEROT_CL_VERSION=%d -D DIDEROT_CU_WIDTH=%d -I %s -w",
513 : jhr 1420 100*dev->majorVersion + dev->minorVersion,
514 : jhr 1472 dev->cuWidth,
515 : jhr 1420 DIDEROT_INCLUDE_PATH);
516 : jhr 1472 if (VerboseFlg) {
517 :     fprintf (stderr, "clBuildProgram options: %s\n", options);
518 :     }
519 : jhr 1420 sts = clBuildProgram (prog, 1, &(dev->id), options, 0, 0);
520 : jhr 1291 if (sts != CL_SUCCESS) {
521 : jhr 1352 size_t logSzb;
522 : jhr 1420 clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, 0, 0, &logSzb);
523 : jhr 1472 char *log = CheckedAlloc(logSzb+1);
524 : jhr 1420 clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, logSzb, log, &logSzb);
525 : jhr 1352 log[logSzb] = '\0';
526 :     fprintf (stderr, "error compiling program:\n%s\n", log);
527 :     free (log);
528 : jhr 1291 return false;
529 :     }
530 :    
531 : jhr 1473 // extract the kernels from the program
532 : lamonts 1488 if ((! CreateKernel (dev->id, prog, "Diderot_UpdateKernel", &(wrld->kernel[0])))
533 : lamonts 1500 || (! CreateKernel (dev->id, prog, "Diderot_CompactionKernel", &(wrld->kernel[1])))
534 :     || (! CreateKernel (dev->id, prog, "Diderot_SchedUpdateKernel", &(wrld->kernel[2]))))
535 : jhr 1291 return false;
536 :    
537 : jhr 1417 // create the command queue
538 : jhr 1420 cl_command_queue q = clCreateCommandQueue(cxt, dev->id, 0, &sts);
539 : jhr 1417 if (sts != CL_SUCCESS) {
540 :     fprintf (stderr, "error creating OpenCL command queue\n");
541 :     return false;
542 :     }
543 :    
544 : jhr 1291 // initialize world info
545 : jhr 1420 wrld->device = dev->id;
546 : jhr 1291 wrld->context = cxt;
547 :     wrld->cmdQ = q;
548 :    
549 :     return true;
550 :     }
551 : jhr 1327
552 : jhr 1267 // this should be the part of the scheduler
553 :     void *Diderot_AllocStrand (Strand_t *strand)
554 :     {
555 : jhr 1472 return CheckedAlloc(strand->stateSzb);
556 : jhr 1267 }
557 :    
558 :     // block allocation of an initial collection of strands
559 : jhr 1519 Diderot_World_t *Diderot_AllocInitially (
560 : jhr 1287 const char *name, // the name of the program
561 : jhr 1267 Strand_t *strand, // the type of strands being allocated
562 :     bool isArray, // is the initialization an array or collection?
563 :     uint32_t nDims, // depth of iteration nesting
564 :     int32_t *base, // nDims array of base indices
565 :     uint32_t *size) // nDims array of iteration sizes
566 :     {
567 : jhr 1472 Diderot_World_t *wrld = NEW(Diderot_World_t);
568 : jhr 1267 if (wrld == 0) {
569 :     fprintf (stderr, "unable to allocate world\n");
570 :     exit (1);
571 :     }
572 :    
573 : jhr 1287 wrld->name = name; /* NOTE: we are assuming that name is statically allocated! */
574 : jhr 1267 wrld->isArray = isArray;
575 :     wrld->nDims = nDims;
576 : jhr 1472 wrld->base = NEWVEC(int32_t, nDims);
577 :     wrld->size = NEWVEC(uint32_t, nDims);
578 : jhr 1267 size_t numStrands = 1;
579 :     for (int i = 0; i < wrld->nDims; i++) {
580 :     numStrands *= size[i];
581 :     wrld->base[i] = base[i];
582 :     wrld->size[i] = size[i];
583 :     }
584 :    
585 : jhr 1378 if (VerboseFlg) {
586 :     printf("AllocInitially: %d", size[0]);
587 :     for (int i = 1; i < nDims; i++) printf(" x %d", size[i]);
588 :     printf("\n");
589 :     }
590 : jhr 1267
591 :     // allocate the strand state pointers
592 :     wrld->numStrands = numStrands;
593 : jhr 1519 wrld->inState = CheckedAlloc (strand->stateSzb * numStrands);
594 : jhr 1472 wrld->outState = CheckedAlloc (strand->shadowStrandSzb * numStrands);
595 : jhr 1519 wrld->status = NEWVEC(StatusInt_t, numStrands);
596 : jhr 1267
597 :     // initialize strand state pointers etc.
598 :     for (size_t i = 0; i < numStrands; i++) {
599 : lamonts 1351 wrld->status[i] = DIDEROT_ACTIVE;
600 : jhr 1267 }
601 :    
602 :     return wrld;
603 :    
604 :     }
605 :    
606 :     // get strand state pointers
607 :     void *Diderot_InState (Diderot_World_t *wrld, uint32_t i)
608 :     {
609 :     assert (i < wrld->numStrands);
610 : jhr 1519 return &(wrld->inState[i]);
611 : jhr 1267 }
612 :    
613 :     void *Diderot_OutState (Diderot_World_t *wrld, uint32_t i)
614 :     {
615 :     assert (i < wrld->numStrands);
616 : lamonts 1526 return wrld->outState + (i * wrld->outputSzb);
617 : jhr 1267 }
618 :    
619 : jhr 1354 /***** Support for shadow image values *****/
620 : jhr 1435
621 : jhr 1354 void ShadowImage1D (cl_context cxt, Shadow_image1D_t *dst, Diderot_image1D_t *src)
622 :     {
623 :     dst->size[0] = src->size[0];
624 :     dst->s = src->s;
625 :     dst->t = dst->t;
626 :     }
627 : jhr 1435
628 : jhr 1354 void ShadowImage2D (cl_context cxt, Shadow_image2D_t *dst, Diderot_image2D_t *src)
629 :     {
630 : lamonts 1461 dst->size[0] = src->size[0];
631 : jhr 1354 dst->size[1] = src->size[1];
632 : jhr 1355 ShadowMat2x2 (dst->w2i, src->w2i);
633 : lamonts 1461 ShadowVec2 (&dst->tVec, src->tVec);
634 : jhr 1355 ShadowMat2x2 (dst->w2iT, src->w2iT);
635 : jhr 1354 }
636 :    
637 :     void ShadowImage3D (cl_context cxt, Shadow_image3D_t *dst, Diderot_image3D_t *src)
638 :     {
639 :     dst->size[0] = src->size[0];
640 :     dst->size[1] = src->size[1];
641 :     dst->size[2] = src->size[2];
642 : jhr 1355 ShadowMat3x3 (dst->w2i, src->w2i);
643 : lamonts 1461 ShadowVec3 (&dst->tVec, src->tVec);
644 : jhr 1435 ShadowMat3x3 (dst->w2iT, src->w2iT);
645 : jhr 1354 }

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0