SCM Repository
Annotation of /branches/vis12/src/lib/cl-target/main.c
Parent Directory
|
Revision Log
Revision 1685 - (view) (download) (as text)
1 : | jhr | 1671 | /*! \file main.c |
2 : | * | ||
3 : | * \author John Reppy | ||
4 : | */ | ||
5 : | |||
6 : | /* | ||
7 : | * COPYRIGHT (c) 2011 The Diderot Project (http://diderot-language.cs.uchicago.edu) | ||
8 : | * All rights reserved. | ||
9 : | */ | ||
10 : | |||
11 : | #include <Diderot/diderot.h> | ||
12 : | #include "clinfo.h" | ||
13 : | #include <sys/sysctl.h> | ||
14 : | #include <sys/stat.h> | ||
15 : | |||
16 : | typedef struct { | ||
17 : | cl_int blkIdx; //!< the id of this block | ||
18 : | cl_int nActive; //!< number of active (status != DIE or STABLE) strands | ||
19 : | cl_int nDead; //!< number of strands in the DIE state | ||
20 : | cl_int nStabilizing; //!< number of new strands in the STABILIZE state | ||
21 : | cl_int nDying; //!< number of new strands in the DIE state | ||
22 : | } StrandBlock_t; | ||
23 : | |||
24 : | typedef struct { | ||
25 : | cl_int numStrands; //!< number of strands | ||
26 : | cl_int sId; //!< the index into the todo list or queue | ||
27 : | cl_int nextStrand; //!< index of the next strand to retrieve from the pool | ||
28 : | cl_int clearQueueSz; //!< an indicator on whether the queue size should be cleared | ||
29 : | cl_int queueSize; //!< number of blocks on the scheduler's queue | ||
30 : | cl_int todoSize; //!< number of blocks on the scheduler's todo list | ||
31 : | cl_int numAvailable; //!< number of active strands left to process | ||
32 : | } SchedState_t; | ||
33 : | |||
34 : | typedef struct { | ||
35 : | cl_kernel kern; //!< OpenCL kernel that implements the program | ||
36 : | size_t workGrpSize; //!< size of workgroup for this kernel | ||
37 : | cl_ulong localSzb; //!< size of local memory used by kernel | ||
38 : | } GPUKernel_t; | ||
39 : | |||
40 : | struct struct_world { | ||
41 : | STRUCT_WORLD_PREFIX | ||
42 : | // FIXME: document these fields! | ||
43 : | void *inState; | ||
44 : | void *outState; | ||
45 : | DeviceInfo_t *device; //!< info about OpenCL device that we are using. | ||
46 : | uint32_t nWorkers; //!< number of work groups to create | ||
47 : | cl_context context; //!< OpenCL execution context | ||
48 : | cl_command_queue cmdQ; //!< OpenCL command queue | ||
49 : | GPUKernel_t kernel[3]; //!< OpenCL kernel that implements the program | ||
50 : | }; | ||
51 : | |||
52 : | // FIXME: need documentation for this stuct! | ||
53 : | typedef struct { | ||
54 : | cl_mem schedMem; | ||
55 : | cl_mem outMem; | ||
56 : | cl_mem inMem; | ||
57 : | cl_mem blocksMem; | ||
58 : | cl_mem strandBlocksIdxsMem; | ||
59 : | cl_mem queueMem; | ||
60 : | cl_mem todoMem; | ||
61 : | cl_mem statusMem; | ||
62 : | } KernelArgs_t; | ||
63 : | |||
64 : | static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld); | ||
65 : | static void CheckErrorCode (cl_int sts, const char * msg); | ||
66 : | static void SetPhase1Args (cl_kernel kernel, int *argCount, KernelArgs_t *args); | ||
67 : | static void SetPhase2Args (cl_kernel kernel, int *argCount, KernelArgs_t *args, int blk_sz); | ||
68 : | static void SetScheduleKernelArgs (cl_kernel kernel, int *argCount, KernelArgs_t *args); | ||
69 : | |||
70 : | static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern); | ||
71 : | |||
72 : | extern void Diderot_LoadGlobals (cl_context cxt, cl_kernel kernel, cl_command_queue cmdQ, int argStart); | ||
73 : | |||
74 : | int main (int argc, const char **argv) | ||
75 : | { | ||
76 : | // get information about OpenCL support | ||
77 : | CLInfo_t *clInfo = GetCLInfo(); | ||
78 : | if (clInfo == 0) { | ||
79 : | fprintf (stderr, "no OpenCL support\n"); | ||
80 : | exit (1); | ||
81 : | } | ||
82 : | |||
83 : | // run the generated global initialization code | ||
84 : | if (VerboseFlg) | ||
85 : | fprintf (stderr, "initializing globals ...\n"); | ||
86 : | Diderot_InitGlobals (); | ||
87 : | |||
88 : | Diderot_int_t nWorkersPerCU = 4; | ||
89 : | Diderot_Options_t *opts = Diderot_OptNew (); | ||
90 : | Diderot_OptAddInt (opts, "np", "specify number of workers/CU", &nWorkersPerCU, true); | ||
91 : | Diderot_RegisterGlobalOpts (opts); | ||
92 : | Diderot_OptProcess (opts, argc, argv); | ||
93 : | Diderot_OptFree (opts); | ||
94 : | |||
95 : | if (VerboseFlg) | ||
96 : | PrintCLInfo (stdout, clInfo); | ||
97 : | |||
98 : | Diderot_World_t *wrld = Diderot_Initially (); // this may not be right for OpenCL | ||
99 : | |||
100 : | if (! InitCL(clInfo, wrld)) | ||
101 : | exit (1); | ||
102 : | |||
103 : | // set the number of work groups | ||
104 : | if ((0 < nWorkersPerCU) && (nWorkersPerCU < wrld->device->maxWISize[0] / wrld->device->cuWidth)) | ||
105 : | wrld->nWorkers = nWorkersPerCU * wrld->device->numCUs; | ||
106 : | else | ||
107 : | wrld->nWorkers = wrld->device->numCUs; // fallback to one worker per CU | ||
108 : | if (VerboseFlg) | ||
109 : | fprintf (stderr, "using %d x %d threads\n", | ||
110 : | wrld->nWorkers, wrld->device->cuWidth); | ||
111 : | |||
112 : | // Conversion of strands from their host types to their shadow types | ||
113 : | void *shadowInState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands); | ||
114 : | void *shadowOutState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands); | ||
115 : | uint8_t *strandPtr = (uint8_t *)wrld->inState; | ||
116 : | uint8_t *strandShadowPtr = (uint8_t *)shadowInState; | ||
117 : | uint8_t *strandShadowOutPtr = (uint8_t *)shadowOutState; | ||
118 : | size_t shadowSize = Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands; | ||
119 : | |||
120 : | for (int i = 0; i < wrld->numStrands; i++) { | ||
121 : | Diderot_Strands[0]->strandCopy (strandPtr, strandShadowPtr); | ||
122 : | Diderot_Strands[0]->strandCopy (strandPtr, strandShadowOutPtr); | ||
123 : | strandPtr += Diderot_Strands[0]->stateSzb; | ||
124 : | strandShadowPtr += Diderot_Strands[0]->shadowStrandSzb; | ||
125 : | strandShadowOutPtr += Diderot_Strands[0]->shadowStrandSzb; | ||
126 : | } | ||
127 : | // FIXME: it is confusing to use the inState/outState pointers for two different purposes. | ||
128 : | free (wrld->inState); | ||
129 : | free (wrld->outState); | ||
130 : | wrld->inState = shadowInState; | ||
131 : | wrld->outState = shadowOutState; | ||
132 : | |||
133 : | // hack to make the invariant part of the state the same in both copies | ||
134 : | memcpy (wrld->outState, wrld->inState, shadowSize); | ||
135 : | |||
136 : | int argCount = 0; | ||
137 : | cl_int sts = CL_SUCCESS; | ||
138 : | |||
139 : | KernelArgs_t kernelArgs; | ||
140 : | SchedState_t scheduler; | ||
141 : | |||
142 : | scheduler.numStrands = wrld->numStrands; | ||
143 : | scheduler.nextStrand = 0; | ||
144 : | scheduler.todoSize = 0; | ||
145 : | scheduler.sId = 0; | ||
146 : | scheduler.clearQueueSz= 1; | ||
147 : | scheduler.numAvailable = wrld->numStrands; | ||
148 : | |||
149 : | size_t globalWorkSize[1] = {0}; | ||
150 : | size_t localWorkSize[1] = {0}; | ||
151 : | globalWorkSize[0] = wrld->nWorkers * wrld->device->cuWidth; | ||
152 : | localWorkSize[0] = wrld->device->cuWidth; | ||
153 : | |||
154 : | // compute the number of scheduler blocks | ||
155 : | int numberOfBlocks = (wrld->numStrands + wrld->device->cuWidth - 1) / wrld->device->cuWidth; | ||
156 : | assert (numberOfBlocks * wrld->device->cuWidth >= wrld->numStrands); | ||
157 : | |||
158 : | size_t strandBlkMemSize = sizeof(int) * numberOfBlocks * wrld->device->cuWidth; | ||
159 : | size_t schedListMemSize = sizeof(int) * numberOfBlocks; | ||
160 : | int *schedulerQueue = (int *)CheckedAlloc(schedListMemSize); | ||
161 : | int *schedulerTodoList = (int *)CheckedAlloc(schedListMemSize); | ||
162 : | int *strandBlocksIdxs = (int *)CheckedAlloc(strandBlkMemSize); | ||
163 : | scheduler.queueSize = numberOfBlocks; | ||
164 : | |||
165 : | size_t strandblkMemSize = sizeof(StrandBlock_t) * numberOfBlocks; | ||
166 : | StrandBlock_t *strandBlks = (StrandBlock_t *)CheckedAlloc(strandblkMemSize); | ||
167 : | |||
168 : | for (int i = 0; i < numberOfBlocks; i++) { | ||
169 : | schedulerQueue[i] = i; | ||
170 : | strandBlks[i].blkIdx = i; | ||
171 : | strandBlks[i].nActive = 0; | ||
172 : | strandBlks[i].nDead = 0; | ||
173 : | strandBlks[i].nStabilizing = 0; | ||
174 : | strandBlks[i].nDying = 0; | ||
175 : | } | ||
176 : | |||
177 : | kernelArgs.schedMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
178 : | sizeof(SchedState_t), &scheduler, &sts); | ||
179 : | CheckErrorCode (sts, "error creating OpenCL scheduler buffer\n"); | ||
180 : | |||
181 : | kernelArgs.blocksMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
182 : | strandblkMemSize, strandBlks , &sts); | ||
183 : | CheckErrorCode (sts, "error creating OpenCL strand blocks buffer\n"); | ||
184 : | |||
185 : | kernelArgs.strandBlocksIdxsMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
186 : | strandBlkMemSize, strandBlocksIdxs , &sts); | ||
187 : | CheckErrorCode (sts, "error creating OpenCL strand blocks indices buffer\n"); | ||
188 : | |||
189 : | kernelArgs.queueMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
190 : | schedListMemSize, schedulerQueue , &sts); | ||
191 : | CheckErrorCode (sts, "error creating OpenCL scheduler queue buffer\n"); | ||
192 : | |||
193 : | kernelArgs.todoMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
194 : | schedListMemSize, schedulerTodoList , &sts); | ||
195 : | CheckErrorCode (sts, "error creating OpenCL scheduler todo buffer\n"); | ||
196 : | |||
197 : | kernelArgs.inMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
198 : | shadowSize, wrld->inState, &sts); | ||
199 : | CheckErrorCode (sts, "error creating OpenCL strand in-state buffer\n"); | ||
200 : | |||
201 : | kernelArgs.outMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
202 : | shadowSize, wrld->outState, &sts); | ||
203 : | CheckErrorCode (sts, "error creating OpenCL strand out-state buffer\n"); | ||
204 : | |||
205 : | kernelArgs.statusMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
206 : | sizeof(int) * wrld->numStrands, wrld->status, &sts); | ||
207 : | CheckErrorCode (sts, "error creating OpenCL world status buffer\n"); | ||
208 : | |||
209 : | //Setup the Update Kernel's arguments | ||
210 : | SetPhase1Args (wrld->kernel[0].kern, &argCount, &kernelArgs); | ||
211 : | Diderot_LoadGlobals (wrld->context, wrld->kernel[0].kern, wrld->cmdQ, argCount); | ||
212 : | |||
213 : | //Setup the Compaction Kernel's arguments | ||
214 : | argCount=0; | ||
215 : | SetPhase2Args (wrld->kernel[1].kern, &argCount, &kernelArgs, wrld->device->cuWidth); | ||
216 : | |||
217 : | //Setup the Scheduler Kernel's arguments | ||
218 : | argCount = 0; | ||
219 : | SetScheduleKernelArgs (wrld->kernel[2].kern, &argCount, &kernelArgs); | ||
220 : | |||
221 : | double t0 = airTime(); | ||
222 : | |||
223 : | while (scheduler.numAvailable > 0) { | ||
224 : | // Runs the update kernel on all strands | ||
225 : | sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[0].kern, 1, NULL, | ||
226 : | globalWorkSize, localWorkSize, 0, NULL, NULL); | ||
227 : | CheckErrorCode (sts, "error in executing update kernel\n"); | ||
228 : | |||
229 : | sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern, 0, NULL, NULL); | ||
230 : | CheckErrorCode (sts, "error in executing scheduler update kernel before compaction\n"); | ||
231 : | |||
232 : | // Run the compaction kernel on all strands | ||
233 : | sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[1].kern, 1, NULL, | ||
234 : | globalWorkSize, localWorkSize, 0, NULL, NULL); | ||
235 : | CheckErrorCode (sts, "error ccccn executing compaction kernel\n"); | ||
236 : | |||
237 : | sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern,0,NULL,NULL); | ||
238 : | CheckErrorCode (sts, "error in executing scheduler update kernel after compaction\n"); | ||
239 : | |||
240 : | sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.schedMem, CL_TRUE, 0, sizeof(SchedState_t), | ||
241 : | &scheduler, 0, NULL, NULL); | ||
242 : | CheckErrorCode (sts, "error reading back scheduler information\n"); | ||
243 : | } | ||
244 : | |||
245 : | sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.outMem, CL_TRUE, 0,shadowSize, | ||
246 : | wrld->outState, 0, NULL, NULL); | ||
247 : | CheckErrorCode (sts, "error in reading back output\n"); | ||
248 : | |||
249 : | sts = clEnqueueReadBuffer( | ||
250 : | wrld->cmdQ, kernelArgs.statusMem, CL_TRUE, 0, sizeof(int) * wrld->numStrands, | ||
251 : | wrld->status, 0, NULL, NULL); | ||
252 : | CheckErrorCode (sts, "error in reading back status\n"); | ||
253 : | |||
254 : | wrld->outputSzb = Diderot_Strands[0]->shadowStrandSzb; | ||
255 : | clFinish (wrld->cmdQ); | ||
256 : | clReleaseKernel(wrld->kernel[0].kern); | ||
257 : | clReleaseKernel(wrld->kernel[1].kern); | ||
258 : | clReleaseCommandQueue(wrld->cmdQ); | ||
259 : | clReleaseContext(wrld->context); | ||
260 : | |||
261 : | double totalTime = airTime() - t0; | ||
262 : | |||
263 : | if (VerboseFlg) | ||
264 : | fprintf (stderr, "done in %f seconds\n", totalTime); | ||
265 : | else if (TimingFlg) | ||
266 : | printf ("usr=%f\n", totalTime); | ||
267 : | |||
268 : | // here we have the final state of all of the strands in the "in" buffer | ||
269 : | // output the final strand states | ||
270 : | if (NrrdOutputFlg) | ||
271 : | Diderot_Output (wrld, Diderot_Strands[0]->shadowStrandSzb); | ||
272 : | else | ||
273 : | Diderot_Print (wrld); | ||
274 : | |||
275 : | Diderot_Shutdown (wrld); | ||
276 : | |||
277 : | return 0; | ||
278 : | |||
279 : | } | ||
280 : | |||
281 : | static void SetPhase1Args (cl_kernel kernel, int *argCount, KernelArgs_t *args) | ||
282 : | { | ||
283 : | int count = *argCount; | ||
284 : | cl_int sts = CL_SUCCESS; | ||
285 : | |||
286 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->inMem); | ||
287 : | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand in-state argument\n"); | ||
288 : | |||
289 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->outMem); | ||
290 : | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand out-state argument\n"); | ||
291 : | |||
292 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem); | ||
293 : | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL world status argument\n"); | ||
294 : | |||
295 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem); | ||
296 : | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler argument\n"); | ||
297 : | |||
298 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem); | ||
299 : | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks argument\n"); | ||
300 : | |||
301 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem); | ||
302 : | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks' indices argument\n"); | ||
303 : | |||
304 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem); | ||
305 : | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler queue argument\n"); | ||
306 : | |||
307 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem); | ||
308 : | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler todo argument\n"); | ||
309 : | |||
310 : | sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL); | ||
311 : | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL local strand blockargument\n"); | ||
312 : | |||
313 : | *argCount = count; | ||
314 : | |||
315 : | } | ||
316 : | |||
317 : | static void SetPhase2Args (cl_kernel kernel, int *argCount, KernelArgs_t *args, int blk_sz) | ||
318 : | { | ||
319 : | int count = *argCount; | ||
320 : | cl_int sts = CL_SUCCESS; | ||
321 : | |||
322 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem); | ||
323 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL world status argument\n"); | ||
324 : | |||
325 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem); | ||
326 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler argument\n"); | ||
327 : | |||
328 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem); | ||
329 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks argument\n"); | ||
330 : | |||
331 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem); | ||
332 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks' indices argument\n"); | ||
333 : | |||
334 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem); | ||
335 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler queue argument\n"); | ||
336 : | |||
337 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem); | ||
338 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler todo argument\n"); | ||
339 : | |||
340 : | sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL); | ||
341 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local strand blockargument\n"); | ||
342 : | |||
343 : | sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL); | ||
344 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preStable argument\n"); | ||
345 : | |||
346 : | sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL); | ||
347 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preDead argument\n"); | ||
348 : | |||
349 : | sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz * 2, NULL); | ||
350 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local temporary array for the prefix scan of preStable preDead argument\n"); | ||
351 : | |||
352 : | |||
353 : | *argCount = count; | ||
354 : | } | ||
355 : | |||
356 : | static void SetScheduleKernelArgs(cl_kernel kernel, int *argCount, KernelArgs_t *args) | ||
357 : | { | ||
358 : | int count = *argCount; | ||
359 : | cl_int sts = CL_SUCCESS; | ||
360 : | |||
361 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem); | ||
362 : | CheckErrorCode (sts, "Scheduler Kernel: error setting OpenCL scheduler argument\n"); | ||
363 : | |||
364 : | *argCount = count; | ||
365 : | } | ||
366 : | |||
367 : | static void CheckErrorCode (cl_int sts, const char *msg) | ||
368 : | { | ||
369 : | if (sts != CL_SUCCESS) { | ||
370 : | fprintf (stderr, "%s", msg); | ||
371 : | exit(1); | ||
372 : | } | ||
373 : | } | ||
374 : | |||
375 : | /*! \brief load OpenCL code from a file | ||
376 : | */ | ||
377 : | static char *LoadSource (const char *filename) | ||
378 : | { | ||
379 : | struct stat statbuf; | ||
380 : | if (stat(filename, &statbuf) < 0) { | ||
381 : | fprintf (stderr, "unable to stat OpenCL source file %s\n", filename); | ||
382 : | exit (1); | ||
383 : | } | ||
384 : | |||
385 : | char *source = (char *)CheckedAlloc(statbuf.st_size + 1); | ||
386 : | if (source == 0) { | ||
387 : | fprintf (stderr, "unable to allocate memory for OpenCL source\n"); | ||
388 : | exit (1); | ||
389 : | } | ||
390 : | |||
391 : | FILE *fh = fopen(filename, "r"); | ||
392 : | if ((fh == 0) | ||
393 : | || (fread(source, statbuf.st_size, 1, fh) != 1)) { | ||
394 : | fprintf (stderr, "unable to read OpenCL source from %s\n", filename); | ||
395 : | exit (1); | ||
396 : | } | ||
397 : | source[statbuf.st_size] = '\0'; | ||
398 : | fclose (fh); | ||
399 : | |||
400 : | return source; | ||
401 : | } | ||
402 : | |||
403 : | //! create a kernel object from a program | ||
404 : | static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern) | ||
405 : | { | ||
406 : | cl_int sts; | ||
407 : | |||
408 : | kern->kern = clCreateKernel(prog, name, &sts); | ||
409 : | if (sts != CL_SUCCESS) { | ||
410 : | fprintf (stderr, "error getting %s from program\n", name); | ||
411 : | return false; | ||
412 : | } | ||
413 : | sts = clGetKernelWorkGroupInfo ( | ||
414 : | kern->kern, dev, CL_KERNEL_WORK_GROUP_SIZE, | ||
415 : | sizeof(size_t), &(kern->workGrpSize), 0); | ||
416 : | if (sts != CL_SUCCESS) { | ||
417 : | fprintf (stderr, "error getting workgroup size for %s\n", name); | ||
418 : | return false; | ||
419 : | } | ||
420 : | sts = clGetKernelWorkGroupInfo ( | ||
421 : | kern->kern, dev, CL_KERNEL_LOCAL_MEM_SIZE, | ||
422 : | sizeof(cl_ulong), &(kern->localSzb), 0); | ||
423 : | if (sts != CL_SUCCESS) { | ||
424 : | fprintf (stderr, "error getting local memory size for %s\n", name); | ||
425 : | return false; | ||
426 : | } | ||
427 : | |||
428 : | if (VerboseFlg) { | ||
429 : | fprintf(stderr, "kernel %s: workgroup size = %d, local memory = %d bytes\n", | ||
430 : | name, (int)kern->workGrpSize, (int)kern->localSzb); | ||
431 : | } | ||
432 : | |||
433 : | return true; | ||
434 : | } | ||
435 : | |||
436 : | static void LogMessagesToStderr (const char *errstr, const void *private_info, size_t cb, void *user_data) | ||
437 : | { | ||
438 : | fprintf(stderr, "***** error log *****\n"); | ||
439 : | fprintf(stderr, "%s\n", errstr); | ||
440 : | fprintf(stderr, "***** end error log *****\n"); | ||
441 : | } | ||
442 : | |||
443 : | /*! \brief initialize the OpenCL execution context, including loading and compiling the OpenCL | ||
444 : | * program. | ||
445 : | * \param clInfo points to the summary information about the available OpenCL devices. | ||
446 : | * \param wrld the Diderot execution information. | ||
447 : | * \return true if initialization is successful, otherwise false. | ||
448 : | */ | ||
449 : | static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld) | ||
450 : | { | ||
451 : | cl_int sts; | ||
452 : | int pltIx = 0; // main patform index | ||
453 : | |||
454 : | // find a GPU on platform[0] | ||
455 : | DeviceInfo_t *dev = 0; | ||
456 : | int i; | ||
457 : | for (i = 0; i < clInfo->platforms[pltIx].numDevices; i++) { | ||
458 : | if (isGPUDevice (&(clInfo->platforms[pltIx].devices[i])) | ||
459 : | && clInfo->platforms[pltIx].devices[i].isAvail) { | ||
460 : | dev = &(clInfo->platforms[pltIx].devices[i]); | ||
461 : | break; | ||
462 : | } | ||
463 : | } | ||
464 : | |||
465 : | if (dev == 0) { | ||
466 : | fprintf (stderr, "unable to find GPU device\n"); | ||
467 : | return false; | ||
468 : | } | ||
469 : | |||
470 : | if (VerboseFlg) { | ||
471 : | fprintf (stderr, "using platform %d, device %d: %s\n", | ||
472 : | pltIx, i, clInfo->platforms[0].devices[i].name); | ||
473 : | } | ||
474 : | |||
475 : | // create the context | ||
476 : | cl_context cxt = clCreateContext(0, 1, &(dev->id), LogMessagesToStderr, 0, &sts); | ||
477 : | if (sts != CL_SUCCESS) { | ||
478 : | fprintf (stderr, "error creating OpenCL context\n"); | ||
479 : | return false; | ||
480 : | } | ||
481 : | |||
482 : | // create the program from the source | ||
483 : | int fnameLen = strlen(wrld->name) + 4; // name + ".cl\0" | ||
484 : | char *fname = (char *)CheckedAlloc(fnameLen); | ||
485 : | snprintf(fname, fnameLen, "%s.cl", wrld->name); | ||
486 : | char *updateSource = LoadSource (fname); | ||
487 : | free (fname); | ||
488 : | |||
489 : | const char *src[1] = {updateSource}; | ||
490 : | cl_program prog = clCreateProgramWithSource(cxt, 1, src, NULL, &sts); | ||
491 : | free (updateSource); | ||
492 : | if (sts != CL_SUCCESS) { | ||
493 : | fprintf (stderr, "error creating program\n"); | ||
494 : | return false; | ||
495 : | } | ||
496 : | |||
497 : | // build the program | ||
498 : | char options[1024]; | ||
499 : | snprintf (options, sizeof(options), | ||
500 : | "-D DIDEROT_CL_VERSION=%d -D DIDEROT_CU_WIDTH=%d -I %s -w", | ||
501 : | 100*dev->majorVersion + dev->minorVersion, | ||
502 : | dev->cuWidth, | ||
503 : | DIDEROT_INCLUDE_PATH); | ||
504 : | if (VerboseFlg) { | ||
505 : | fprintf (stderr, "clBuildProgram options: %s\n", options); | ||
506 : | } | ||
507 : | sts = clBuildProgram (prog, 1, &(dev->id), options, 0, 0); | ||
508 : | if (sts != CL_SUCCESS) { | ||
509 : | size_t logSzb; | ||
510 : | clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, 0, 0, &logSzb); | ||
511 : | char *log = CheckedAlloc(logSzb+1); | ||
512 : | clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, logSzb, log, &logSzb); | ||
513 : | log[logSzb] = '\0'; | ||
514 : | fprintf (stderr, "error compiling program:\n%s\n", log); | ||
515 : | free (log); | ||
516 : | return false; | ||
517 : | } | ||
518 : | |||
519 : | // extract the kernels from the program | ||
520 : | if ((! CreateKernel (dev->id, prog, "Diderot_UpdateKernel", &(wrld->kernel[0]))) | ||
521 : | || (! CreateKernel (dev->id, prog, "Diderot_CompactionKernel", &(wrld->kernel[1]))) | ||
522 : | || (! CreateKernel (dev->id, prog, "Diderot_SchedUpdateKernel", &(wrld->kernel[2])))) | ||
523 : | return false; | ||
524 : | |||
525 : | // create the command queue | ||
526 : | cl_command_queue q = clCreateCommandQueue(cxt, dev->id, 0, &sts); | ||
527 : | if (sts != CL_SUCCESS) { | ||
528 : | fprintf (stderr, "error creating OpenCL command queue\n"); | ||
529 : | return false; | ||
530 : | } | ||
531 : | |||
532 : | // initialize world info | ||
533 : | wrld->device = dev; | ||
534 : | wrld->context = cxt; | ||
535 : | wrld->cmdQ = q; | ||
536 : | |||
537 : | return true; | ||
538 : | } | ||
539 : | |||
540 : | // this should be the part of the scheduler | ||
541 : | void *Diderot_AllocStrand (Strand_t *strand) | ||
542 : | { | ||
543 : | return CheckedAlloc(strand->stateSzb); | ||
544 : | } | ||
545 : | |||
546 : | // block allocation of an initial collection of strands | ||
547 : | Diderot_World_t *Diderot_AllocInitially ( | ||
548 : | const char *name, // the name of the program | ||
549 : | Strand_t *strand, // the type of strands being allocated | ||
550 : | bool isArray, // is the initialization an array or collection? | ||
551 : | uint32_t nDims, // depth of iteration nesting | ||
552 : | int32_t *base, // nDims array of base indices | ||
553 : | uint32_t *size) // nDims array of iteration sizes | ||
554 : | { | ||
555 : | Diderot_World_t *wrld = NEW(Diderot_World_t); | ||
556 : | if (wrld == 0) { | ||
557 : | fprintf (stderr, "unable to allocate world\n"); | ||
558 : | exit (1); | ||
559 : | } | ||
560 : | |||
561 : | wrld->name = name; /* NOTE: we are assuming that name is statically allocated! */ | ||
562 : | wrld->isArray = isArray; | ||
563 : | wrld->nDims = nDims; | ||
564 : | wrld->base = NEWVEC(int32_t, nDims); | ||
565 : | wrld->size = NEWVEC(uint32_t, nDims); | ||
566 : | size_t numStrands = 1; | ||
567 : | for (int i = 0; i < wrld->nDims; i++) { | ||
568 : | numStrands *= size[i]; | ||
569 : | wrld->base[i] = base[i]; | ||
570 : | wrld->size[i] = size[i]; | ||
571 : | } | ||
572 : | |||
573 : | if (VerboseFlg) { | ||
574 : | printf("AllocInitially: %d", size[0]); | ||
575 : | for (int i = 1; i < nDims; i++) printf(" x %d", size[i]); | ||
576 : | printf("\n"); | ||
577 : | } | ||
578 : | |||
579 : | // allocate the strand state pointers | ||
580 : | wrld->numStrands = numStrands; | ||
581 : | wrld->inState = CheckedAlloc (strand->stateSzb * numStrands); | ||
582 : | wrld->outState = CheckedAlloc (strand->shadowStrandSzb * numStrands); | ||
583 : | wrld->status = NEWVEC(StatusInt_t, numStrands); | ||
584 : | |||
585 : | // initialize strand state pointers etc. | ||
586 : | for (size_t i = 0; i < numStrands; i++) { | ||
587 : | wrld->status[i] = DIDEROT_ACTIVE; | ||
588 : | } | ||
589 : | |||
590 : | return wrld; | ||
591 : | |||
592 : | } | ||
593 : | |||
594 : | // get strand state pointers | ||
595 : | void *Diderot_InState (Diderot_World_t *wrld, uint32_t i) | ||
596 : | { | ||
597 : | assert (i < wrld->numStrands); | ||
598 : | return wrld->inState + (i * Diderot_Strands[0]->stateSzb); | ||
599 : | } | ||
600 : | |||
601 : | void *Diderot_OutState (Diderot_World_t *wrld, uint32_t i) | ||
602 : | { | ||
603 : | assert (i < wrld->numStrands); | ||
604 : | // FIXME: this function is called when outState points to the shadow state, | ||
605 : | // but it is confusing to me -- JHR | ||
606 : | return wrld->outState + (i * Diderot_Strands[0]->shadowStrandSzb); | ||
607 : | } | ||
608 : | |||
609 : | /***** Support for shadow image values *****/ | ||
610 : | |||
611 : | void ShadowImage1D (cl_context cxt, Shadow_image1D_t *dst, Diderot_image1D_t *src) | ||
612 : | { | ||
613 : | dst->size[0] = src->size[0]; | ||
614 : | dst->s = src->s; | ||
615 : | dst->t = src->t; | ||
616 : | } | ||
617 : | |||
618 : | void ShadowImage2D (cl_context cxt, Shadow_image2D_t *dst, Diderot_image2D_t *src) | ||
619 : | { | ||
620 : | dst->size[0] = src->size[0]; | ||
621 : | dst->size[1] = src->size[1]; | ||
622 : | ShadowMat2x2 (dst->w2i, src->w2i); | ||
623 : | ShadowVec2 (&dst->tVec, src->tVec); | ||
624 : | ShadowMat2x2 (dst->w2iT, src->w2iT); | ||
625 : | } | ||
626 : | |||
627 : | void ShadowImage3D (cl_context cxt, Shadow_image3D_t *dst, Diderot_image3D_t *src) | ||
628 : | { | ||
629 : | dst->size[0] = src->size[0]; | ||
630 : | dst->size[1] = src->size[1]; | ||
631 : | dst->size[2] = src->size[2]; | ||
632 : | ShadowMat3x3 (dst->w2i, src->w2i); | ||
633 : | ShadowVec3 (&dst->tVec, src->tVec); | ||
634 : | ShadowMat3x3 (dst->w2iT, src->w2iT); | ||
635 : | } |
root@smlnj-gforge.cs.uchicago.edu | ViewVC Help |
Powered by ViewVC 1.0.0 |