SCM Repository
Annotation of /branches/pure-cfg/src/lib/cl-target/main.c
Parent Directory
|
Revision Log
Revision 1513 - (view) (download) (as text)
1 : | jhr | 1267 | /*! \file main.c |
2 : | * | ||
3 : | * \author John Reppy | ||
4 : | */ | ||
5 : | |||
6 : | /* | ||
7 : | * COPYRIGHT (c) 2011 The Diderot Project (http://diderot-language.cs.uchicago.edu) | ||
8 : | * All rights reserved. | ||
9 : | */ | ||
10 : | |||
11 : | jhr | 1275 | #include <Diderot/diderot.h> |
12 : | jhr | 1411 | #include "clinfo.h" |
13 : | jhr | 1267 | #include <string.h> |
14 : | #include <stdio.h> | ||
15 : | #include <assert.h> | ||
16 : | lamonts | 1271 | #include <sys/sysctl.h> |
17 : | #include <sys/stat.h> | ||
18 : | jhr | 1267 | |
19 : | jhr | 1492 | // FIXME: the kernels should really be in the lib subdirectory |
20 : | // path to kernels | ||
21 : | #define KERNELS_FILE_PATH DIDEROT_INCLUDE_PATH "/cl_kernels/kernels.cl" | ||
22 : | lamonts | 1488 | |
23 : | lamonts | 1513 | |
24 : | lamonts | 1488 | typedef struct { |
25 : | jhr | 1492 | cl_int blkIdx; //!< the id of this block |
26 : | cl_int nActive; //!< number of active (status != DIE or STABLE) strands | ||
27 : | cl_int nDead; //!< number of strands in the DIE state | ||
28 : | cl_int nStabilizing; //!< number of new strands in the STABILIZE state | ||
29 : | cl_int nDying; //!< number of new strands in the DIE state | ||
30 : | lamonts | 1488 | } StrandBlock_t; |
31 : | |||
32 : | typedef struct { | ||
33 : | jhr | 1492 | cl_int numStrands; //!< number of strands |
34 : | cl_int sId; //!< the index into the todo list or queue | ||
35 : | lamonts | 1501 | cl_int nextStrand; //!< index of the next strand to retrieve from the pool |
36 : | cl_int clearQueueSz; //!< an indicator on whether the queue size should be cleared | ||
37 : | jhr | 1492 | cl_int queueSize; //!< number of blocks on the scheduler's queue |
38 : | cl_int todoSize; //!< number of blocks on the scheduler's todo list | ||
39 : | lamonts | 1500 | cl_int numAvailable; //!< number of active strands left to process |
40 : | lamonts | 1488 | } SchedState_t; |
41 : | |||
42 : | |||
43 : | jhr | 1267 | // NOTE: we probably should put this in a file that supports runtime printing |
44 : | static bool VerboseFlg = false; | ||
45 : | jhr | 1287 | static bool TimingFlg = false; |
46 : | lamonts | 1513 | static bool NrrdOutputFlg = false; |
47 : | lamonts | 1470 | #define WARP_SIZE 32 |
48 : | lamonts | 1459 | |
49 : | jhr | 1473 | typedef struct { |
50 : | cl_kernel kern; //!< OpenCL kernel that implements the program | ||
51 : | size_t workGrpSize; //!< size of workgroup for this kernel | ||
52 : | cl_ulong localSzb; //!< size of local memory used by kernel | ||
53 : | } GPUKernel_t; | ||
54 : | |||
55 : | jhr | 1267 | struct struct_world { |
56 : | jhr | 1471 | const char *name; //!< the program name |
57 : | bool isArray; //!< is the initialization an array or collection? | ||
58 : | uint32_t nDims; //!< depth of iteration nesting | ||
59 : | int32_t *base; //!< nDims array of base indices | ||
60 : | uint32_t *size; //!< nDims array of iteration sizes | ||
61 : | uint32_t numStrands; //!< number of strands in the world | ||
62 : | uint32_t strandSize; //!< the sizeof of the strand buffers | ||
63 : | lamonts | 1341 | void *inState; |
64 : | void *outState; | ||
65 : | jhr | 1471 | int *status; //!< array of strand status flags |
66 : | cl_device_id device; //!< OpenCL device | ||
67 : | cl_context context; //!< OpenCL execution context | ||
68 : | cl_command_queue cmdQ; //!< OpenCL command queue | ||
69 : | lamonts | 1500 | GPUKernel_t kernel[3]; //!< OpenCL kernel that implements the program |
70 : | jhr | 1267 | }; |
71 : | jhr | 1471 | |
72 : | lamonts | 1459 | typedef struct { |
73 : | lamonts | 1488 | cl_mem schedMem; |
74 : | cl_mem outMem; | ||
75 : | cl_mem inMem; | ||
76 : | cl_mem blocksMem; | ||
77 : | cl_mem strandBlocksIdxsMem; | ||
78 : | cl_mem queueMem; | ||
79 : | cl_mem todoMem; | ||
80 : | cl_mem statusMem; | ||
81 : | } Kernel_Args_t; | ||
82 : | jhr | 1267 | |
83 : | jhr | 1291 | static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld); |
84 : | jhr | 1473 | static void CheckErrorCode (cl_int sts, const char * msg); |
85 : | lamonts | 1488 | static void SetPhase1Args (cl_kernel kernel, int *argCount, Kernel_Args_t *args); |
86 : | static void SetPhase2Args (cl_kernel kernel, int *argCount, Kernel_Args_t *args, int blk_sz); | ||
87 : | lamonts | 1500 | static void SetScheduleKernelArgs(cl_kernel kernel, int *argCount, Kernel_Args_t *args); |
88 : | |||
89 : | jhr | 1473 | static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern); |
90 : | |||
91 : | jhr | 1358 | extern void Diderot_LoadGlobals (cl_context cxt, cl_kernel kernel, cl_command_queue cmdQ, int argStart); |
92 : | |||
93 : | jhr | 1267 | int main (int argc, const char **argv) |
94 : | { | ||
95 : | // get information about OpenCL support | ||
96 : | CLInfo_t *clInfo = GetCLInfo(); | ||
97 : | if (clInfo == 0) { | ||
98 : | jhr | 1327 | fprintf (stderr, "no OpenCL support\n"); |
99 : | exit (1); | ||
100 : | jhr | 1267 | } |
101 : | |||
102 : | Diderot_Options_t *opts = Diderot_OptNew (); | ||
103 : | |||
104 : | Diderot_OptAddFlag (opts, "verbose", "enable runtime-system messages", &VerboseFlg); | ||
105 : | jhr | 1411 | Diderot_OptAddFlag (opts, "timing", "enable execution timing", &TimingFlg); |
106 : | lamonts | 1513 | Diderot_OptAddFlag (opts, "nrrd", "enable nrrd output", &NrrdOutputFlg); |
107 : | jhr | 1267 | Diderot_RegisterGlobalOpts (opts); |
108 : | Diderot_OptProcess (opts, argc, argv); | ||
109 : | Diderot_OptFree (opts); | ||
110 : | |||
111 : | jhr | 1379 | if (VerboseFlg) |
112 : | jhr | 1492 | PrintCLInfo (stdout, clInfo); |
113 : | jhr | 1379 | |
114 : | jhr | 1267 | // run the generated global initialization code |
115 : | jhr | 1379 | if (VerboseFlg) |
116 : | jhr | 1411 | fprintf (stderr, "initializing globals ...\n"); |
117 : | Diderot_InitGlobals (); | ||
118 : | lamonts | 1316 | |
119 : | jhr | 1291 | Diderot_World_t *wrld = Diderot_Initially (); // this may not be right for OpenCL |
120 : | lamonts | 1459 | |
121 : | jhr | 1291 | if (! InitCL(clInfo, wrld)) |
122 : | exit (1); | ||
123 : | jhr | 1282 | |
124 : | lamonts | 1488 | // Conversion of strands from their host types to their shadow types |
125 : | jhr | 1472 | void *shadowInState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands); |
126 : | lamonts | 1488 | void *shadowOutState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands); |
127 : | lamonts | 1462 | uint8_t *strandPtr = (uint8_t *)wrld->inState; |
128 : | uint8_t *strandShadowPtr = (uint8_t *)shadowInState; | ||
129 : | lamonts | 1488 | uint8_t *strandShadowOutPtr = (uint8_t *)shadowInState; |
130 : | lamonts | 1462 | size_t shadowSize = Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands; |
131 : | |||
132 : | jhr | 1469 | for (int i = 0; i < wrld->numStrands; i++) { |
133 : | Diderot_Strands[0]->strandCopy (strandPtr, strandShadowPtr); | ||
134 : | lamonts | 1488 | Diderot_Strands[0]->strandCopy (strandPtr, strandShadowOutPtr); |
135 : | jhr | 1469 | strandPtr += Diderot_Strands[0]->stateSzb; |
136 : | strandShadowPtr += Diderot_Strands[0]->shadowStrandSzb; | ||
137 : | lamonts | 1488 | strandShadowOutPtr += Diderot_Strands[0]->shadowStrandSzb; |
138 : | lamonts | 1462 | } |
139 : | jhr | 1471 | free (wrld->inState); |
140 : | lamonts | 1488 | free(wrld->outState); |
141 : | lamonts | 1462 | wrld->inState = shadowInState; |
142 : | lamonts | 1488 | wrld->outState = shadowOutState; |
143 : | lamonts | 1462 | |
144 : | jhr | 1469 | // hack to make the invariant part of the state the same in both copies |
145 : | lamonts | 1462 | memcpy (wrld->outState, wrld->inState, shadowSize); |
146 : | |||
147 : | jhr | 1327 | int argCount = 0; |
148 : | cl_int sts = CL_SUCCESS; | ||
149 : | lamonts | 1316 | |
150 : | lamonts | 1488 | Kernel_Args_t kernelArgs; |
151 : | SchedState_t scheduler; | ||
152 : | lamonts | 1346 | |
153 : | lamonts | 1488 | scheduler.numStrands = wrld->numStrands; |
154 : | lamonts | 1500 | scheduler.nextStrand = 0; |
155 : | lamonts | 1488 | scheduler.todoSize = 0; |
156 : | scheduler.sId = 0; | ||
157 : | lamonts | 1501 | scheduler.clearQueueSz= 1; |
158 : | lamonts | 1500 | scheduler.numAvailable = wrld->numStrands; |
159 : | lamonts | 1488 | |
160 : | lamonts | 1465 | size_t globalWorkSize[1] = {0}; |
161 : | lamonts | 1488 | size_t localWorkSize[1] = {0}; |
162 : | jhr | 1492 | int dIdx = clInfo->mainDeviceIdx; |
163 : | lamonts | 1488 | int pIdx = clInfo->mainPlatformIdx; |
164 : | globalWorkSize[0] = clInfo->platforms[pIdx].devices[dIdx].numCUs * clInfo->platforms[pIdx].devices[dIdx].cuWidth; | ||
165 : | localWorkSize[0] = clInfo->platforms[pIdx].devices[dIdx].cuWidth; | ||
166 : | lamonts | 1502 | int numberOfBlocks = ceil((double)wrld->numStrands/localWorkSize[0]); |
167 : | lamonts | 1488 | |
168 : | size_t strandBlkMemSize = sizeof(int) * numberOfBlocks * clInfo->platforms[pIdx].devices[dIdx].cuWidth;; | ||
169 : | size_t schedListMemSize = sizeof(int) * numberOfBlocks; | ||
170 : | jhr | 1492 | int *schedulerQueue = (int *)CheckedAlloc(schedListMemSize); |
171 : | int *schedulerTodoList = (int *)CheckedAlloc(schedListMemSize); | ||
172 : | int *strandBlocksIdxs = (int *)CheckedAlloc(strandBlkMemSize); | ||
173 : | lamonts | 1488 | scheduler.queueSize = numberOfBlocks; |
174 : | lamonts | 1513 | |
175 : | lamonts | 1488 | size_t strandblkMemSize = sizeof(StrandBlock_t) * numberOfBlocks; |
176 : | StrandBlock_t * strandBlks = (StrandBlock_t *)CheckedAlloc(strandblkMemSize); | ||
177 : | lamonts | 1470 | |
178 : | jhr | 1492 | for (int i = 0; i < numberOfBlocks; i++) { |
179 : | lamonts | 1488 | schedulerQueue[i] = i; |
180 : | strandBlks[i].blkIdx = i; | ||
181 : | strandBlks[i].nActive = 0; | ||
182 : | strandBlks[i].nDead = 0; | ||
183 : | strandBlks[i].nStabilizing = 0; | ||
184 : | jhr | 1492 | strandBlks[i].nDying = 0; |
185 : | jhr | 1432 | } |
186 : | lamonts | 1459 | |
187 : | lamonts | 1488 | kernelArgs.schedMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, |
188 : | sizeof(SchedState_t), &scheduler, &sts); | ||
189 : | CheckErrorCode (sts, "error creating OpenCL scheduler buffer\n"); | ||
190 : | lamonts | 1459 | |
191 : | lamonts | 1488 | kernelArgs.blocksMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, |
192 : | strandblkMemSize, strandBlks , &sts); | ||
193 : | CheckErrorCode (sts, "error creating OpenCL strand blocks buffer\n"); | ||
194 : | lamonts | 1465 | |
195 : | lamonts | 1488 | kernelArgs.strandBlocksIdxsMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, |
196 : | strandBlkMemSize, strandBlocksIdxs , &sts); | ||
197 : | CheckErrorCode (sts, "error creating OpenCL strand blocks indices buffer\n"); | ||
198 : | lamonts | 1425 | |
199 : | lamonts | 1488 | kernelArgs.queueMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, |
200 : | schedListMemSize, schedulerQueue , &sts); | ||
201 : | CheckErrorCode (sts, "error creating OpenCL scheduler queue buffer\n"); | ||
202 : | |||
203 : | kernelArgs.todoMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
204 : | schedListMemSize, schedulerTodoList , &sts); | ||
205 : | CheckErrorCode (sts, "error creating OpenCL scheduler todo buffer\n"); | ||
206 : | |||
207 : | kernelArgs.inMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
208 : | shadowSize, wrld->inState, &sts); | ||
209 : | CheckErrorCode (sts, "error creating OpenCL strand in-state buffer\n"); | ||
210 : | |||
211 : | kernelArgs.outMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
212 : | shadowSize, wrld->outState, &sts); | ||
213 : | CheckErrorCode (sts, "error creating OpenCL strand out-state buffer\n"); | ||
214 : | |||
215 : | kernelArgs.statusMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | ||
216 : | sizeof(int) * wrld->numStrands, wrld->status, &sts); | ||
217 : | CheckErrorCode (sts, "error creating OpenCL world status buffer\n"); | ||
218 : | |||
219 : | lamonts | 1513 | //Setup the Update Kernel's arguments |
220 : | lamonts | 1488 | SetPhase1Args(wrld->kernel[0].kern, &argCount, &kernelArgs); |
221 : | Diderot_LoadGlobals(wrld->context, wrld->kernel[0].kern, wrld->cmdQ, argCount); | ||
222 : | lamonts | 1459 | |
223 : | lamonts | 1513 | //Setup the Compaction Kernel's arguments |
224 : | lamonts | 1488 | argCount=0; |
225 : | SetPhase2Args(wrld->kernel[1].kern, &argCount, &kernelArgs,clInfo->platforms[pIdx].devices[dIdx].cuWidth); | ||
226 : | |||
227 : | lamonts | 1513 | //Setup the Scheduler Kernel's arguments |
228 : | lamonts | 1500 | argCount = 0; |
229 : | SetScheduleKernelArgs(wrld->kernel[2].kern, &argCount, &kernelArgs); | ||
230 : | |||
231 : | lamonts | 1459 | clFinish(wrld->cmdQ); |
232 : | |||
233 : | double t0 = GetTime(); | ||
234 : | |||
235 : | jhr | 1469 | clFinish(wrld->cmdQ); |
236 : | lamonts | 1461 | |
237 : | lamonts | 1500 | while (scheduler.numAvailable > 0) { |
238 : | |||
239 : | jhr | 1492 | //Runs the update kernel on all strands |
240 : | lamonts | 1488 | sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[0].kern, 1, NULL, |
241 : | lamonts | 1513 | globalWorkSize, localWorkSize, 0, NULL, NULL); |
242 : | lamonts | 1500 | CheckErrorCode (sts, "error in executing update kernel\n"); |
243 : | lamonts | 1488 | |
244 : | lamonts | 1500 | sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern,0,NULL,NULL); |
245 : | CheckErrorCode (sts, "error in executing scheduler update kernel before compaction\n"); | ||
246 : | lamonts | 1513 | |
247 : | lamonts | 1488 | |
248 : | lamonts | 1500 | // Run the compaction kernel on all strands |
249 : | sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[1].kern, 1, NULL, | ||
250 : | jhr | 1492 | globalWorkSize, localWorkSize, 0, NULL, NULL); |
251 : | lamonts | 1513 | CheckErrorCode (sts, "error ccccn executing compaction kernel\n"); |
252 : | lamonts | 1501 | |
253 : | lamonts | 1513 | |
254 : | lamonts | 1500 | sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern,0,NULL,NULL); |
255 : | CheckErrorCode (sts, "error in executing scheduler update kernel after compaction\n"); | ||
256 : | lamonts | 1459 | |
257 : | lamonts | 1513 | |
258 : | sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.schedMem, CL_TRUE, 0, sizeof(SchedState_t), | ||
259 : | lamonts | 1500 | &scheduler, 0, NULL, NULL); |
260 : | lamonts | 1462 | |
261 : | lamonts | 1513 | CheckErrorCode (sts, "error reading back scheduler information\n"); |
262 : | |||
263 : | |||
264 : | lamonts | 1459 | } |
265 : | lamonts | 1470 | |
266 : | lamonts | 1488 | sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.outMem, CL_TRUE, 0,shadowSize, |
267 : | jhr | 1469 | wrld->outState, 0, NULL, NULL); |
268 : | jhr | 1471 | CheckErrorCode (sts, "error in reading back output\n"); |
269 : | lamonts | 1459 | |
270 : | jhr | 1469 | sts = clEnqueueReadBuffer( |
271 : | lamonts | 1488 | wrld->cmdQ, kernelArgs.statusMem, CL_TRUE, 0, sizeof(int) * wrld->numStrands, |
272 : | jhr | 1469 | wrld->status, 0, NULL, NULL); |
273 : | jhr | 1471 | CheckErrorCode (sts, "error in reading back status\n"); |
274 : | jhr | 1342 | |
275 : | lamonts | 1513 | |
276 : | jhr | 1469 | clFinish (wrld->cmdQ); |
277 : | jhr | 1473 | clReleaseKernel(wrld->kernel[0].kern); |
278 : | clReleaseKernel(wrld->kernel[1].kern); | ||
279 : | lamonts | 1316 | clReleaseCommandQueue(wrld->cmdQ); |
280 : | clReleaseContext(wrld->context); | ||
281 : | |||
282 : | jhr | 1287 | double totalTime = GetTime() - t0; |
283 : | |||
284 : | if (VerboseFlg) | ||
285 : | fprintf (stderr, "done in %f seconds\n", totalTime); | ||
286 : | else if (TimingFlg) | ||
287 : | printf ("usr=%f\n", totalTime); | ||
288 : | |||
289 : | nseltzer | 1453 | // here we have the final state of all of the strands in the "in" buffer |
290 : | lamonts | 1513 | // output the final strand states |
291 : | Output_Args_t outArgs; | ||
292 : | outArgs.name = wrld->name; | ||
293 : | outArgs.isArray = wrld->isArray; | ||
294 : | outArgs.numStrands = wrld->numStrands; | ||
295 : | outArgs.status = wrld->status; | ||
296 : | outArgs.inState = wrld->outState; | ||
297 : | outArgs.nDims = wrld->nDims; | ||
298 : | outArgs.size = wrld->size; | ||
299 : | outArgs.isStatusInt = true; | ||
300 : | outArgs.outputSzb = Diderot_Strands[0]->shadowStrandSzb; | ||
301 : | if (NrrdOutputFlg) | ||
302 : | Diderot_Output (&outArgs); | ||
303 : | else | ||
304 : | Diderot_Print (&outArgs); | ||
305 : | jhr | 1287 | |
306 : | nseltzer | 1450 | |
307 : | |||
308 : | lamonts | 1488 | Diderot_Shutdown (wrld); |
309 : | jhr | 1287 | |
310 : | return 0; | ||
311 : | |||
312 : | lamonts | 1271 | } |
313 : | jhr | 1469 | |
314 : | lamonts | 1488 | static void SetPhase1Args (cl_kernel kernel, int *argCount, Kernel_Args_t *args) |
315 : | lamonts | 1459 | { |
316 : | jhr | 1492 | int count = *argCount; |
317 : | lamonts | 1459 | cl_int sts = CL_SUCCESS; |
318 : | jhr | 1282 | |
319 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->inMem); |
320 : | lamonts | 1500 | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand in-state argument\n"); |
321 : | lamonts | 1488 | |
322 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->outMem); |
323 : | lamonts | 1500 | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand out-state argument\n"); |
324 : | lamonts | 1459 | |
325 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem); |
326 : | lamonts | 1500 | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL world status argument\n"); |
327 : | lamonts | 1459 | |
328 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem); |
329 : | lamonts | 1500 | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler argument\n"); |
330 : | lamonts | 1459 | |
331 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem); |
332 : | lamonts | 1500 | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks argument\n"); |
333 : | lamonts | 1459 | |
334 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem); |
335 : | lamonts | 1500 | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks' indices argument\n"); |
336 : | lamonts | 1459 | |
337 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem); |
338 : | lamonts | 1500 | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler queue argument\n"); |
339 : | lamonts | 1488 | |
340 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem); |
341 : | lamonts | 1500 | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler todo argument\n"); |
342 : | lamonts | 1488 | |
343 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL); |
344 : | lamonts | 1513 | CheckErrorCode (sts, "Update Kernel: error Setting OpenCL local strand blockargument\n"); |
345 : | lamonts | 1488 | |
346 : | jhr | 1492 | *argCount = count; |
347 : | |||
348 : | jhr | 1469 | } |
349 : | |||
350 : | lamonts | 1488 | static void SetPhase2Args (cl_kernel kernel, int *argCount, Kernel_Args_t *args, int blk_sz) |
351 : | lamonts | 1459 | { |
352 : | jhr | 1492 | int count = *argCount; |
353 : | cl_int sts = CL_SUCCESS; | ||
354 : | lamonts | 1459 | |
355 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem); |
356 : | lamonts | 1500 | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL world status argument\n"); |
357 : | lamonts | 1459 | |
358 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem); |
359 : | lamonts | 1500 | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler argument\n"); |
360 : | lamonts | 1488 | |
361 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->blocksMem); |
362 : | lamonts | 1500 | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks argument\n"); |
363 : | lamonts | 1488 | |
364 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->strandBlocksIdxsMem); |
365 : | lamonts | 1500 | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL strand blocks' indices argument\n"); |
366 : | lamonts | 1488 | |
367 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->queueMem); |
368 : | lamonts | 1500 | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler queue argument\n"); |
369 : | lamonts | 1488 | |
370 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->todoMem); |
371 : | lamonts | 1500 | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL scheduler todo argument\n"); |
372 : | lamonts | 1488 | |
373 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(StrandBlock_t), NULL); |
374 : | lamonts | 1500 | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local strand blockargument\n"); |
375 : | lamonts | 1488 | |
376 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL); |
377 : | lamonts | 1500 | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preStable argument\n"); |
378 : | lamonts | 1488 | |
379 : | jhr | 1492 | sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz, NULL); |
380 : | lamonts | 1500 | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local preDead argument\n"); |
381 : | lamonts | 1488 | |
382 : | lamonts | 1500 | sts = clSetKernelArg (kernel, count++, sizeof(int) * blk_sz * 2, NULL); |
383 : | CheckErrorCode (sts, "Compaction Kernel: error Setting OpenCL local temporary array for the prefix scan of preStable preDead argument\n"); | ||
384 : | lamonts | 1488 | |
385 : | lamonts | 1513 | |
386 : | jhr | 1492 | *argCount = count; |
387 : | jhr | 1469 | } |
388 : | lamonts | 1500 | static void SetScheduleKernelArgs(cl_kernel kernel, int *argCount, Kernel_Args_t *args) |
389 : | { | ||
390 : | int count = *argCount; | ||
391 : | cl_int sts = CL_SUCCESS; | ||
392 : | |||
393 : | sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->schedMem); | ||
394 : | CheckErrorCode (sts, "Scheduler Kernel: error setting OpenCL scheduler argument\n"); | ||
395 : | jhr | 1469 | |
396 : | lamonts | 1500 | *argCount = count; |
397 : | } | ||
398 : | |||
399 : | jhr | 1471 | static void CheckErrorCode (cl_int sts, const char *msg) |
400 : | lamonts | 1459 | { |
401 : | if (sts != CL_SUCCESS) { | ||
402 : | jhr | 1464 | fprintf (stderr, "%s", msg); |
403 : | lamonts | 1459 | exit(1); |
404 : | } | ||
405 : | jhr | 1469 | } |
406 : | |||
407 : | jhr | 1287 | /*! \brief load OpenCL code from a file |
408 : | */ | ||
409 : | jhr | 1291 | static char *LoadSource (const char *filename) |
410 : | lamonts | 1271 | { |
411 : | struct stat statbuf; | ||
412 : | jhr | 1287 | if (stat(filename, &statbuf) < 0) { |
413 : | jhr | 1327 | fprintf (stderr, "unable to stat OpenCL source file %s\n", filename); |
414 : | exit (1); | ||
415 : | jhr | 1287 | } |
416 : | |||
417 : | jhr | 1472 | char *source = (char *)CheckedAlloc(statbuf.st_size + 1); |
418 : | jhr | 1287 | if (source == 0) { |
419 : | jhr | 1327 | fprintf (stderr, "unable to allocate memory for OpenCL source\n"); |
420 : | exit (1); | ||
421 : | jhr | 1287 | } |
422 : | |||
423 : | FILE *fh = fopen(filename, "r"); | ||
424 : | if ((fh == 0) | ||
425 : | || (fread(source, statbuf.st_size, 1, fh) != 1)) { | ||
426 : | jhr | 1327 | fprintf (stderr, "unable to read OpenCL source from %s\n", filename); |
427 : | exit (1); | ||
428 : | jhr | 1287 | } |
429 : | source[statbuf.st_size] = '\0'; | ||
430 : | fclose (fh); | ||
431 : | lamonts | 1271 | |
432 : | return source; | ||
433 : | jhr | 1342 | } |
434 : | |||
435 : | jhr | 1473 | //! create a kernel object from a program |
436 : | static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern) | ||
437 : | { | ||
438 : | cl_int sts; | ||
439 : | |||
440 : | kern->kern = clCreateKernel(prog, name, &sts); | ||
441 : | if (sts != CL_SUCCESS) { | ||
442 : | fprintf (stderr, "error getting %s from program\n", name); | ||
443 : | return false; | ||
444 : | } | ||
445 : | sts = clGetKernelWorkGroupInfo ( | ||
446 : | kern->kern, dev, CL_KERNEL_WORK_GROUP_SIZE, | ||
447 : | sizeof(size_t), &(kern->workGrpSize), 0); | ||
448 : | if (sts != CL_SUCCESS) { | ||
449 : | fprintf (stderr, "error getting workgroup size for %s\n", name); | ||
450 : | return false; | ||
451 : | } | ||
452 : | sts = clGetKernelWorkGroupInfo ( | ||
453 : | kern->kern, dev, CL_KERNEL_LOCAL_MEM_SIZE, | ||
454 : | sizeof(cl_ulong), &(kern->localSzb), 0); | ||
455 : | if (sts != CL_SUCCESS) { | ||
456 : | fprintf (stderr, "error getting local memory size for %s\n", name); | ||
457 : | return false; | ||
458 : | } | ||
459 : | |||
460 : | if (VerboseFlg) { | ||
461 : | fprintf(stderr, "kernel %s: workgroup size = %d, local memory = %d bytes\n", | ||
462 : | name, (int)kern->workGrpSize, (int)kern->localSzb); | ||
463 : | } | ||
464 : | |||
465 : | return true; | ||
466 : | } | ||
467 : | |||
468 : | jhr | 1352 | static void LogMessagesToStderr (const char *errstr, const void *private_info, size_t cb, void *user_data) |
469 : | lamonts | 1328 | { |
470 : | jhr | 1480 | fprintf(stderr, "***** error log *****\n"); |
471 : | jhr | 1342 | fprintf(stderr, "%s\n", errstr); |
472 : | jhr | 1480 | fprintf(stderr, "***** end error log *****\n"); |
473 : | jhr | 1342 | } |
474 : | jhr | 1267 | |
475 : | jhr | 1417 | /*! \brief initialize the OpenCL execution context, including loading and compiling the OpenCL |
476 : | * program. | ||
477 : | * \param clInfo points to the summary information about the available OpenCL devices. | ||
478 : | * \param wrld the Diderot execution information. | ||
479 : | * \return true if initialization is successful, otherwise false. | ||
480 : | jhr | 1291 | */ |
481 : | static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld) | ||
482 : | { | ||
483 : | jhr | 1327 | cl_int sts; |
484 : | jhr | 1291 | |
485 : | // find a GPU on platform[0] | ||
486 : | jhr | 1420 | DeviceInfo_t *dev = 0; |
487 : | lamonts | 1488 | clInfo->mainPlatformIdx = 0; |
488 : | jhr | 1291 | int i; |
489 : | for (i = 0; i < clInfo->platforms[0].numDevices; i++) { | ||
490 : | jhr | 1417 | if (isGPUDevice (&(clInfo->platforms[0].devices[i])) |
491 : | jhr | 1472 | && clInfo->platforms[0].devices[i].isAvail) { |
492 : | jhr | 1420 | dev = &(clInfo->platforms[0].devices[i]); |
493 : | lamonts | 1488 | clInfo->mainDeviceIdx = i; |
494 : | jhr | 1291 | break; |
495 : | } | ||
496 : | } | ||
497 : | lamonts | 1488 | |
498 : | jhr | 1420 | if (dev == 0) { |
499 : | jhr | 1291 | fprintf (stderr, "unable to find GPU device\n"); |
500 : | return false; | ||
501 : | } | ||
502 : | |||
503 : | jhr | 1469 | if (VerboseFlg) { |
504 : | fprintf (stderr, "using platform 0, device %d: %s\n", | ||
505 : | i, clInfo->platforms[0].devices[i].name); | ||
506 : | } | ||
507 : | |||
508 : | jhr | 1291 | // create the context |
509 : | jhr | 1420 | cl_context cxt = clCreateContext(0, 1, &(dev->id), LogMessagesToStderr, 0, &sts); |
510 : | jhr | 1291 | if (sts != CL_SUCCESS) { |
511 : | jhr | 1327 | fprintf (stderr, "error creating OpenCL context\n"); |
512 : | jhr | 1291 | return false; |
513 : | } | ||
514 : | |||
515 : | // create the program from the source | ||
516 : | int fnameLen = strlen(wrld->name) + 4; // name + ".cl\0" | ||
517 : | jhr | 1472 | char *fname = (char *)CheckedAlloc(fnameLen); |
518 : | jhr | 1291 | snprintf(fname, fnameLen, "%s.cl", wrld->name); |
519 : | lamonts | 1459 | char *updateSource = LoadSource (fname); |
520 : | jhr | 1464 | free (fname); |
521 : | lamonts | 1459 | |
522 : | jhr | 1492 | // load scheduler kernels |
523 : | char *kernelsSource = LoadSource(KERNELS_FILE_PATH); | ||
524 : | |||
525 : | lamonts | 1513 | const char *src[2] = {kernelsSource,updateSource}; |
526 : | jhr | 1464 | cl_program prog = clCreateProgramWithSource(cxt, 2, src, NULL, &sts); |
527 : | jhr | 1492 | free (updateSource); |
528 : | free (kernelsSource); | ||
529 : | jhr | 1291 | if (sts != CL_SUCCESS) { |
530 : | jhr | 1327 | fprintf (stderr, "error creating program\n"); |
531 : | jhr | 1291 | return false; |
532 : | } | ||
533 : | |||
534 : | // build the program | ||
535 : | jhr | 1420 | char options[1024]; |
536 : | snprintf (options, sizeof(options), | ||
537 : | jhr | 1472 | "-D DIDEROT_CL_VERSION=%d -D DIDEROT_CU_WIDTH=%d -I %s -w", |
538 : | jhr | 1420 | 100*dev->majorVersion + dev->minorVersion, |
539 : | jhr | 1472 | dev->cuWidth, |
540 : | jhr | 1420 | DIDEROT_INCLUDE_PATH); |
541 : | jhr | 1472 | if (VerboseFlg) { |
542 : | fprintf (stderr, "clBuildProgram options: %s\n", options); | ||
543 : | } | ||
544 : | jhr | 1420 | sts = clBuildProgram (prog, 1, &(dev->id), options, 0, 0); |
545 : | jhr | 1291 | if (sts != CL_SUCCESS) { |
546 : | jhr | 1352 | size_t logSzb; |
547 : | jhr | 1420 | clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, 0, 0, &logSzb); |
548 : | jhr | 1472 | char *log = CheckedAlloc(logSzb+1); |
549 : | jhr | 1420 | clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, logSzb, log, &logSzb); |
550 : | jhr | 1352 | log[logSzb] = '\0'; |
551 : | fprintf (stderr, "error compiling program:\n%s\n", log); | ||
552 : | free (log); | ||
553 : | jhr | 1291 | return false; |
554 : | } | ||
555 : | |||
556 : | jhr | 1473 | // extract the kernels from the program |
557 : | lamonts | 1488 | if ((! CreateKernel (dev->id, prog, "Diderot_UpdateKernel", &(wrld->kernel[0]))) |
558 : | lamonts | 1500 | || (! CreateKernel (dev->id, prog, "Diderot_CompactionKernel", &(wrld->kernel[1]))) |
559 : | || (! CreateKernel (dev->id, prog, "Diderot_SchedUpdateKernel", &(wrld->kernel[2])))) | ||
560 : | jhr | 1291 | return false; |
561 : | |||
562 : | jhr | 1417 | // create the command queue |
563 : | jhr | 1420 | cl_command_queue q = clCreateCommandQueue(cxt, dev->id, 0, &sts); |
564 : | jhr | 1417 | if (sts != CL_SUCCESS) { |
565 : | fprintf (stderr, "error creating OpenCL command queue\n"); | ||
566 : | return false; | ||
567 : | } | ||
568 : | |||
569 : | jhr | 1291 | // initialize world info |
570 : | jhr | 1420 | wrld->device = dev->id; |
571 : | jhr | 1291 | wrld->context = cxt; |
572 : | wrld->cmdQ = q; | ||
573 : | |||
574 : | return true; | ||
575 : | } | ||
576 : | jhr | 1327 | |
577 : | jhr | 1267 | // this should be the part of the scheduler |
578 : | void *Diderot_AllocStrand (Strand_t *strand) | ||
579 : | { | ||
580 : | jhr | 1472 | return CheckedAlloc(strand->stateSzb); |
581 : | jhr | 1267 | } |
582 : | |||
583 : | // block allocation of an initial collection of strands | ||
584 : | lamonts | 1346 | Diderot_World_t *Diderot_AllocInitially( |
585 : | jhr | 1287 | const char *name, // the name of the program |
586 : | jhr | 1267 | Strand_t *strand, // the type of strands being allocated |
587 : | bool isArray, // is the initialization an array or collection? | ||
588 : | uint32_t nDims, // depth of iteration nesting | ||
589 : | int32_t *base, // nDims array of base indices | ||
590 : | uint32_t *size) // nDims array of iteration sizes | ||
591 : | { | ||
592 : | jhr | 1472 | Diderot_World_t *wrld = NEW(Diderot_World_t); |
593 : | jhr | 1267 | if (wrld == 0) { |
594 : | fprintf (stderr, "unable to allocate world\n"); | ||
595 : | exit (1); | ||
596 : | } | ||
597 : | |||
598 : | jhr | 1287 | wrld->name = name; /* NOTE: we are assuming that name is statically allocated! */ |
599 : | jhr | 1267 | wrld->isArray = isArray; |
600 : | wrld->nDims = nDims; | ||
601 : | jhr | 1472 | wrld->base = NEWVEC(int32_t, nDims); |
602 : | wrld->size = NEWVEC(uint32_t, nDims); | ||
603 : | jhr | 1267 | size_t numStrands = 1; |
604 : | for (int i = 0; i < wrld->nDims; i++) { | ||
605 : | numStrands *= size[i]; | ||
606 : | wrld->base[i] = base[i]; | ||
607 : | wrld->size[i] = size[i]; | ||
608 : | } | ||
609 : | |||
610 : | jhr | 1378 | if (VerboseFlg) { |
611 : | printf("AllocInitially: %d", size[0]); | ||
612 : | for (int i = 1; i < nDims; i++) printf(" x %d", size[i]); | ||
613 : | printf("\n"); | ||
614 : | } | ||
615 : | jhr | 1267 | |
616 : | // allocate the strand state pointers | ||
617 : | wrld->numStrands = numStrands; | ||
618 : | lamonts | 1346 | wrld->strandSize = strand->stateSzb * numStrands; |
619 : | jhr | 1472 | /* |
620 : | wrld->inState = NEWVEC(void *, numStrands); | ||
621 : | wrld->outState = NEWVEC(void *, numStrands); | ||
622 : | */ | ||
623 : | wrld->inState = CheckedAlloc (wrld->strandSize); | ||
624 : | wrld->outState = CheckedAlloc (strand->shadowStrandSzb * numStrands); | ||
625 : | wrld->status = NEWVEC(int, numStrands); | ||
626 : | jhr | 1267 | if ((wrld->inState == 0) || (wrld->outState == 0) || (wrld->status == 0)) { |
627 : | fprintf (stderr, "unable to allocate strand states\n"); | ||
628 : | exit (1); | ||
629 : | } | ||
630 : | |||
631 : | // initialize strand state pointers etc. | ||
632 : | for (size_t i = 0; i < numStrands; i++) { | ||
633 : | lamonts | 1341 | // wrld->inState[i] = Diderot_AllocStrand (strand); |
634 : | // wrld->outState[i] = Diderot_AllocStrand (strand); | ||
635 : | lamonts | 1351 | wrld->status[i] = DIDEROT_ACTIVE; |
636 : | jhr | 1267 | } |
637 : | |||
638 : | return wrld; | ||
639 : | |||
640 : | } | ||
641 : | |||
642 : | // get strand state pointers | ||
643 : | void *Diderot_InState (Diderot_World_t *wrld, uint32_t i) | ||
644 : | { | ||
645 : | assert (i < wrld->numStrands); | ||
646 : | lamonts | 1461 | return wrld->inState + i; |
647 : | jhr | 1267 | } |
648 : | |||
649 : | void *Diderot_OutState (Diderot_World_t *wrld, uint32_t i) | ||
650 : | { | ||
651 : | assert (i < wrld->numStrands); | ||
652 : | lamonts | 1341 | return &wrld->outState[i]; |
653 : | jhr | 1267 | } |
654 : | |||
655 : | bool Diderot_IsActive (Diderot_World_t *wrld, uint32_t i) | ||
656 : | { | ||
657 : | assert (i < wrld->numStrands); | ||
658 : | return !wrld->status[i]; | ||
659 : | jhr | 1287 | } |
660 : | jhr | 1435 | |
661 : | jhr | 1354 | /***** Support for shadow image values *****/ |
662 : | jhr | 1435 | |
663 : | jhr | 1354 | void ShadowImage1D (cl_context cxt, Shadow_image1D_t *dst, Diderot_image1D_t *src) |
664 : | { | ||
665 : | dst->size[0] = src->size[0]; | ||
666 : | dst->s = src->s; | ||
667 : | dst->t = dst->t; | ||
668 : | } | ||
669 : | jhr | 1435 | |
670 : | jhr | 1354 | void ShadowImage2D (cl_context cxt, Shadow_image2D_t *dst, Diderot_image2D_t *src) |
671 : | { | ||
672 : | lamonts | 1461 | dst->size[0] = src->size[0]; |
673 : | jhr | 1354 | dst->size[1] = src->size[1]; |
674 : | jhr | 1355 | ShadowMat2x2 (dst->w2i, src->w2i); |
675 : | lamonts | 1461 | ShadowVec2 (&dst->tVec, src->tVec); |
676 : | jhr | 1355 | ShadowMat2x2 (dst->w2iT, src->w2iT); |
677 : | jhr | 1354 | } |
678 : | |||
679 : | void ShadowImage3D (cl_context cxt, Shadow_image3D_t *dst, Diderot_image3D_t *src) | ||
680 : | { | ||
681 : | dst->size[0] = src->size[0]; | ||
682 : | dst->size[1] = src->size[1]; | ||
683 : | dst->size[2] = src->size[2]; | ||
684 : | jhr | 1355 | ShadowMat3x3 (dst->w2i, src->w2i); |
685 : | lamonts | 1461 | ShadowVec3 (&dst->tVec, src->tVec); |
686 : | jhr | 1435 | ShadowMat3x3 (dst->w2iT, src->w2iT); |
687 : | jhr | 1354 | } |
root@smlnj-gforge.cs.uchicago.edu | ViewVC Help |
Powered by ViewVC 1.0.0 |