Home My Page Projects Code Snippets Project Openings diderot
Summary Activity Tracker Tasks SCM

SCM Repository

[diderot] Annotation of /branches/pure-cfg/src/include/cl_kernels/kernels.cl
ViewVC logotype

Annotation of /branches/pure-cfg/src/include/cl_kernels/kernels.cl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1641 - (view) (download)

1 : lamonts 1513 /*! \file cl-scheduler.h
2 :     *
3 :     * \author Lamont Samuels
4 :     */
5 :    
6 :     /*
7 :     * COPYRIGHT (c) 2011 The Diderot Project (http://diderot-language.cs.uchicago.edu)
8 :     * All rights reserved.
9 :     */
10 : lamonts 1641 #define DIDEROT_CL_VERSION 110
11 :     #if (DIDEROT_CL_VERSION == 100)
12 :     # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
13 :     # pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
14 :     #endif
15 : lamonts 1513 #define BLK_SZ DIDEROT_CU_WIDTH
16 :    
17 :     #define DIDEROT_SINGLE_PRECISION
18 :     #define DIDEROT_TARGET_CL
19 :     #include "Diderot/cl-diderot.h"
20 :    
21 :     typedef struct {
22 : jhr 1618 int blkIdx; // the id of this block
23 :     int nActive; // number of active (status != DIE or STABLE) strands
24 :     int nDead; // number of strands in the DIE state
25 :     int nStabilizing; // number of new strands in the STABILIZE state
26 :     int nDying; // number of new strands in the DIE state
27 : lamonts 1513 } StrandBlock_t;
28 :    
29 :     typedef struct {
30 : jhr 1618 int numStrands; // number of strands
31 :     int sId; // the index accumlator for the todo list or queue
32 :     int nextStrand; // index of the next strand to retrieve from the pool
33 :     int clearQueueSz; // an indicator on whether the queue size should be cleared
34 :     int queueSize; // number of blocks on the scheduler's queue
35 :     int todoSize; // number of blocks on the scheduler's todo list
36 :     int numAvailable; // number of active strands left to process
37 : lamonts 1513 } SchedState_t;
38 :    
39 : lamonts 1621 //! \brief Retrieve a block's worth (BLK_SZ) of strands for a StrandBlock_t.
40 : lamonts 1513 //! \param sched global scheduler state
41 :     //! \param blocks global array of strand block indicies
42 :     //! \param lBlk pointer to global storage for the strand block
43 :     //! \return true if there were strands assigned to the strand block, otherwise false
44 :     inline bool GrabbedWork (
45 :     __global SchedState_t *sched,
46 :     __global int *blockIndxs,
47 :     __local StrandBlock_t *lBlk)
48 :     {
49 : jhr 1618 // Get a block of strands
50 :     int startStrandIdx = atom_add(&sched->nextStrand, BLK_SZ);
51 :     int idx = 0, count = 0;
52 : lamonts 1513
53 : jhr 1618 // For each work-item in the workgroup
54 :     for (int i = startStrandIdx, count = 0; i < sched->numStrands && count < BLK_SZ; i++,count++) {
55 :     // Increment the number of active strands and assign each work-item a strand to process
56 :     lBlk->nActive++;
57 :     blockIndxs[lBlk->blkIdx * BLK_SZ + idx] = i;
58 :     idx++;
59 :     }
60 :     return (idx != 0);
61 : lamonts 1513 }
62 :    
63 : lamonts 1621 //! \brief Get a block of strands from the scheduler's queue pool.
64 : lamonts 1513 //! \param sched global scheduler state
65 : lamonts 1621 //! \param blockIndxs the strand indicies for a strand block
66 :     //! \param queue global scheduler's queue
67 : lamonts 1513 //! \param blocks global array of strand blocks
68 : lamonts 1621 //! \param lBlk pointer to local storage for the strand block
69 : lamonts 1513 inline void GetBlock (
70 :     __global SchedState_t *sched,
71 :     __global int *blockIndxs,
72 :     __global int *queue,
73 :     __global StrandBlock_t *blocks,
74 :     __local StrandBlock_t *lBlk)
75 :     {
76 :    
77 :     // Retrieve a block from the list
78 :     int blkIdx = atom_inc(&(sched->sId));
79 : lamonts 1621
80 :     // Check to see if the index retrieved is less than the
81 : lamonts 1513 // number of blocks on the queue
82 :     if (blkIdx < sched->queueSize) {
83 :     // Copy over the global data for the block into local memory
84 :     lBlk->blkIdx = (blocks[queue[blkIdx]]).blkIdx;
85 :     lBlk->nActive = (blocks[queue[blkIdx]]).nActive;
86 :     lBlk->nDead = (blocks[queue[blkIdx]]).nDead;
87 :     lBlk->nStabilizing = (blocks[queue[blkIdx]]).nStabilizing;
88 :     lBlk->nDying = (blocks[queue[blkIdx]]).nDying;
89 :     mem_fence(CLK_LOCAL_MEM_FENCE);
90 : lamonts 1621
91 :     // If the block has no active strands then
92 : lamonts 1513 if (lBlk->nActive == 0) {
93 :    
94 : lamonts 1621 // Check to see if there is any strands left to processs.
95 : lamonts 1513 // If there are no strands left to process than make this block inactive
96 :     if(!GrabbedWork(sched,blockIndxs,lBlk))
97 :     lBlk->nActive = -1;
98 :     else
99 :     blocks[lBlk->blkIdx].nActive = lBlk->nActive;
100 :     }
101 :     }
102 :     else // if there isn't any queue items then notify the the workgroup
103 :     lBlk->nActive = -1;
104 :     }
105 : jhr 1618
106 : lamonts 1621 //! \brief Copies the strand block back from __local space to __global space
107 :     //! \param gBlock global strand block
108 :     //! \param lBlock local strand block
109 : jhr 1618 inline void StrandBlock_Copy (__global StrandBlock_t * gBlock, __local StrandBlock_t * lBlock)
110 : lamonts 1513 {
111 : jhr 1618 gBlock->nActive = lBlock->nActive;
112 :     gBlock->nDead = lBlock->nDead;
113 :     gBlock->nStabilizing = lBlock->nStabilizing;
114 :     gBlock->nDying = lBlock->nDying;
115 : lamonts 1513 }
116 : jhr 1618
117 : lamonts 1621 //! \brief Get a block of strands from the scheduler's todo list.
118 : lamonts 1513 //! \param sched global scheduler state
119 : lamonts 1621 //! \param queue global scheduler's todo queue
120 : lamonts 1513 //! \param blocks global array of strand blocks
121 : lamonts 1621 //! \param lBlk pointer to local storage for the strand block
122 : lamonts 1513 inline void GetBlockFromTodoList (
123 : jhr 1618 __global SchedState_t *sched,
124 :     __global int *todoList,
125 :     __global StrandBlock_t *blocks,
126 :     __local StrandBlock_t *lBlk)
127 : lamonts 1513 {
128 :     // Retrieve a block from the todolist
129 :     int blkIdx = atom_inc(&(sched->sId));
130 :    
131 : lamonts 1621 // Check to see if the index retrieved is less than the
132 :     // number of blocks on the todo queue
133 : lamonts 1513 if (blkIdx < sched->todoSize) {
134 : lamonts 1621 // Copy over the global data for the block into local memory
135 : lamonts 1513 lBlk-> blkIdx = (blocks[todoList[blkIdx]]).blkIdx;
136 :     lBlk->nActive = (blocks[todoList[blkIdx]]).nActive;
137 :     lBlk->nDead = (blocks[todoList[blkIdx]]).nDead;
138 :     lBlk->nStabilizing = (blocks[todoList[blkIdx]]).nStabilizing;
139 :     lBlk->nDying = (blocks[todoList[blkIdx]]).nDying;
140 :     mem_fence(CLK_LOCAL_MEM_FENCE);
141 : jhr 1618 }
142 : lamonts 1621 else // if there isn't any queue items then notify the the workgroup
143 : lamonts 1513 lBlk->nActive = -1;
144 :    
145 :     }
146 :    
147 : lamonts 1621 // A native parallel-prefix-scan algorithm
148 :     //! \param input the block indicies for a strand block
149 :     //! \param status global scheduler's status array
150 :     //! \param output the pre-stable or pre-die arrays
151 :     //! \param statusTy type of status for the output data
152 :     //! \param n the number of strand indicies
153 :     //! \param temp pointer to local storage for the scan algorithm
154 : lamonts 1513 inline void scan (
155 :     __global int *input,
156 :     __global int *status,
157 :     __local int *output,
158 :     StrandStatus_t statusTy,
159 :     int n,
160 :     __local int *temp)
161 :     {
162 :     int thid = get_local_id(0);
163 :     int pout = 0, pin = 1;
164 :    
165 :     // load input into local memory
166 : lamonts 1621 // exclusive scan: shift right by one and set first element to 0
167 : lamonts 1513 if (thid == 0 || status[(*(input + thid - 1))] != (int)statusTy)
168 :     temp[thid] = 0;
169 :     else
170 :     temp[thid] = 1;
171 :    
172 :     barrier(CLK_LOCAL_MEM_FENCE);
173 :    
174 :     for (int offset = 1; offset < n; offset *= 2) {
175 :     pout = 1 - pout;
176 :     pin = 1 - pout;
177 :    
178 :     if(thid >= offset)
179 :     temp[pout * n + thid] = temp[pin * n + thid] + temp[pin * n + thid - offset];
180 :     else
181 :     temp[pout * n + thid] = temp[pin * n + thid];
182 :    
183 :     barrier(CLK_LOCAL_MEM_FENCE);
184 :     }
185 :     output[thid] = temp[pout* n + thid];
186 :     }
187 : jhr 1618
188 : lamonts 1621 // Clears the queue sizes and strand id acculator
189 :     //! \param sched global scheduler state
190 : jhr 1622 __kernel void Diderot_SchedUpdateKernel (__global SchedState_t *sched)
191 : lamonts 1500 {
192 : jhr 1622 sched->sId = 0;
193 :     if(sched->clearQueueSz == 1) {
194 : lamonts 1501 sched->clearQueueSz = 0;
195 :     sched->queueSize = 0;
196 : jhr 1622 } else {
197 : lamonts 1501 sched->clearQueueSz = 1;
198 : lamonts 1500 sched->todoSize = 0;
199 : jhr 1622 }
200 : lamonts 1500 }
201 : lamonts 1513
202 : lamonts 1621 // Compaction Kernel: compact strands and replicate stable state
203 :     //! \param status global strand status array
204 :     //! \param sched global scheduler state
205 :     //! \param blocks global array of strand blocks
206 :     //! \param blockIndxs the strand indicies for a strand block
207 :     //! \param queue global scheduler's queue
208 :     //! \param todo global scheduler's todo queue
209 :     //! \param preStable pointer to local storage for number of stable strands with lower index
210 :     //! \param preDead pointer to local storage for number of dead strands with lower index
211 :     //! \param prefixScanTemp pointer to local storage for prefix-scan algorithm
212 : lamonts 1488 __kernel __attribute__((reqd_work_group_size(BLK_SZ, 1, 1)))
213 :     void Diderot_CompactionKernel (
214 :     __global int *status, // strand status array
215 :     __global SchedState_t *sched, // scheduler state
216 :     __global StrandBlock_t *blocks, // array of scheduler blocks
217 :     __global int * blockIndxs,
218 : lamonts 1501 __global int * queue,
219 :     __global int * todoList,
220 : lamonts 1488 __local StrandBlock_t * bp,
221 :     __local int * preStable,
222 :     __local int * preDead,
223 : lamonts 1513 __local int * prefixScanTemp
224 : lamonts 1488 )
225 :     {
226 : lamonts 1513 int id = get_local_id(0);
227 :     int qIdx;
228 : jhr 1618
229 : lamonts 1488 do {
230 :     if (id == 0) {
231 : lamonts 1513 bool done;
232 :     do {
233 :     GetBlockFromTodoList(sched,todoList,blocks,bp);
234 :     done = true;
235 : lamonts 1488 if(bp->nActive > 0) {
236 : lamonts 1513 bp->nActive = bp->nActive - (bp->nStabilizing + bp->nDying);
237 :     bp->nDead += bp->nDying;
238 :    
239 : lamonts 1488 if (bp->nActive == 0) {
240 : lamonts 1513 //Check to see if this block can execute more strands for the
241 :     //next iteration
242 :     GrabbedWork(sched,blockIndxs,bp);
243 :     bp->nStabilizing = 0;
244 :     bp->nDying = 0;
245 :     StrandBlock_Copy(&blocks[bp->blkIdx],bp);
246 :     done = false;
247 :     }
248 :     }
249 :    
250 : lamonts 1488 }
251 : jhr 1618 while (!done);
252 : lamonts 1488 }
253 :     barrier (CLK_LOCAL_MEM_FENCE);
254 :    
255 : lamonts 1501 if (bp->nActive > 0) {
256 : jhr 1618 int idx = blockIndxs[bp->blkIdx * BLK_SZ + id];
257 : lamonts 1488
258 : lamonts 1513 scan(blockIndxs + bp->blkIdx * BLK_SZ,status,preStable,DIDEROT_STABLE,BLK_SZ,prefixScanTemp);
259 : lamonts 1488 barrier(CLK_LOCAL_MEM_FENCE);
260 : lamonts 1500
261 :     scan(blockIndxs + bp->blkIdx * BLK_SZ,status,preDead,DIDEROT_DIE,BLK_SZ,prefixScanTemp);
262 : lamonts 1488 barrier(CLK_LOCAL_MEM_FENCE);
263 : lamonts 1513
264 : jhr 1618 // compaction
265 :     // with these arrays, we can then compute the new index of each slot in
266 :     // parallel as follows
267 : lamonts 1488 switch (status[idx]) {
268 :     case DIDEROT_ACTIVE:
269 :     id = id - preStable[id] - preDead[id];
270 :     break;
271 :     case DIDEROT_STABLE:
272 :     id = bp->nActive + preStable[id];
273 :     break;
274 :     case DIDEROT_DIE:
275 : lamonts 1500 id = bp->nActive + bp->nStabilizing + preDead[id];
276 : lamonts 1488 break;
277 :     }
278 : lamonts 1513 blockIndxs[bp->blkIdx * BLK_SZ + id] = idx;
279 : lamonts 1501
280 : lamonts 1513 barrier (CLK_LOCAL_MEM_FENCE);
281 :     if (get_local_id(0) == 0) { // note that id ?= get_local_id(0)
282 : lamonts 1488 // put bp on scheduling queue
283 : lamonts 1513 qIdx = atom_inc(&(sched->queueSize));
284 :     bp-> nStabilizing = 0;
285 :     bp->nDying = 0;
286 : lamonts 1501 StrandBlock_Copy(&blocks[bp->blkIdx],bp);
287 :     queue[qIdx] = bp->blkIdx;
288 : lamonts 1513 }
289 :     }
290 : lamonts 1488 } while (bp->nActive > 0);
291 :    
292 :     }

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0