View of /branches/lamont/src/compiler/cl-target/fragments/sched.in



Revision 2298
Fri Mar 15 22:18:22 2013 UTC by jhr
File size: 9487 byte(s)
  Merging in changes from vis12 branch.
//! \brief Retrieve a block's worth (BLK_SZ) of strands for a StrandBlock_t. 
//! \param sched global scheduler state
//! \param blockIndxs global array of strand indices for each strand block
//! \param lBlk pointer to local storage for the strand block
//! \return true if there were strands assigned to the strand block, otherwise false 
inline bool GrabWork ( 
    __global SchedState_t       *sched,
    __global int                *blockIndxs, 
    __local StrandBlock_t       *lBlk) 
{ 
  // Get a block of strands 
    int startStrandIdx = atom_add(&sched->nextStrand, BLK_SZ); 
    int idx = 0;
          
  // For each work-item in the workgroup 
    for (int i = startStrandIdx, count = 0; i < sched->numStrands && count < BLK_SZ; i++,count++)  {
      // Increment the number of active strands and assign each work-item a strand to process 
          lBlk->nActive++; 
          blockIndxs[lBlk->blkIdx * BLK_SZ + idx] = i; 
          idx++; 
    }
    return (idx != 0); 
} 
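// Worked example (hypothetical values): assume BLK_SZ = 4, sched->nextStrand = 12,
// and sched->numStrands = 14.  The atom_add above returns 12 and advances
// nextStrand to 16; the loop then assigns strands 12 and 13 to slots 0 and 1 of
// this block's region of blockIndxs, increments lBlk->nActive twice, and GrabWork
// returns true.  Once nextStrand reaches numStrands, the loop body never executes
// and GrabWork returns false.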

//! \brief Get a block of strands from the scheduler's queue pool.
//! \param sched global scheduler state
//! \param blockIndxs the strand indices for a strand block 
//! \param queue global scheduler's queue 
//! \param blocks global array of strand blocks
//! \param lBlk pointer to local storage for the strand block
inline void GetBlock (
    __global SchedState_t       *sched,
    __global int                *blockIndxs, 
    __global int                *queue, 
    __global StrandBlock_t      *blocks,
    __local StrandBlock_t       *lBlk)
{

  // Retrieve a block from the list 
    int blkIdx = atom_inc(&(sched->sId)); 

  // Check to see if the index retrieved is less than the 
  // number of blocks on the queue 
    if (blkIdx < sched->queueSize) {
      // Copy over the global data for the block into local memory 
        lBlk->blkIdx = (blocks[queue[blkIdx]]).blkIdx;        
        lBlk->nActive = (blocks[queue[blkIdx]]).nActive;      
        lBlk->nDead = (blocks[queue[blkIdx]]).nDead;        
        lBlk->nStabilizing = (blocks[queue[blkIdx]]).nStabilizing;  
        lBlk->nDying = (blocks[queue[blkIdx]]).nDying;     
        mem_fence(CLK_LOCAL_MEM_FENCE); 

      // If the block has no active strands then 
        if (lBlk->nActive == 0)  {

          // Check to see if there are any strands left to process.
          // If there are no strands left to process, then make this block inactive.
            if (!GrabWork(sched,blockIndxs,lBlk)) 
                lBlk->nActive = -1; 
            else
                blocks[lBlk->blkIdx].nActive = lBlk->nActive; 
        }    
    }
    else // if there are no queue items left, then notify the workgroup
        lBlk->nActive = -1; 
}
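// Note: setting nActive to -1 is the sentinel that both GetBlock and
// GetBlockFromTodoList use to tell the rest of the workgroup that no block
// (and no remaining work) is available; Diderot_CompactionKernel below checks
// bp->nActive > 0 before doing any work.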

//! \brief Copies the strand block back from __local space to __global space 
//! \param gBlock global strand block 
//! \param lBlock local strand block 
inline void StrandBlock_Copy (__global StrandBlock_t * gBlock, __local StrandBlock_t * lBlock) 
{
    gBlock->nActive = lBlock->nActive; 
    gBlock->nDead = lBlock->nDead; 
    gBlock->nStabilizing = lBlock->nStabilizing; 
    gBlock->nDying = lBlock->nDying; 
} 

//! \brief Get a block of strands from the scheduler's todo list.
//! \param sched global scheduler state
//! \param todoList global scheduler's todo queue 
//! \param blocks global array of strand blocks
//! \param lBlk pointer to local storage for the strand block
inline void GetBlockFromTodoList (
    __global SchedState_t       *sched,
    __global int                *todoList, 
    __global StrandBlock_t      *blocks,
    __local StrandBlock_t       *lBlk)
{
  // Retrieve a block from the todolist 
    int blkIdx = atom_inc(&(sched->sId)); 

  // Check to see if the index retrieved is less than the 
  // number of blocks on the todo queue 
    if (blkIdx < sched->todoSize) {
      // Copy over the global data for the block into local memory 
        lBlk->blkIdx = (blocks[todoList[blkIdx]]).blkIdx; 
        lBlk->nActive = (blocks[todoList[blkIdx]]).nActive;      
        lBlk->nDead = (blocks[todoList[blkIdx]]).nDead;        
        lBlk->nStabilizing = (blocks[todoList[blkIdx]]).nStabilizing;  
        lBlk->nDying = (blocks[todoList[blkIdx]]).nDying; 
        mem_fence(CLK_LOCAL_MEM_FENCE); 
    }
    else // if there are no queue items left, then notify the workgroup
        lBlk->nActive = -1; 

}

//! \brief A naive parallel prefix-scan algorithm
//! \param input the strand indices for a strand block 
//! \param status global scheduler's status array
//! \param output the pre-stable or pre-die arrays 
//! \param statusTy type of status for the output data 
//! \param n the number of strand indices 
//! \param temp pointer to local storage for the scan algorithm 
inline void scan (
    __global int        *input, 
    __global int        *status, 
    __local int         *output, 
    StrandStatus_t      statusTy, 
    int                 n, 
    __local int         *temp) 
{ 
    int thid = get_local_id(0); 
    int pout = 0, pin = 1; 

  // load input into local memory 
  // exclusive scan: shift the status flags right by one and set the first element
  // to 0, so the inclusive scan below counts the matching strands with a lower index
    if (thid == 0 || status[input[thid - 1]] != (int)statusTy) 
        temp[thid] = 0; 
    else 
        temp[thid] = 1;  
       
    barrier(CLK_LOCAL_MEM_FENCE); 

    for (int offset = 1; offset < n; offset *= 2) {  
        pout = 1 - pout; 
        pin = 1 - pout; 
        
        if(thid >= offset) 
            temp[pout * n + thid] = temp[pin * n + thid] + temp[pin * n + thid - offset]; 
        else 
            temp[pout * n + thid] = temp[pin * n + thid]; 
    
        barrier(CLK_LOCAL_MEM_FENCE); 
    } 
    output[thid] = temp[pout * n + thid]; 
}
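// Worked example (hypothetical values): for n = 8, suppose the strands at input
// positions 1, 4, and 5 have status statusTy.  The flags loaded into temp are
// then [0,0,1,0,0,1,1,0] and the scan produces [0,0,1,1,1,2,3,3], i.e. output[i]
// is the number of strands with index less than i whose status matches statusTy;
// this is how the preStable and preDead arrays in Diderot_CompactionKernel below
// are produced.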

// Scheduler Meta-Clearing Kernel: Clears the queue sizes and strand id accumulator 
//! \param sched global scheduler state
__kernel void Diderot_SchedUpdateKernel (__global SchedState_t *sched) 
{
    sched->sId = 0;
    if (sched->clearQueueSz == 1) {
        sched->clearQueueSz = 0; 
        sched->queueSize = 0;
    }
    else {
        sched->clearQueueSz = 1; 
        sched->todoSize = 0; 
    } 
}
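// Note: clearQueueSz toggles on every invocation, so successive calls alternate
// between resetting queueSize and resetting todoSize (sId is reset every time);
// presumably the scheduler ping-pongs between the block queue and the todo list
// across iterations.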

// Compaction Kernel: compact strands and replicate stable state
//! \param status global strand status array 
//! \param sched global scheduler state
//! \param blocks global array of strand blocks
//! \param blockIndxs the strand indices for a strand block 
//! \param queue global scheduler's queue 
//! \param todoList global scheduler's todo queue 
//! \param bp pointer to local storage for the strand block
//! \param preStable pointer to local storage for number of stable strands with lower index
//! \param preDead pointer to local storage for number of dead strands with lower index
//! \param prefixScanTemp pointer to local storage for prefix-scan algorithm 
__kernel __attribute__((reqd_work_group_size(BLK_SZ, 1, 1)))
void Diderot_CompactionKernel (
    __global int *status,          // strand status array
    __global SchedState_t *sched,       // scheduler state
    __global StrandBlock_t *blocks,     // array of scheduler blocks
    __global int *blockIndxs, 
    __global int *queue, 
    __global int *todoList,
    __local StrandBlock_t *bp, 
    __local int *preStable, 
    __local int *preDead, 
    __local int *prefixScanTemp)
{ 
    int id = get_local_id(0);   
    int qIdx; 

    do {
        if (id == 0) {
            bool done;
            do {
                GetBlockFromTodoList(sched,todoList,blocks,bp);
                done = true;
                if (bp->nActive > 0) {
                    bp->nActive = bp->nActive - (bp->nStabilizing + bp->nDying);
                    bp->nDead += bp->nDying;
                    if (bp->nActive == 0) {
                      // Check to see if this block can execute more strands for the
                      // next iteration
                        GrabWork(sched,blockIndxs,bp);
                        bp->nStabilizing = 0;
                        bp->nDying = 0;
                        StrandBlock_Copy(&blocks[bp->blkIdx],bp);
                        done = false;
                    }
                }
            } while (!done);
        }
        barrier (CLK_LOCAL_MEM_FENCE);

        if (bp->nActive > 0) {
           int idx = blockIndxs[bp->blkIdx * BLK_SZ + id]; 

           scan(blockIndxs + bp->blkIdx * BLK_SZ,status,preStable,DIDEROT_STABLE,BLK_SZ,prefixScanTemp); 
           barrier(CLK_LOCAL_MEM_FENCE); 
           scan(blockIndxs + bp->blkIdx * BLK_SZ,status,preDead,DIDEROT_DIE,BLK_SZ,prefixScanTemp); 
           barrier(CLK_LOCAL_MEM_FENCE); 

          // compaction
          // with these arrays, we can then compute the new index of each slot in
          // parallel as follows
            int newIdx; 
            switch (status[idx]) {
              case DIDEROT_ACTIVE:
                newIdx = id - preStable[id] - preDead[id];
                break;
              case DIDEROT_STABLE:
                newIdx = bp->nActive + preStable[id];
                break;
              case DIDEROT_DIE:
                newIdx = bp->nActive + preStable[id] + preDead[id];
                break;
            }
            blockIndxs[bp->blkIdx * BLK_SZ + newIdx] = idx; 
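          // Worked example (hypothetical values, ACTIVE and STABLE strands only):
          // with BLK_SZ = 4 and slot statuses [ACTIVE, STABLE, ACTIVE, STABLE],
          // bp->nActive is 2, preStable is [0,0,1,1], and preDead is [0,0,0,0],
          // so the write above packs the two active strands into slots 0 and 1 of
          // the block's region of blockIndxs and the two stable strands into
          // slots 2 and 3.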

            barrier (CLK_LOCAL_MEM_FENCE);  
            if (id == 0) {
              // put bp on scheduling queue
                qIdx = atom_inc(&(sched->queueSize)); 
                bp->nStabilizing = 0; 
                bp->nDying = 0; 
                StrandBlock_Copy(&blocks[bp->blkIdx],bp); 
                queue[qIdx] = bp->blkIdx; 
            }  
        } 
    } while (bp->nActive > 0);

}
