/*! \file cl-scheduler.h
 *
 * \author Lamont Samuels
 */

/*
 * COPYRIGHT (c) 2011 The Diderot Project (http://diderot-language.cs.uchicago.edu)
 * All rights reserved.
 */

#define DIDEROT_CL_VERSION 110

#if (DIDEROT_CL_VERSION == 100)
#  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
#  pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
#endif

#define BLK_SZ DIDEROT_CU_WIDTH

#define DIDEROT_SINGLE_PRECISION
#define DIDEROT_TARGET_CL
#include "Diderot/cl-diderot.h"

typedef struct {
    int blkIdx;             // the id of this block
    int nActive;            // number of active (status != DIE or STABLE) strands
    int nDead;              // number of strands in the DIE state
    int nStabilizing;       // number of new strands in the STABILIZE state
    int nDying;             // number of new strands in the DIE state
} StrandBlock_t;

typedef struct {
    int numStrands;         // number of strands
    int sId;                // the index accumulator for the todo list or queue
    int nextStrand;         // index of the next strand to retrieve from the pool
    int clearQueueSz;       // if 1, the next scheduler update clears the queue size;
                            // otherwise it clears the todo-list size
    int queueSize;          // number of blocks on the scheduler's queue
    int todoSize;           // number of blocks on the scheduler's todo list
    int numAvailable;       // number of active strands left to process
} SchedState_t;

//! \brief Retrieve a block's worth (BLK_SZ) of strands for a StrandBlock_t.
//! \param sched global scheduler state
//! \param blockIndxs global array of strand indices for the strand blocks
//! \param lBlk pointer to local storage for the strand block
//! \return true if strands were assigned to the strand block, otherwise false
inline bool GrabbedWork (
    __global SchedState_t *sched,
    __global int *blockIndxs,
    __local StrandBlock_t *lBlk)
{
  // claim a block's worth of strands from the pool
    int startStrandIdx = atom_add (&sched->nextStrand, BLK_SZ);
    int count = 0;
  // assign each claimed strand to a slot in this block
    for (int i = startStrandIdx;  i < sched->numStrands && count < BLK_SZ;  i++, count++) {
        lBlk->nActive++;
        blockIndxs[lBlk->blkIdx * BLK_SZ + count] = i;
    }
    return (count != 0);
}
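/* Worked example (illustrative values, not part of the runtime): with
 * BLK_SZ == 4, block 2 owns slots blockIndxs[8..11].  If numStrands == 10
 * and nextStrand == 8 when GrabbedWork is called, the atom_add claims
 * strand indices 8..11, but only strands 8 and 9 exist, so the loop writes
 *
 *      blockIndxs[8] = 8;  blockIndxs[9] = 9;
 *
 * bumps lBlk->nActive by 2 (it is 0 when a block is being refilled), and
 * returns true.  A later call that finds nextStrand >= numStrands assigns
 * nothing and returns false.
 */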
//! \brief Get a block of strands from the scheduler's queue.
//! \param sched global scheduler state
//! \param blockIndxs global array of strand indices for the strand blocks
//! \param queue global scheduler's queue
//! \param blocks global array of strand blocks
//! \param lBlk pointer to local storage for the strand block
inline void GetBlock (
    __global SchedState_t *sched,
    __global int *blockIndxs,
    __global int *queue,
    __global StrandBlock_t *blocks,
    __local StrandBlock_t *lBlk)
{
  // retrieve the index of the next block on the queue
    int blkIdx = atom_inc (&(sched->sId));

  // check that the index retrieved is less than the number of blocks on the queue
    if (blkIdx < sched->queueSize) {
      // copy the block's global data into local memory
        lBlk->blkIdx       = blocks[queue[blkIdx]].blkIdx;
        lBlk->nActive      = blocks[queue[blkIdx]].nActive;
        lBlk->nDead        = blocks[queue[blkIdx]].nDead;
        lBlk->nStabilizing = blocks[queue[blkIdx]].nStabilizing;
        lBlk->nDying       = blocks[queue[blkIdx]].nDying;
        mem_fence (CLK_LOCAL_MEM_FENCE);
      // if the block has no active strands, try to refill it from the pool
        if (lBlk->nActive == 0) {
          // if there are no strands left to process, then mark this block inactive
            if (! GrabbedWork (sched, blockIndxs, lBlk))
                lBlk->nActive = -1;
            else
                blocks[lBlk->blkIdx].nActive = lBlk->nActive;
        }
    }
    else // there are no blocks left on the queue, so notify the workgroup
        lBlk->nActive = -1;
}

//! \brief Copies a strand block from __local space back to __global space.
//! \param gBlock global strand block
//! \param lBlock local strand block
inline void StrandBlock_Copy (__global StrandBlock_t *gBlock, __local StrandBlock_t *lBlock)
{
    gBlock->nActive      = lBlock->nActive;
    gBlock->nDead        = lBlock->nDead;
    gBlock->nStabilizing = lBlock->nStabilizing;
    gBlock->nDying       = lBlock->nDying;
}

//! \brief Get a block of strands from the scheduler's todo list.
//! \param sched global scheduler state
//! \param todoList global scheduler's todo list
//! \param blocks global array of strand blocks
//! \param lBlk pointer to local storage for the strand block
inline void GetBlockFromTodoList (
    __global SchedState_t *sched,
    __global int *todoList,
    __global StrandBlock_t *blocks,
    __local StrandBlock_t *lBlk)
{
  // retrieve the index of the next block on the todo list
    int blkIdx = atom_inc (&(sched->sId));

  // check that the index retrieved is less than the number of blocks on the todo list
    if (blkIdx < sched->todoSize) {
      // copy the block's global data into local memory
        lBlk->blkIdx       = blocks[todoList[blkIdx]].blkIdx;
        lBlk->nActive      = blocks[todoList[blkIdx]].nActive;
        lBlk->nDead        = blocks[todoList[blkIdx]].nDead;
        lBlk->nStabilizing = blocks[todoList[blkIdx]].nStabilizing;
        lBlk->nDying       = blocks[todoList[blkIdx]].nDying;
        mem_fence (CLK_LOCAL_MEM_FENCE);
    }
    else // there are no blocks left on the todo list, so notify the workgroup
        lBlk->nActive = -1;
}

//! \brief A naive (Hillis-Steele) parallel exclusive prefix scan: output[i] is
//!        the number of strands in slots 0..i-1 whose status matches statusTy.
//! \param input the strand indices for a strand block
//! \param status global strand status array
//! \param output local storage for the result (the preStable or preDead array)
//! \param statusTy the status value being counted
//! \param n the number of strand indices (must equal the workgroup size)
//! \param temp local scratch storage for the scan; must hold 2*n ints (double buffered)
inline void scan (
    __global int *input,
    __global int *status,
    __local int *output,
    StrandStatus_t statusTy,
    int n,
    __local int *temp)
{
    int thid = get_local_id(0);
    int pout = 0, pin = 1;

  // load the status flags into local memory; for an exclusive scan,
  // shift right by one and set the first element to 0
    if (thid == 0 || status[input[thid - 1]] != (int)statusTy)
        temp[thid] = 0;
    else
        temp[thid] = 1;
    barrier (CLK_LOCAL_MEM_FENCE);

    for (int offset = 1;  offset < n;  offset *= 2) {
      // swap the two halves of the double buffer
        pout = 1 - pout;
        pin = 1 - pout;
        if (thid >= offset)
            temp[pout * n + thid] = temp[pin * n + thid] + temp[pin * n + thid - offset];
        else
            temp[pout * n + thid] = temp[pin * n + thid];
        barrier (CLK_LOCAL_MEM_FENCE);
    }
    output[thid] = temp[pout * n + thid];
}
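/* Worked example (illustrative values, not part of the runtime): with
 * n == 8 and statusTy == DIDEROT_STABLE, suppose the strands in slots
 * 0..7 have stable-flags
 *
 *      slot:    0  1  2  3  4  5  6  7
 *      stable:  0  1  1  0  0  1  0  0
 *
 * then the exclusive scan produces output[i] = the number of stable
 * strands in slots 0..i-1:
 *
 *      output:  0  0  1  2  2  2  3  3
 *
 * The compaction kernel below runs this scan twice (once for STABLE and
 * once for DIE) and combines the two counts to compute each strand's new
 * slot within its block.
 */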
//! \brief Clears the queue sizes and the strand-block id accumulator.
//! \param sched global scheduler state
__kernel void Diderot_SchedUpdateKernel (__global SchedState_t *sched)
{
    sched->sId = 0;
    if (sched->clearQueueSz == 1) {
        sched->clearQueueSz = 0;
        sched->queueSize = 0;
    }
    else {
        sched->clearQueueSz = 1;
        sched->todoSize = 0;
    }
}

//! \brief Compaction kernel: compact strands and replicate stable state.
//! \param status global strand status array
//! \param sched global scheduler state
//! \param blocks global array of strand blocks
//! \param blockIndxs global array of strand indices for the strand blocks
//! \param queue global scheduler's queue
//! \param todoList global scheduler's todo list
//! \param bp pointer to local storage for the strand block
//! \param preStable local storage for the number of stable strands with a lower index
//! \param preDead local storage for the number of dead strands with a lower index
//! \param prefixScanTemp local scratch storage for the prefix-scan algorithm
__kernel __attribute__((reqd_work_group_size(BLK_SZ, 1, 1)))
void Diderot_CompactionKernel (
    __global int *status,               // strand status array
    __global SchedState_t *sched,       // scheduler state
    __global StrandBlock_t *blocks,     // array of scheduler blocks
    __global int *blockIndxs,
    __global int *queue,
    __global int *todoList,
    __local StrandBlock_t *bp,
    __local int *preStable,
    __local int *preDead,
    __local int *prefixScanTemp)
{
    int id = get_local_id(0);
    int qIdx;
    do {
        if (id == 0) {
            bool done;
            do {
                GetBlockFromTodoList (sched, todoList, blocks, bp);
                done = true;
                if (bp->nActive > 0) {
                  // retire the strands that stabilized or died in the last step
                    bp->nActive -= (bp->nStabilizing + bp->nDying);
                    bp->nDead += bp->nDying;
                    if (bp->nActive == 0) {
                      // check whether this block can execute more strands in the next iteration
                        GrabbedWork (sched, blockIndxs, bp);
                        bp->nStabilizing = 0;
                        bp->nDying = 0;
                        StrandBlock_Copy (&blocks[bp->blkIdx], bp);
                        done = false;
                    }
                }
            } while (!done);
        }
      // the global fence is needed too, since work-item 0 may have updated blockIndxs
        barrier (CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
        if (bp->nActive > 0) {
            int idx = blockIndxs[bp->blkIdx * BLK_SZ + id];
            scan (blockIndxs + bp->blkIdx * BLK_SZ, status, preStable, DIDEROT_STABLE, BLK_SZ, prefixScanTemp);
            barrier (CLK_LOCAL_MEM_FENCE);
            scan (blockIndxs + bp->blkIdx * BLK_SZ, status, preDead, DIDEROT_DIE, BLK_SZ, prefixScanTemp);
            barrier (CLK_LOCAL_MEM_FENCE);
          // compaction: with the prefix counts, each work-item computes its strand's
          // new slot in parallel.  Note that we must not overwrite id itself, since
          // it indexes this work-item's slot on the next loop iteration.
            int newId = id;
            switch (status[idx]) {
              case DIDEROT_ACTIVE:      // active strands are packed at the front of the block
                newId = id - preStable[id] - preDead[id];
                break;
              case DIDEROT_STABLE:      // stable strands follow the active ones
                newId = bp->nActive + preStable[id];
                break;
              case DIDEROT_DIE:         // dying strands go at the end of the block
                newId = bp->nActive + bp->nStabilizing + preDead[id];
                break;
            }
            blockIndxs[bp->blkIdx * BLK_SZ + newId] = idx;
          // again, a global fence is needed to make the blockIndxs writes visible
            barrier (CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
            if (id == 0) {
              // put bp on the scheduling queue
                qIdx = atom_inc (&(sched->queueSize));
                bp->nStabilizing = 0;
                bp->nDying = 0;
                StrandBlock_Copy (&blocks[bp->blkIdx], bp);
                queue[qIdx] = bp->blkIdx;
            }
        }
    } while (bp->nActive > 0);
}
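/* Host-side usage sketch (hypothetical; the real driver lives in the Diderot
 * runtime, not in this file).  One scheduler iteration first resets the queue
 * state and then compacts the strand blocks; the kernel and variable names
 * below are illustrative only:
 *
 *      clEnqueueTask (cmdQ, schedUpdateKernel, 0, NULL, NULL);
 *      size_t localSz = BLK_SZ;
 *      size_t globalSz = numWorkGroups * BLK_SZ;
 *      clEnqueueNDRangeKernel (cmdQ, compactionKernel, 1, NULL,
 *                              &globalSz, &localSz, 0, NULL, NULL);
 *      clFinish (cmdQ);
 *
 * Because Diderot_SchedUpdateKernel toggles clearQueueSz, successive
 * invocations alternately reset the queue and the todo list, so the two
 * lists ping-pong between scheduler iterations.
 */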