Home My Page Projects Code Snippets Project Openings diderot
Summary Activity Tracker Tasks SCM

SCM Repository

[diderot] View of /branches/pure-cfg/src/lib/cl-target/main.c
ViewVC logotype

View of /branches/pure-cfg/src/lib/cl-target/main.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3349 - (download) (as text) (annotate)
Tue Oct 27 15:16:36 2015 UTC (3 years, 11 months ago) by jhr
File size: 24497 byte(s)
making copyrights consistent for all code in the repository
/*! \file main.c
 *
 * \author John Reppy
 */

/*
 * This code is part of the Diderot Project (http://diderot-language.cs.uchicago.edu)
 *
 * COPYRIGHT (c) 2015 The University of Chicago
 * All rights reserved.
 */

#include <Diderot/diderot.h>
#include "clinfo.h"
#include <sys/sysctl.h>
#include <sys/stat.h>

typedef struct {
    cl_int              blkIdx;         //!< the id of this block 
    cl_int              nActive;        //!< number of active (status != DIE or STABLE) strands 
    cl_int              nDead;          //!< number of strands in the DIE state
    cl_int              nStabilizing;   //!< number of new strands in the STABILIZE state
    cl_int              nDying;         //!< number of new strands in the DIE state
} StrandBlock_t;

typedef struct { 
    cl_int              numStrands;     //!< number of strands 
    cl_int              sId;            //!< the index into the todo list or queue 
    cl_int              nextStrand;     //!< index of the next strand to retrieve from the pool
    cl_int              clearQueueSz;   //!< an indicator on whether the queue size should be cleared
    cl_int              queueSize;      //!< number of blocks on the scheduler's queue  
    cl_int              todoSize;       //!< number of blocks on the scheduler's todo list
    cl_int              numAvailable;   //!< number of active strands left to process 
} SchedState_t; 

typedef struct {
    cl_kernel           kern;           //!< OpenCL kernel that implements the program
    size_t              workGrpSize;    //!< size of workgroup for this kernel
    cl_ulong            localSzb;       //!< size of local memory used by kernel
} GPUKernel_t;

struct struct_world {
    STRUCT_WORLD_PREFIX
// FIXME: document these fields!
    void                *inState;
    void                *outState;
    DeviceInfo_t        *device;        //!< info about OpenCL device that we are using.
    uint32_t            nWorkers;       //!< number of work groups to create
    cl_context          context;        //!< OpenCL execution context
    cl_command_queue    cmdQ;           //!< OpenCL command queue
    GPUKernel_t         kernel[3];      //!< OpenCL kernel that implements the program
};

// FIXME: need documentation for this stuct!
typedef struct {
   cl_mem       schedMem; 
   cl_mem       outMem; 
   cl_mem       inMem; 
   cl_mem       blocksMem; 
   cl_mem       strandBlocksIdxsMem; 
   cl_mem       queueMem; 
   cl_mem       todoMem; 
   cl_mem       statusMem;
} KernelArgs_t;

static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld);
static void CheckErrorCode (cl_int sts, const char * msg);
static void SetPhase1Args (cl_kernel kernel, int *argCount, KernelArgs_t *args); 
static void SetPhase2Args (cl_kernel kernel, int *argCount, KernelArgs_t *args, int blk_sz);
static void SetScheduleKernelArgs (cl_kernel kernel, int *argCount, KernelArgs_t *args); 

static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern);

extern void Diderot_LoadGlobals (cl_context cxt, cl_kernel kernel, cl_command_queue cmdQ, int argStart);

int main (int argc, const char **argv)
{
  // get information about OpenCL support
    CLInfo_t *clInfo = GetCLInfo();
    if (clInfo == 0) {
        fprintf (stderr, "no OpenCL support\n");
        exit (1);
    }

    // run the generated global initialization code
    if (VerboseFlg)
        fprintf (stderr, "initializing globals ...\n");
    Diderot_InitGlobals ();
    
    Diderot_int_t nWorkersPerCU = 4;
    Diderot_Options_t *opts = Diderot_OptNew ();
    Diderot_OptAddInt (opts, "np", "specify number of workers/CU", &nWorkersPerCU, true);
    Diderot_RegisterGlobalOpts (opts);
    Diderot_OptProcess (opts, argc, argv);
    Diderot_OptFree (opts);

    if (VerboseFlg)
        PrintCLInfo (stdout, clInfo);

    Diderot_World_t *wrld = Diderot_Initially ();  // this may not be right for OpenCL

    if (! InitCL(clInfo, wrld))
        exit (1);

  // set the number of work groups
    if ((0 < nWorkersPerCU) && (nWorkersPerCU < wrld->device->maxWISize[0] / wrld->device->cuWidth))
        wrld->nWorkers = nWorkersPerCU * wrld->device->numCUs;
    else
        wrld->nWorkers = wrld->device->numCUs;  // fallback to one worker per CU
    if (VerboseFlg)
        fprintf (stderr, "using %d x %d threads\n",
            wrld->nWorkers, wrld->device->cuWidth);

  // Conversion of strands from their host types to their shadow types
    void *shadowInState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands); 
    void *shadowOutState = CheckedAlloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands); 
    uint8_t *strandPtr = (uint8_t *)wrld->inState;
    uint8_t *strandShadowPtr = (uint8_t *)shadowInState;
    uint8_t *strandShadowOutPtr = (uint8_t *)shadowOutState;
    size_t shadowSize = Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands; 
    
    for (int i = 0;  i < wrld->numStrands;  i++) {
        Diderot_Strands[0]->strandCopy (strandPtr, strandShadowPtr);
        Diderot_Strands[0]->strandCopy (strandPtr, strandShadowOutPtr); 
        strandPtr += Diderot_Strands[0]->stateSzb;
        strandShadowPtr += Diderot_Strands[0]->shadowStrandSzb;
        strandShadowOutPtr += Diderot_Strands[0]->shadowStrandSzb;
    }
// FIXME: it is confusing to use the inState/outState pointers for two different purposes.
    free (wrld->inState); 
    free (wrld->outState); 
    wrld->inState = shadowInState; 
    wrld->outState = shadowOutState; 

  // hack to make the invariant part of the state the same in both copies
    memcpy (wrld->outState, wrld->inState, shadowSize);

    int argCount = 0;
    cl_int sts = CL_SUCCESS;  

    KernelArgs_t kernelArgs;
    SchedState_t scheduler; 

    scheduler.numStrands = wrld->numStrands; 
    scheduler.nextStrand = 0; 
    scheduler.todoSize = 0; 
    scheduler.sId = 0;   
    scheduler.clearQueueSz= 1; 
    scheduler.numAvailable = wrld->numStrands; 

    size_t globalWorkSize[1] = {0};
    size_t localWorkSize[1] = {0}; 
    globalWorkSize[0] = wrld->nWorkers * wrld->device->cuWidth; 
    localWorkSize[0] = wrld->device->cuWidth;

  // compute the number of scheduler blocks
    int numberOfBlocks = (wrld->numStrands + wrld->device->cuWidth - 1) / wrld->device->cuWidth;
    assert (numberOfBlocks * wrld->device->cuWidth >= wrld->numStrands);
    
    size_t strandBlkMemSize = sizeof(int) * numberOfBlocks * wrld->device->cuWidth;
    size_t schedListMemSize = sizeof(int) * numberOfBlocks; 
    int *schedulerQueue = (int *)CheckedAlloc(schedListMemSize); 
    int *schedulerTodoList = (int *)CheckedAlloc(schedListMemSize); 
    int *strandBlocksIdxs = (int *)CheckedAlloc(strandBlkMemSize);     
    scheduler.queueSize = numberOfBlocks; 

    size_t strandblkMemSize = sizeof(StrandBlock_t) * numberOfBlocks;         
    StrandBlock_t *strandBlks = (StrandBlock_t *)CheckedAlloc(strandblkMemSize); 

    for (int i = 0; i < numberOfBlocks; i++) { 
        schedulerQueue[i] = i; 
        strandBlks[i].blkIdx = i;    
        strandBlks[i].nActive = 0; 
        strandBlks[i].nDead = 0;      
        strandBlks[i].nStabilizing = 0;  
        strandBlks[i].nDying = 0;
    }

    kernelArgs.schedMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        sizeof(SchedState_t), &scheduler, &sts);
    CheckErrorCode (sts, "error creating OpenCL scheduler buffer\n"); 

    kernelArgs.blocksMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        strandblkMemSize, strandBlks , &sts);
    CheckErrorCode (sts, "error creating OpenCL strand blocks buffer\n"); 

    kernelArgs.strandBlocksIdxsMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        strandBlkMemSize, strandBlocksIdxs , &sts);
    CheckErrorCode (sts, "error creating OpenCL strand blocks indices buffer\n"); 

    kernelArgs.queueMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        schedListMemSize, schedulerQueue , &sts);
    CheckErrorCode (sts, "error creating OpenCL scheduler queue buffer\n"); 

    kernelArgs.todoMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        schedListMemSize, schedulerTodoList , &sts);
    CheckErrorCode (sts, "error creating OpenCL scheduler todo buffer\n"); 

    kernelArgs.inMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        shadowSize, wrld->inState, &sts);
    CheckErrorCode (sts, "error creating OpenCL strand in-state buffer\n"); 

    kernelArgs.outMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        shadowSize, wrld->outState, &sts);
    CheckErrorCode (sts, "error creating OpenCL strand out-state buffer\n"); 

    kernelArgs.statusMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        sizeof(int) * wrld->numStrands, wrld->status, &sts);
    CheckErrorCode (sts, "error creating OpenCL world status buffer\n"); 

    //Setup the Update Kernel's arguments 
    SetPhase1Args (wrld->kernel[0].kern, &argCount, &kernelArgs); 
    Diderot_LoadGlobals (wrld->context, wrld->kernel[0].kern, wrld->cmdQ, argCount); 

    //Setup the Compaction Kernel's arguments 
    argCount=0;
    SetPhase2Args (wrld->kernel[1].kern, &argCount, &kernelArgs, wrld->device->cuWidth); 

    //Setup the Scheduler Kernel's arguments 
    argCount = 0; 
    SetScheduleKernelArgs (wrld->kernel[2].kern, &argCount, &kernelArgs); 

    double t0 = airTime();

    while (scheduler.numAvailable > 0) { 
      // Runs the update kernel on all  strands 
        sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[0].kern, 1, NULL,
                globalWorkSize, localWorkSize, 0, NULL, NULL); 
        CheckErrorCode (sts, "error in executing update kernel\n");

        sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern, 0, NULL, NULL); 
        CheckErrorCode (sts, "error in executing scheduler update kernel before compaction\n");
 
      // Run the compaction kernel on all strands 
        sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[1].kern, 1, NULL,
                globalWorkSize, localWorkSize, 0, NULL, NULL);
        CheckErrorCode (sts, "error ccccn executing compaction kernel\n");

        sts = clEnqueueTask(wrld->cmdQ,wrld->kernel[2].kern,0,NULL,NULL); 
        CheckErrorCode (sts, "error in executing scheduler update kernel after compaction\n");

        sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.schedMem, CL_TRUE, 0, sizeof(SchedState_t),
                &scheduler, 0, NULL, NULL); 
        CheckErrorCode (sts, "error reading back scheduler information\n");
    }

    sts = clEnqueueReadBuffer(wrld->cmdQ, kernelArgs.outMem, CL_TRUE, 0,shadowSize,
        wrld->outState, 0, NULL, NULL);
    CheckErrorCode (sts, "error in reading back output\n");

    sts = clEnqueueReadBuffer(
        wrld->cmdQ, kernelArgs.statusMem, CL_TRUE, 0, sizeof(int) * wrld->numStrands,
        wrld->status, 0, NULL, NULL);
    CheckErrorCode (sts, "error in reading back status\n");

    wrld->outputSzb = Diderot_Strands[0]->shadowStrandSzb; 
    clFinish (wrld->cmdQ); 
    clReleaseKernel(wrld->kernel[0].kern);
    clReleaseKernel(wrld->kernel[1].kern);
    clReleaseCommandQueue(wrld->cmdQ);
    clReleaseContext(wrld->context);

    double totalTime = airTime() - t0;

    if (VerboseFlg)
        fprintf (stderr, "done in %f seconds\n", totalTime);
    else if (TimingFlg)
        printf ("usr=%f\n", totalTime);

  // here we have the final state of all of the strands in the "in" buffer
  // output the final strand states
    if (NrrdOutputFlg)
        Diderot_Output (wrld, Diderot_Strands[0]->shadowStrandSzb);
    else
        Diderot_Print (wrld);

    Diderot_Shutdown (wrld);

    return 0;

}

static void SetPhase1Args (cl_kernel kernel, int *argCount, KernelArgs_t *args)
{ 
    int count = *argCount;
    cl_int sts = CL_SUCCESS;  

    sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->inMem);
    CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand in-state argument\n"); 

    sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->outMem);
    CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand out-state argument\n"); 

    sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem);
    CheckErrorCode (sts, "Update Kernel: error Setting OpenCL world status argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->schedMem);
    CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->blocksMem);
    CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->strandBlocksIdxsMem);
    CheckErrorCode (sts, "Update Kernel: error Setting OpenCL strand blocks' indices argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->queueMem);
    CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler queue argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->todoMem);
    CheckErrorCode (sts, "Update Kernel: error Setting OpenCL scheduler todo argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(StrandBlock_t), NULL);
    CheckErrorCode (sts, "Update Kernel:  error Setting OpenCL local strand blockargument\n"); 

    *argCount = count;

}

static void SetPhase2Args (cl_kernel kernel, int *argCount, KernelArgs_t *args, int blk_sz)
{
    int count = *argCount;
    cl_int sts = CL_SUCCESS;

    sts = clSetKernelArg (kernel, count++, sizeof(cl_mem), &args->statusMem);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL world status argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->schedMem);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL scheduler argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->blocksMem);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL strand blocks argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->strandBlocksIdxsMem);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL strand blocks' indices argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->queueMem);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL scheduler queue argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->todoMem);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL scheduler todo argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(StrandBlock_t), NULL);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL local strand blockargument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(int) * blk_sz, NULL);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL local preStable argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(int) * blk_sz, NULL);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL local preDead argument\n"); 

    sts = clSetKernelArg (kernel,  count++, sizeof(int) * blk_sz * 2, NULL);
    CheckErrorCode (sts, "Compaction Kernel:  error Setting OpenCL local temporary array for the prefix scan of preStable preDead argument\n"); 


    *argCount = count;
}

static void SetScheduleKernelArgs(cl_kernel kernel, int *argCount, KernelArgs_t *args) 
{
    int count = *argCount;
    cl_int sts = CL_SUCCESS;
 
    sts = clSetKernelArg (kernel,  count++, sizeof(cl_mem), &args->schedMem);
    CheckErrorCode (sts, "Scheduler Kernel: error setting OpenCL scheduler argument\n"); 

    *argCount = count; 
}

static void CheckErrorCode (cl_int sts, const char *msg) 
{ 
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "%s", msg);
        exit(1); 
    } 
}

/*! \brief load OpenCL code from a file
 */
static char *LoadSource (const char *filename) 
{  
    struct stat statbuf;
    if (stat(filename, &statbuf) < 0) {
        fprintf (stderr, "unable to stat OpenCL source file %s\n", filename);
        exit (1);
    }

    char *source = (char *)CheckedAlloc(statbuf.st_size + 1);
    if (source == 0) {
        fprintf (stderr, "unable to allocate memory for OpenCL source\n");
        exit (1);
    }

    FILE *fh = fopen(filename, "r");
    if ((fh == 0)
    ||  (fread(source, statbuf.st_size, 1, fh) != 1)) {
        fprintf (stderr, "unable to read OpenCL source from %s\n", filename);
        exit (1);
    }
    source[statbuf.st_size] = '\0';
    fclose (fh);
    
    return source; 
}

//! create a kernel object from a program
static bool CreateKernel (cl_device_id dev, cl_program prog, const char *name, GPUKernel_t *kern)
{
    cl_int sts;

    kern->kern = clCreateKernel(prog, name, &sts);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error getting %s from program\n", name);
        return false;
    }
    sts = clGetKernelWorkGroupInfo (
        kern->kern, dev, CL_KERNEL_WORK_GROUP_SIZE,
        sizeof(size_t), &(kern->workGrpSize), 0);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error getting workgroup size for %s\n", name);
        return false;
    }
    sts = clGetKernelWorkGroupInfo (
        kern->kern, dev, CL_KERNEL_LOCAL_MEM_SIZE,
        sizeof(cl_ulong), &(kern->localSzb), 0);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error getting local memory size for %s\n", name);
        return false;
    }

    if (VerboseFlg) {
        fprintf(stderr, "kernel %s: workgroup size = %d, local memory = %d bytes\n",
                name, (int)kern->workGrpSize, (int)kern->localSzb);
    }

    return true;
}

static void LogMessagesToStderr (const char *errstr, const void *private_info, size_t cb, void *user_data)
{
    fprintf(stderr, "***** error log *****\n"); 
    fprintf(stderr, "%s\n", errstr); 
    fprintf(stderr, "***** end error log *****\n"); 
}

/*! \brief initialize the OpenCL execution context, including loading and compiling the OpenCL
 *         program.
 *  \param clInfo points to the summary information about the available OpenCL devices.
 *  \param wrld the Diderot execution information.
 *  \return true if initialization is successful, otherwise false.
 */
static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld)
{
    cl_int              sts;
    int                 pltIx = 0;  // main patform index

  // find a GPU on platform[0]
    DeviceInfo_t *dev = 0;
    int i;
    for (i = 0;  i < clInfo->platforms[pltIx].numDevices;  i++) {
        if (isGPUDevice (&(clInfo->platforms[pltIx].devices[i]))
        &&  clInfo->platforms[pltIx].devices[i].isAvail) {
            dev = &(clInfo->platforms[pltIx].devices[i]);
            break;
        }
    }

    if (dev == 0) {
        fprintf (stderr, "unable to find GPU device\n");
        return false;
    }

    if (VerboseFlg) {
        fprintf (stderr, "using platform %d, device %d: %s\n",
            pltIx, i, clInfo->platforms[0].devices[i].name);
    }

  // create the context
    cl_context cxt = clCreateContext(0, 1, &(dev->id), LogMessagesToStderr, 0, &sts);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error creating OpenCL context\n");
        return false;
    }

  // create the program from the source
    int fnameLen = strlen(wrld->name) + 4;  // name + ".cl\0"
    char *fname = (char *)CheckedAlloc(fnameLen);
    snprintf(fname, fnameLen, "%s.cl", wrld->name);
    char *updateSource = LoadSource (fname);
    free (fname);

    const char *src[1] = {updateSource};
    cl_program prog = clCreateProgramWithSource(cxt, 1, src, NULL, &sts);
    free (updateSource);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error creating program\n");
        return false;
    }

  // build the program
    char options[1024];
    snprintf (options, sizeof(options),
        "-D DIDEROT_CL_VERSION=%d -D DIDEROT_CU_WIDTH=%d -I %s -w",
        100*dev->majorVersion + dev->minorVersion,
        dev->cuWidth,
        DIDEROT_INCLUDE_PATH);
    if (VerboseFlg) {
        fprintf (stderr, "clBuildProgram options: %s\n", options);
    }
    sts = clBuildProgram (prog, 1, &(dev->id), options, 0, 0);
    if (sts != CL_SUCCESS) {
        size_t logSzb;
        clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, 0, 0, &logSzb);
        char *log = CheckedAlloc(logSzb+1);
        clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, logSzb, log, &logSzb);
        log[logSzb] = '\0';
        fprintf (stderr, "error compiling program:\n%s\n", log);
        free (log);
        return false;
    }

  // extract the kernels from the program
    if ((! CreateKernel (dev->id, prog, "Diderot_UpdateKernel", &(wrld->kernel[0])))
    ||  (! CreateKernel (dev->id, prog, "Diderot_CompactionKernel", &(wrld->kernel[1])))
    ||  (! CreateKernel (dev->id, prog, "Diderot_SchedUpdateKernel", &(wrld->kernel[2]))))
        return false;

  // create the command queue
    cl_command_queue q = clCreateCommandQueue(cxt, dev->id, 0, &sts);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error creating OpenCL command queue\n");
        return false;
    }

  // initialize world info
    wrld->device = dev;
    wrld->context = cxt;
    wrld->cmdQ = q;

    return true;
}

// this should be the part of the scheduler
void *Diderot_AllocStrand (Strand_t *strand)
{
    return CheckedAlloc(strand->stateSzb);
}

// block allocation of an initial collection of strands
Diderot_World_t *Diderot_AllocInitially (
    const char *name,           // the name of the program
    Strand_t *strand,           // the type of strands being allocated
    bool isArray,               // is the initialization an array or collection?
    uint32_t nDims,             // depth of iteration nesting
    int32_t *base,              // nDims array of base indices
    uint32_t *size)             // nDims array of iteration sizes
{
    Diderot_World_t *wrld = NEW(Diderot_World_t);
    if (wrld == 0) {
        fprintf (stderr, "unable to allocate world\n");
        exit (1);
    }

    wrld->name = name; /* NOTE: we are assuming that name is statically allocated! */
    wrld->isArray = isArray;
    wrld->nDims = nDims;
    wrld->base = NEWVEC(int32_t, nDims);
    wrld->size = NEWVEC(uint32_t, nDims);
    size_t numStrands = 1;
    for (int i = 0;  i < wrld->nDims;  i++) {
        numStrands *= size[i];
        wrld->base[i] = base[i];
        wrld->size[i] = size[i];
    }

    if (VerboseFlg) {
        printf("AllocInitially: %d", size[0]);
        for (int i = 1;  i < nDims;  i++) printf(" x %d", size[i]);
        printf("\n");
    }

  // allocate the strand state pointers
    wrld->numStrands = numStrands;
    wrld->inState = CheckedAlloc (strand->stateSzb * numStrands); 
    wrld->outState = CheckedAlloc (strand->shadowStrandSzb * numStrands); 
    wrld->status = NEWVEC(StatusInt_t, numStrands);

  // initialize strand state pointers etc.
    for (size_t i = 0;  i < numStrands;  i++) {
        wrld->status[i] = DIDEROT_ACTIVE;
    }

    return wrld;

}

// get strand state pointers
void *Diderot_InState (Diderot_World_t *wrld, uint32_t i)
{
    assert (i < wrld->numStrands);
    return wrld->inState + (i * Diderot_Strands[0]->stateSzb);
}

void *Diderot_OutState (Diderot_World_t *wrld, uint32_t i)
{
    assert (i < wrld->numStrands);
// FIXME: this function is called when outState points to the shadow state,
// but it is confusing to me  -- JHR
    return wrld->outState + (i * Diderot_Strands[0]->shadowStrandSzb);
}

/***** Support for shadow image values *****/

void ShadowImage1D (cl_context cxt, Shadow_image1D_t *dst, Diderot_image1D_t *src)
{
    dst->size[0] = src->size[0];
    dst->s = src->s;
    dst->t = src->t;
}

void ShadowImage2D (cl_context cxt, Shadow_image2D_t *dst, Diderot_image2D_t *src)
{
   dst->size[0] = src->size[0];
    dst->size[1] = src->size[1];
    ShadowMat2x2 (dst->w2i, src->w2i);
    ShadowVec2 (&dst->tVec, src->tVec);
    ShadowMat2x2 (dst->w2iT, src->w2iT);
}

void ShadowImage3D (cl_context cxt, Shadow_image3D_t *dst, Diderot_image3D_t *src)
{
    dst->size[0] = src->size[0];
    dst->size[1] = src->size[1];
    dst->size[2] = src->size[2];
    ShadowMat3x3 (dst->w2i, src->w2i);
    ShadowVec3 (&dst->tVec, src->tVec);
    ShadowMat3x3 (dst->w2iT, src->w2iT);  
}

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0