Home My Page Projects Code Snippets Project Openings diderot
Summary Activity Tracker Tasks SCM

SCM Repository

[diderot] View of /branches/pure-cfg/src/lib/cl-target/main.c
ViewVC logotype

View of /branches/pure-cfg/src/lib/cl-target/main.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1462 - (download) (as text) (annotate)
Tue Aug 9 07:22:45 2011 UTC (8 years, 1 month ago) by lamonts
File size: 20390 byte(s)
Added Support for converting host strands to its shadow types 
/*! \file main.c
 *
 * \author John Reppy
 */

/*
 * COPYRIGHT (c) 2011 The Diderot Project (http://diderot-language.cs.uchicago.edu)
 * All rights reserved.
 */

#include <Diderot/diderot.h>
#include "clinfo.h"
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <sys/sysctl.h>
#include <sys/stat.h>

// NOTE: we probably should put this in a file that supports runtime printing
static bool     VerboseFlg = false;
static bool     TimingFlg = false;

static char * globalSource = "\
__kernel void Dideort_GlobalKernel(int num_work_groups,\
                                   int num_strands, __global int * workers, \
                                   int workers_per_group, int limit) \
{ \
    int nextStrand = 0, index = 0; \
   \
    for(int idx = 0; idx < num_work_groups; idx++) \
    {\
       workers[idx] = nextStrand;   \
       nextStrand =  nextStrand + (limit * workers_per_group);\
    }\
}"; 

struct struct_world {
    const char          *name;          // the program name
    bool                isArray;        // is the initialization an array or collection?
    uint32_t            nDims;          // depth of iteration nesting
    int32_t             *base;          // nDims array of base indices
    uint32_t            *size;          // nDims array of iteration sizes
    uint32_t            numStrands;     // number of strands in the world
    unsigned int        strandSize;     // the sizeof of the strand buffers
    void                *inState;
    void                *outState;
    int                 *status;        // array of strand status flags
    cl_device_id        device;         // OpenCL device
    cl_context          context;        // OpenCL execution context
    cl_command_queue    cmdQ;           // OpenCL command queue
    cl_kernel           kernel[2];         // OpenCL Kernel that implements the program
};
typedef struct {
   cl_mem stateOutMem; 
   cl_mem statusMem; 
   cl_mem workerQueueMem; 
   cl_mem numActiveMem; 
   cl_int numStrands; 
   cl_int limit;  
}UpdateKernel_Args;

typedef struct { 
   cl_int numberOfWorkGroups; 
   cl_int numStrands; 
   cl_mem workerQueueMem; 
   cl_int workGroupSize; 
   cl_int limit; 
}GlobalKernel_Args; 

static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld);
static void checkErrorCode (cl_int sts, const char * msg); 
static void setupGlobalKernelArgs(cl_kernel kernel, int argCount, GlobalKernel_Args * args); 
static void setUpdateKernelArgs( cl_kernel kernel, int  * argCount, UpdateKernel_Args * args); 
extern void Diderot_LoadGlobals (cl_context cxt, cl_kernel kernel, cl_command_queue cmdQ, int argStart);

int main (int argc, const char **argv)
{
  // get information about OpenCL support
    CLInfo_t *clInfo = GetCLInfo();
    if (clInfo == 0) {
        fprintf (stderr, "no OpenCL support\n");
        exit (1);
    }

    Diderot_Options_t *opts = Diderot_OptNew ();

    Diderot_OptAddFlag (opts, "verbose", "enable runtime-system messages", &VerboseFlg);
    Diderot_OptAddFlag (opts, "timing", "enable execution timing", &TimingFlg);
    Diderot_RegisterGlobalOpts (opts);
    Diderot_OptProcess (opts, argc, argv);
    Diderot_OptFree (opts);

    if (VerboseFlg)
	PrintCLInfo (stdout, clInfo);

  // run the generated global initialization code
    if (VerboseFlg)
        fprintf (stderr, "initializing globals ...\n");
    Diderot_InitGlobals ();

  /***** FIXME: OpenCL specific stuff goes here.  Things to do:
   **
   ** 1) copy data to GPU
   ** 2) initialize strands
   ** 3) run strands to termination
   ** 4) load results from GPU
   **/
    
    Diderot_World_t *wrld = Diderot_Initially ();  // this may not be right for OpenCL


    if (! InitCL(clInfo, wrld))
        exit (1);

    
    /* Conversion of strands from their host types to their shadow types */ 
    void * shadowInState = malloc(Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands); 
    uint8_t *strandPtr = (uint8_t *)wrld->inState;
    uint8_t *strandShadowPtr = (uint8_t *)shadowInState;
    size_t shadowSize = Diderot_Strands[0]->shadowStrandSzb * wrld->numStrands; 
    
    for (int i = 0;  i < wrld->numStrands;  i++, strandPtr += Diderot_Strands[0]->stateSzb,strandShadowPtr+= Diderot_Strands[0]->shadowStrandSzb) {
            Diderot_Strands[0]->strandCopy (strandPtr,strandShadowPtr);
    }
    free((uint8_t *)wrld->inState); 
    wrld->inState = shadowInState; 

  

    // hack to make the invariant part of the state the same in both copies
    memcpy (wrld->outState, wrld->inState, shadowSize);



    int argCount = 0;
    cl_int sts = CL_SUCCESS;  
    UpdateKernel_Args updateArgs; 
    GlobalKernel_Args globalArgs; 
    

    updateArgs.stateOutMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        shadowSize, wrld->outState, &sts);
    checkErrorCode(sts,"error creating OpenCL strand in-state buffer\n"); 


    updateArgs.statusMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        sizeof(int) * wrld->numStrands, wrld->status, &sts);
    checkErrorCode(sts,"error creating OpenCL world status buffer\n"); 


    size_t globalWorkSize[3] = {0, 0, 0};
    size_t localWorkSize[3] = {32, 32, 32}; 
	size_t workerQueueSize; 
    int numberOfWorkGroups = 1; 
    int numActive = wrld->numStrands;
    int workGroupSize = 0;   
    int * workQueue; 
    
 
 if (wrld->nDims == 1) {
        globalWorkSize[0] = wrld->size[0] + (localWorkSize[0] - (wrld->size[0] & (localWorkSize[0] - 1))); 
        numberOfWorkGroups = (globalWorkSize[0] / localWorkSize[0]); 
        workGroupSize = localWorkSize[0]; 
    }
    else if (wrld->nDims == 2) {
        globalWorkSize[0] = wrld->size[0] + (localWorkSize[0] - (wrld->size[0] & (localWorkSize[0] - 1))); 
        globalWorkSize[1] = wrld->size[1] + (localWorkSize[1] - (wrld->size[1] & (localWorkSize[1] - 1))); 
        numberOfWorkGroups = (globalWorkSize[0] / localWorkSize[0])  *  (globalWorkSize[1] / localWorkSize[1]); 
        workGroupSize = localWorkSize[0] * localWorkSize[1];      
    }
    else if (wrld->nDims == 3) { 
        localWorkSize[0] = 8; 
        localWorkSize[1] = 8; 
        localWorkSize[2] = 8; 
        globalWorkSize[0] = wrld->size[0] + (localWorkSize[0] - (wrld->size[0] & (localWorkSize[0] - 1))); 
        globalWorkSize[1] = wrld->size[1] + (localWorkSize[1] - (wrld->size[1] & (localWorkSize[1] - 1))); 
        globalWorkSize[2] = wrld->size[2] + (localWorkSize[2] - (wrld->size[2] & (localWorkSize[2] - 1))); 
        numberOfWorkGroups = (globalWorkSize[0] / localWorkSize[0])  *  (globalWorkSize[1] / localWorkSize[1]) * (globalWorkSize[2] / localWorkSize[2]); 
        workGroupSize = localWorkSize[0] * localWorkSize[1] * localWorkSize[2]; 
    }
    else {
        assert (0);  
    }
 

    updateArgs.numActiveMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
     sizeof(int), &numActive, &sts);
     checkErrorCode(sts,"error creating OpenCL number of avaliable  buffer\n"); 

    workerQueueSize = sizeof(int) * numberOfWorkGroups; 

    if((workQueue = (int *)malloc(workerQueueSize)) == NULL) 
    {
        fprintf(stderr, "error in allocationg workqueue"); 
        exit(1); 
    } 
    
    updateArgs.workerQueueMem = clCreateBuffer (wrld->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                     workerQueueSize, wrld->status, &sts);
    checkErrorCode(sts,"error creating OpenCL worker queue buffer\n"); 
    globalArgs.workerQueueMem = updateArgs.workerQueueMem;
    
    updateArgs.numStrands = wrld->numStrands; 
    globalArgs.numStrands = wrld->numStrands; 
    updateArgs.limit = 2; 
    globalArgs.limit = updateArgs.limit; 
    globalArgs.numberOfWorkGroups = numberOfWorkGroups; 
    globalArgs.workGroupSize = workGroupSize; 

    //Setup the Update Kernel's arguments 
    setUpdateKernelArgs(wrld->kernel[1], &argCount, &updateArgs); 
    Diderot_LoadGlobals(wrld->context, wrld->kernel[1], wrld->cmdQ, argCount); 

    clFinish(wrld->cmdQ);  

    //Setup the Global Kernel's arguments  
    argCount = 0;
    setupGlobalKernelArgs(wrld->kernel[0],argCount, &globalArgs); 
    
    double t0 = GetTime();

     clFinish(wrld->cmdQ);  


    while(numActive > 0) 
    { 
        sts = clEnqueueTask(wrld->cmdQ, wrld->kernel[0], 0, NULL, NULL);
        checkErrorCode (sts, "error in executing global kernel code\n");

        clFinish(wrld->cmdQ);  

        sts = clEnqueueNDRangeKernel(wrld->cmdQ, wrld->kernel[1], wrld->nDims, NULL, globalWorkSize,
              localWorkSize, 0, NULL, NULL);

        checkErrorCode(sts, "error in executing update kernel\n");
        clFinish(wrld->cmdQ);    

       sts = clEnqueueReadBuffer(wrld->cmdQ, updateArgs.numActiveMem, CL_TRUE, 0, sizeof(int),
           &numActive, 0, NULL, NULL);

        checkErrorCode(sts, "error in reading back number of active strands: %d");

        clFinish(wrld->cmdQ);  

    }
  
    sts = clEnqueueReadBuffer(wrld->cmdQ, updateArgs.stateOutMem, CL_TRUE, 0,shadowSize,
            wrld->outState, 0, NULL, NULL);
    checkErrorCode(sts, "error in reading back output\n");


   sts = clEnqueueReadBuffer(wrld->cmdQ, updateArgs.statusMem, CL_TRUE, 0, sizeof(int) * wrld->numStrands,
                             wrld->status, 0, NULL, NULL);
   checkErrorCode(sts, "error in reading back status\n");

   clFinish (wrld->cmdQ); 
    clReleaseKernel(wrld->kernel[0]);
    clReleaseKernel(wrld->kernel[1]);
    clReleaseCommandQueue(wrld->cmdQ);
    clReleaseContext(wrld->context);

    double totalTime = GetTime() - t0;

    if (VerboseFlg)
        fprintf (stderr, "done in %f seconds\n", totalTime);
    else if (TimingFlg)
        printf ("usr=%f\n", totalTime);

  // here we have the final state of all of the strands in the "in" buffer
    int outFileNameLen = strlen(wrld->name) + 5;
    char *outFileName = (char *)malloc(outFileNameLen);
    snprintf (outFileName, outFileNameLen, "%s.txt", wrld->name);
    FILE *outS = fopen(outFileName, "w");
    if (outS == NULL) {
        fprintf (stderr, "Cannot open output file %s\n", outFileName);
        exit (8);
    }

    strandPtr = (uint8_t *)wrld->outState;
    for (int i = 0;  i < wrld->numStrands;  i++, strandPtr += Diderot_Strands[0]->shadowStrandSzb) {
        if (wrld->status[i] == DIDEROT_STABLE)
            Diderot_Strands[0]->print (outS, strandPtr);
    }

    fclose (outS);

    Diderot_Shutdown (wrld); 

    return 0;

}
static void setUpdateKernelArgs( cl_kernel kernel, int  * argCount, UpdateKernel_Args * args)
{ 
    cl_int sts = CL_SUCCESS;  

    sts = clSetKernelArg (kernel, *(argCount), sizeof(cl_mem), &args->stateOutMem);
    checkErrorCode(sts,"error Setting OpenCL strand out-state argument\n"); 
    *argCount+=1; 

    sts = clSetKernelArg (kernel, *(argCount), sizeof(cl_mem), &args->statusMem);
    checkErrorCode(sts,"error Setting OpenCL world status argument\n"); 
    *argCount+=1; 

    sts = clSetKernelArg (kernel,  *(argCount), sizeof(cl_mem), &args->workerQueueMem);
    checkErrorCode(sts,"error Setting OpenCL worker queue buffer argument\n"); 
    *argCount+=1; 

    sts = clSetKernelArg (kernel,  *(argCount), sizeof(cl_mem), &args->numActiveMem);
    checkErrorCode(sts,"error Setting OpenCL number of avaliable argument\n"); 
    *argCount+=1; 

    sts = clSetKernelArg(kernel, *(argCount), sizeof(cl_int), &args->numStrands);
    checkErrorCode(sts,"error Setting OpenCL width argument\n"); 
    *argCount+=1; 

    sts = clSetKernelArg(kernel, *(argCount), sizeof(cl_int), &args->limit);
    checkErrorCode(sts,"error Setting OpenCL limit\n"); 
    *argCount+=1; 
} 
static void setupGlobalKernelArgs(cl_kernel kernel, int argCount, GlobalKernel_Args * args) 
{
    cl_int sts = CL_SUCCESS;  

    sts = clSetKernelArg(kernel, argCount++, sizeof(cl_int), &args->numberOfWorkGroups);
    checkErrorCode(sts, "error Setting OpenCL number of work groups\n");
 
    sts = clSetKernelArg(kernel, argCount++,  sizeof(cl_int), &args->numStrands);
    checkErrorCode(sts,"error Setting OpenCL number strands\n");

    sts = clSetKernelArg (kernel,  argCount++, sizeof(cl_mem), &args->workerQueueMem);
    checkErrorCode(sts, "error Setting OpenCL worker queue buffer argument\n");

    sts = clSetKernelArg(kernel, argCount++, sizeof(cl_int), &args->workGroupSize);
    checkErrorCode(sts,"error Setting OpenCL work group size\n");

    sts = clSetKernelArg(kernel, argCount++, sizeof(cl_int), &args->limit);
    checkErrorCode(sts, "error Setting OpenCL limit\n");
} 
static void checkErrorCode (cl_int sts, const char * msg) 
{ 
    if (sts != CL_SUCCESS) {
        fprintf (stderr, msg);
        exit(1); 
    } 
}   
/*! \brief load OpenCL code from a file
 */
static char *LoadSource (const char *filename) 
{  
    struct stat statbuf;
    if (stat(filename, &statbuf) < 0) {
        fprintf (stderr, "unable to stat OpenCL source file %s\n", filename);
        exit (1);
    }

    char *source = (char *) malloc(statbuf.st_size + 1);
    if (source == 0) {
        fprintf (stderr, "unable to allocate memory for OpenCL source\n");
        exit (1);
    }

    FILE *fh = fopen(filename, "r");
    if ((fh == 0)
    ||  (fread(source, statbuf.st_size, 1, fh) != 1)) {
        fprintf (stderr, "unable to read OpenCL source from %s\n", filename);
        exit (1);
    }
    source[statbuf.st_size] = '\0';
    fclose (fh);
    
    return source; 
}

static void LogMessagesToStderr (const char *errstr, const void *private_info, size_t cb, void *user_data)
{
    fprintf(stderr, "%s\n", errstr); 
}

/*! \brief initialize the OpenCL execution context, including loading and compiling the OpenCL
 *         program.
 *  \param clInfo points to the summary information about the available OpenCL devices.
 *  \param wrld the Diderot execution information.
 *  \return true if initialization is successful, otherwise false.
 */
static bool InitCL (CLInfo_t *clInfo, Diderot_World_t *wrld)
{
    cl_int              sts;

  // find a GPU on platform[0]
    DeviceInfo_t *dev = 0;
    int i;
    for (i = 0;  i < clInfo->platforms[0].numDevices;  i++) {
        if (isGPUDevice (&(clInfo->platforms[0].devices[i]))
	&&  clInfo->platforms[0].devices[i].isAvail)
	{
            dev = &(clInfo->platforms[0].devices[i]);
            break;
        }
    }
    if (dev == 0) {
        fprintf (stderr, "unable to find GPU device\n");
        return false;
    }

  // create the context
    cl_context cxt = clCreateContext(0, 1, &(dev->id), LogMessagesToStderr, 0, &sts);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error creating OpenCL context\n");
        return false;
    }

  // create the program from the source
    int fnameLen = strlen(wrld->name) + 4;  // name + ".cl\0"
    char *fname = (char *)malloc(fnameLen);
    snprintf(fname, fnameLen, "%s.cl", wrld->name);
    char *updateSource = LoadSource (fname);
    char *src = (char *)malloc(strlen(updateSource) + strlen(globalSource) + 1); 
    strncat(src,updateSource,strlen(updateSource)); 
    strncat(src,globalSource,strlen(globalSource)); 

    free (fname);
    cl_program prog = clCreateProgramWithSource(cxt, 1, &src, NULL, &sts);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error creating program\n");
        return false;
    }
    free ((void *)src);

  // build the program
    char options[1024];
    snprintf (options, sizeof(options),
        "-D DIDEROT_CL_VERSION=%d -I %s -w",
        100*dev->majorVersion + dev->minorVersion,
        DIDEROT_INCLUDE_PATH);
    sts = clBuildProgram (prog, 1, &(dev->id), options, 0, 0);
    if (sts != CL_SUCCESS) {
        size_t logSzb;
        clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, 0, 0, &logSzb);
        char *log = malloc(logSzb+1);
        clGetProgramBuildInfo (prog, dev->id, CL_PROGRAM_BUILD_LOG, logSzb, log, &logSzb);
        log[logSzb] = '\0';
        fprintf (stderr, "error compiling program:\n%s\n", log);
        free (log);
        return false;
    }

  // extract the kernel from the program
    cl_kernel kernel = clCreateKernel(prog, "Dideort_GlobalKernel", &sts);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error getting kernel from program\n");
        return false;
    }

    wrld->kernel[0] = kernel;

    kernel = clCreateKernel(prog, "Diderot_UpdateKernel", &sts);

    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error getting kernel from program\n");
        return false;
    }

    wrld->kernel[1] = kernel;

  // create the command queue
    cl_command_queue q = clCreateCommandQueue(cxt, dev->id, 0, &sts);
    if (sts != CL_SUCCESS) {
        fprintf (stderr, "error creating OpenCL command queue\n");
        return false;
    }

  // initialize world info
    wrld->device = dev->id;
    wrld->context = cxt;
    wrld->cmdQ = q;
   

    return true;
}

// this should be the part of the scheduler
void *Diderot_AllocStrand (Strand_t *strand)
{
    return malloc(strand->stateSzb);
}

// block allocation of an initial collection of strands
Diderot_World_t *Diderot_AllocInitially(
    const char *name,           // the name of the program
    Strand_t *strand,           // the type of strands being allocated
    bool isArray,               // is the initialization an array or collection?
    uint32_t nDims,             // depth of iteration nesting
    int32_t *base,              // nDims array of base indices
    uint32_t *size)             // nDims array of iteration sizes
{
    Diderot_World_t *wrld = (Diderot_World_t *) malloc (sizeof(Diderot_World_t));
    if (wrld == 0) {
        fprintf (stderr, "unable to allocate world\n");
        exit (1);
    }

    wrld->name = name; /* NOTE: we are assuming that name is statically allocated! */
    wrld->isArray = isArray;
    wrld->nDims = nDims;
    wrld->base = (int32_t *) malloc (nDims * sizeof(int32_t));
    wrld->size = (uint32_t *) malloc (nDims * sizeof(uint32_t));
    size_t numStrands = 1;
    for (int i = 0;  i < wrld->nDims;  i++) {
        numStrands *= size[i];
        wrld->base[i] = base[i];
        wrld->size[i] = size[i];
    }

    if (VerboseFlg) {
        printf("AllocInitially: %d", size[0]);
        for (int i = 1;  i < nDims;  i++) printf(" x %d", size[i]);
        printf("\n");
    }

  // allocate the strand state pointers
    wrld->numStrands = numStrands;
    wrld->strandSize = strand->stateSzb * numStrands; 
  /*  wrld->inState = (void **) malloc (numStrands * sizeof(void *));
    wrld->outState = (void **) malloc (numStrands * sizeof(void *)); */
    wrld->inState =  malloc (wrld->strandSize); 
    wrld->outState = malloc (strand->shadowStrandSzb * numStrands); 
    wrld->status = (int *) malloc (numStrands * sizeof(int));
    if ((wrld->inState == 0) || (wrld->outState == 0) || (wrld->status == 0)) {
        fprintf (stderr, "unable to allocate strand states\n");
        exit (1);
    }

  // initialize strand state pointers etc.
    for (size_t i = 0;  i < numStrands;  i++) {
       // wrld->inState[i] = Diderot_AllocStrand (strand);
       // wrld->outState[i] = Diderot_AllocStrand (strand);
        wrld->status[i] = DIDEROT_ACTIVE;
    }

    return wrld;

}

// get strand state pointers
void *Diderot_InState (Diderot_World_t *wrld, uint32_t i)
{
    assert (i < wrld->numStrands);
    return wrld->inState + i;
}

void *Diderot_OutState (Diderot_World_t *wrld, uint32_t i)
{
    assert (i < wrld->numStrands);
    return &wrld->outState[i];
}

bool Diderot_IsActive (Diderot_World_t *wrld, uint32_t i)
{
    assert (i < wrld->numStrands);
    return !wrld->status[i];
}

/***** Support for shadow image values *****/

void ShadowImage1D (cl_context cxt, Shadow_image1D_t *dst, Diderot_image1D_t *src)
{
    dst->size[0] = src->size[0];
    dst->s = src->s;
    dst->t = dst->t;
}

void ShadowImage2D (cl_context cxt, Shadow_image2D_t *dst, Diderot_image2D_t *src)
{
   dst->size[0] = src->size[0];
    dst->size[1] = src->size[1];
    ShadowMat2x2 (dst->w2i, src->w2i);
    ShadowVec2 (&dst->tVec, src->tVec);
    ShadowMat2x2 (dst->w2iT, src->w2iT);
}

void ShadowImage3D (cl_context cxt, Shadow_image3D_t *dst, Diderot_image3D_t *src)
{
    dst->size[0] = src->size[0];
    dst->size[1] = src->size[1];
    dst->size[2] = src->size[2];
    ShadowMat3x3 (dst->w2i, src->w2i);
    ShadowVec3 (&dst->tVec, src->tVec);
    ShadowMat3x3 (dst->w2iT, src->w2iT);  
}

root@smlnj-gforge.cs.uchicago.edu
ViewVC Help
Powered by ViewVC 1.0.0