void CopyArrayFromDevice(cl_command_queue cqCommandQueue, float *host, cl_mem device, cl_mem pboCL, int numBodies, bool bDouble)
    {   
        cl_int ciErrNum;
        unsigned int size;

        if (pboCL) 
        {
            ciErrNum = clEnqueueAcquireGLObjects(cqCommandQueue, 1, &pboCL, 0, NULL, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
        }

		if (bDouble)
		{
			size = numBodies * 4 * sizeof(double);
			double *dHost = (double *)malloc(size);
			ciErrNum = clEnqueueReadBuffer(cqCommandQueue, device, CL_TRUE, 0, size, dHost, 0, NULL, NULL);
			for (int i = 0; i < numBodies * 4; i++)
			{
				host[i] = (float)(dHost[i]);
			}
			free(dHost);
		}
		else
		{
        	size = numBodies * 4 * sizeof(float);
        	ciErrNum = clEnqueueReadBuffer(cqCommandQueue, device, CL_TRUE, 0, size, host, 0, NULL, NULL);
        }
		oclCheckError(ciErrNum, CL_SUCCESS);
		
        if (pboCL) 
        {
            ciErrNum = clEnqueueReleaseGLObjects(cqCommandQueue, 1, &pboCL, 0, NULL, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
        }
    }
extern "C" size_t scanExclusiveShort(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
) {
    //Check power-of-two factorization
    uint log2L;
    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
    oclCheckError( factorizationRemainder == 1, shrTRUE);

    //Check supported size range
    oclCheckError( (arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE), shrTRUE );

    //Check total batch size limit
    oclCheckError( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS, shrTRUE );

    //Check all work-groups to be fully packed with data
    oclCheckError( (batchSize * arrayLength) % (4 * WORKGROUP_SIZE) == 0, shrTRUE);

    return scanExclusiveLocal1(
               cqCommandQueue,
               d_Dst,
               d_Src,
               batchSize,
               arrayLength
           );
}
extern "C" void convolutionColumns(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    cl_mem c_Kernel,
    cl_uint imageW,
    cl_uint imageH
){
    cl_int ciErrNum;
    size_t localWorkSize[2], globalWorkSize[2];

    oclCheckError( COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS, shrTRUE );
    oclCheckError( imageW % COLUMNS_BLOCKDIM_X == 0, shrTRUE );
    oclCheckError( imageH % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) == 0, shrTRUE );

    if(!cqCommandQueue)
        cqCommandQueue = cqDefaultCommandQueue;

    ciErrNum  = clSetKernelArg(ckConvolutionColumns, 0, sizeof(cl_mem),       (void*)&d_Dst);
    ciErrNum |= clSetKernelArg(ckConvolutionColumns, 1, sizeof(cl_mem),       (void*)&d_Src);
    ciErrNum |= clSetKernelArg(ckConvolutionColumns, 2, sizeof(cl_mem),       (void*)&c_Kernel);
    ciErrNum |= clSetKernelArg(ckConvolutionColumns, 3, sizeof(unsigned int), (void*)&imageW);
    ciErrNum |= clSetKernelArg(ckConvolutionColumns, 4, sizeof(unsigned int), (void*)&imageH);
    ciErrNum |= clSetKernelArg(ckConvolutionColumns, 5, sizeof(unsigned int), (void*)&imageW);
    oclCheckError(ciErrNum, CL_SUCCESS);

    localWorkSize[0] = COLUMNS_BLOCKDIM_X;
    localWorkSize[1] = COLUMNS_BLOCKDIM_Y;
    globalWorkSize[0] = imageW;
    globalWorkSize[1] = imageH / COLUMNS_RESULT_STEPS;

    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckConvolutionColumns, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
// Large scan launcher
////////////////////////////////////////////////////////////////////////////////
static void scanExclusiveLocal2(
    cl_command_queue cqCommandQueue,
    cl_mem d_Buffer,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint n,
    uint size
) {
    cl_int ciErrNum;
    size_t localWorkSize, globalWorkSize;

    uint elements = n * size;
    ciErrNum  = clSetKernelArg(ckScanExclusiveLocal2, 0, sizeof(cl_mem), (void *)&d_Buffer);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 1, sizeof(cl_mem), (void *)&d_Dst);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 2, sizeof(cl_mem), (void *)&d_Src);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 3, 2 * WORKGROUP_SIZE * sizeof(uint), NULL);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 4, sizeof(uint), (void *)&elements);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 5, sizeof(uint), (void *)&size);
    oclCheckError(ciErrNum, CL_SUCCESS);

    localWorkSize = WORKGROUP_SIZE;
    globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);

    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal2, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);
}
    void integrateSystem(cl_command_queue cqCommandQueue,
    		cl_kernel k,
    		cl_mem newPositions,
    		cl_mem newVelocities,
    		cl_mem newEdges,
    		cl_mem oldPositions,
    		cl_mem oldVelocities,
    		cl_mem oldEdges,
    		cl_mem oldForces,
    		float deltaTime, float damping,
    		int numBodies, int p, int q,
    		bool bDouble)
    {
    	int sharedMemSize;

    	//for double precision
    	if (bDouble)
    	{
    		sharedMemSize = p * q * sizeof(cl_double4); // 4 doubles for pos
    	}
    	else
    	{
    		sharedMemSize = p * q * sizeof(cl_float4); // 4 floats for pos
    	}

    	size_t global_work_size[2];
    	size_t local_work_size[2];
    	cl_int ciErrNum = CL_SUCCESS;
    	cl_kernel kernel;

    	kernel = k;

    	ciErrNum |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&newPositions);
    	ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&newVelocities);
    	ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&newEdges);
    	ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&oldPositions);
    	ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&oldVelocities);
    	ciErrNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&oldEdges);
    	ciErrNum |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&oldForces);
    	ciErrNum |= clSetKernelArg(kernel, 7, sizeof(cl_float), (void *)&deltaTime);
    	ciErrNum |= clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&damping);

    	oclCheckError(ciErrNum, CL_SUCCESS);

    	// set work-item dimensions
    	local_work_size[0] = p;
    	local_work_size[1] = q;
    	global_work_size[0]= numBodies;
    	global_work_size[1]= q;

    	// execute the kernel:
    	ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);

    	oclCheckError(ciErrNum, CL_SUCCESS);
    }
///////////////////////////////////////////////////////////////////////////////
//  test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
double testDeviceToDeviceTransfer(unsigned int memSize)
{
    double elapsedTimeInSec = 0.0;
    double bandwidthInMBs = 0.0;
    unsigned char* h_idata = NULL;
    cl_int ciErrNum = CL_SUCCESS;
    
    //allocate host memory
    h_idata = (unsigned char *)malloc( memSize );
        
    //initialize the memory
    for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
    {
        h_idata[i] = (unsigned char) (i & 0xff);
    }

    // allocate device input and output memory and initialize the device input memory
    cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, memSize, NULL, &ciErrNum);         
    oclCheckError(ciErrNum, CL_SUCCESS);
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, d_idata, CL_TRUE, 0, memSize, h_idata, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Sync queue to host, start timer 0, and copy data from one GPU buffer to another GPU bufffer
    clFinish(cqCommandQueue);
    shrDeltaT(0);
    for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
    {
        ciErrNum = clEnqueueCopyBuffer(cqCommandQueue, d_idata, d_odata, 0, 0, memSize, 0, NULL, NULL);                
        oclCheckError(ciErrNum, CL_SUCCESS);
    }    

    // Sync with GPU
    clFinish(cqCommandQueue);
    
    //get the the elapsed time in seconds
    elapsedTimeInSec = shrDeltaT(0);
    
    // Calculate bandwidth in MB/s 
    //      This is for kernels that read and write GMEM simultaneously 
    //      Obtained Throughput for unidirectional block copies will be 1/2 of this #
    bandwidthInMBs = 2.0 * ((double)memSize * (double)MEMCOPY_ITERATIONS)/(elapsedTimeInSec * (double)(1 << 20));

    //clean up memory on host and device
    free(h_idata);
    clReleaseMemObject(d_idata);
    clReleaseMemObject(d_odata);

    return bandwidthInMBs;
}
void BodySystemCPU::update(float deltaTime)
{
    oclCheckError(m_bInitialized, shrTRUE);

    _integrateNBodySystem(deltaTime);
    std::swap(m_currentRead, m_currentWrite);
}
void BodySystemCPU::_initialize(int numBodies, int numEdges)
{
    oclCheckError(m_bInitialized, shrFALSE);

    m_numBodies = numBodies;
    m_numEdges = numEdges;

    m_pos[0] = new float[m_numBodies*4];
    m_pos[1] = new float[m_numBodies*4];
    m_vel[0] = new float[m_numBodies*4];
    m_vel[1] = new float[m_numBodies*4];
    m_edge[0] = new float[m_numBodies*4];
    m_edge[1] = new float[m_numBodies*4];
    m_force  = new float[m_numBodies*4];

    memset(m_pos[0], 0, m_numBodies*4*sizeof(float));
    memset(m_pos[1], 0, m_numBodies*4*sizeof(float));
    memset(m_vel[0], 0, m_numBodies*4*sizeof(float));
    memset(m_vel[1], 0, m_numBodies*4*sizeof(float));
    memset(m_edge[0], 0, m_numEdges*4*sizeof(float));
    memset(m_edge[1], 0, m_numEdges*4*sizeof(float));
    memset(m_force, 0, m_numBodies*4*sizeof(float));

    m_bInitialized = true;
}
    void CopyArrayToDevice(int __size, cl_command_queue cqCommandQueue, cl_mem device, const float* host, int numBodies, bool bDouble)
    {
        cl_int ciErrNum;
        unsigned int size;
		if (bDouble)
		{
			size = numBodies * 4 * sizeof(double);

			double *cdHost = (double *)malloc(size);

			for (int i = 0; i < numBodies * 4; i++)
			{
				cdHost[i] = (double)host[i];
			}

			ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, device, CL_TRUE, 0, size, cdHost, 0, NULL, NULL);
			free(cdHost);
		}
		else
		{
			size = numBodies*4*sizeof(float);

			ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, device, CL_TRUE, 0, size, host, 0, NULL, NULL);
		}
		oclCheckError(ciErrNum, CL_SUCCESS);
    }
    void computeSpringsForces(cl_command_queue cqCommandQueue,
    		cl_kernel k,
    		cl_mem newForces,
    		cl_mem newEdges,
    		cl_mem oldPositions,
    		cl_mem oldEdges,
    		int numEdges, int p, int q,
    		bool bDouble)
    {
    	int sharedMemSize;

    	//for double precision
    	if (bDouble)
    	{
    		sharedMemSize = p * q * sizeof(cl_double4); // 4 doubles for pos
    	}
    	else
    	{
    		sharedMemSize = p * q * sizeof(cl_float4); // 4 floats for pos
    	}

    	size_t global_work_size[2];
    	size_t local_work_size[2];
    	cl_int ciErrNum = CL_SUCCESS;
    	cl_kernel kernel;

    	kernel = k;

    	ciErrNum |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&newForces);
    	ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&newEdges);
    	ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&oldPositions);
    	ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&oldEdges);
    	ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&numEdges);

    	oclCheckError(ciErrNum, CL_SUCCESS);

    	// set work-item dimensions
    	local_work_size[0] = numEdges;   // todo IMPORTANT!! this is XXX there was = p;
    	local_work_size[1] = q;
    	global_work_size[0]= numEdges;
    	global_work_size[1]= q;

    	// execute the kernel:
    	ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);

    	oclCheckError(ciErrNum, CL_SUCCESS);
    }
//////////////////////////////////////////////////////////////////////////////
//! Initializes the global context and command queue
//////////////////////////////////////////////////////////////////////////////
void oclInit( ) {
  cl_platform_id cpPlatform;
  cl_device_id cdDevice;
  cl_int ciErrNum;

  ciErrNum = oclGetPlatformID(&cpPlatform);
  oclCheckError(ciErrNum, CL_SUCCESS);

  ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
  oclCheckError(ciErrNum, CL_SUCCESS);

  g_clDeviceContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
  oclCheckError(ciErrNum, CL_SUCCESS);

  g_clDeviceQueue = clCreateCommandQueue(g_clDeviceContext, cdDevice, 0, &ciErrNum);
  oclCheckError(ciErrNum, CL_SUCCESS);
}
extern "C" void closeScan(void) {
    cl_int ciErrNum;
    ciErrNum  = clReleaseMemObject(d_Buffer);
    ciErrNum |= clReleaseKernel(ckUniformUpdate);
    ciErrNum |= clReleaseKernel(ckScanExclusiveLocal2);
    ciErrNum |= clReleaseKernel(ckScanExclusiveLocal1);
    ciErrNum |= clReleaseProgram(cpProgram);
    oclCheckError(ciErrNum, CL_SUCCESS);
}
extern "C" void initScan(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv) {
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog(" ...loading Scan.cl\n");
    char *cScan = oclLoadProgSource(shrFindFilePath("Scan.cl", argv[0]), "// My comment\n", &kernelLength);
    oclCheckError(cScan != NULL, shrTRUE);

    shrLog(" ...creating scan program\n");
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cScan, &kernelLength, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog(" ...building scan program\n");
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, compileOptions, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclScan.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    shrLog(" ...creating scan kernels\n");
    ckScanExclusiveLocal1 = clCreateKernel(cpProgram, "scanExclusiveLocal1", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    ckScanExclusiveLocal2 = clCreateKernel(cpProgram, "scanExclusiveLocal2", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    ckUniformUpdate = clCreateKernel(cpProgram, "uniformUpdate", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog( " ...checking minimum supported workgroup size\n");
    //Check for work group size
    cl_device_id device;
    size_t szScanExclusiveLocal1, szScanExclusiveLocal2, szUniformUpdate;

    ciErrNum  = clGetCommandQueueInfo(cqParamCommandQue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL);
    ciErrNum |= clGetKernelWorkGroupInfo(ckScanExclusiveLocal1,  device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szScanExclusiveLocal1, NULL);
    ciErrNum |= clGetKernelWorkGroupInfo(ckScanExclusiveLocal2, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szScanExclusiveLocal2, NULL);
    ciErrNum |= clGetKernelWorkGroupInfo(ckUniformUpdate, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szUniformUpdate, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    if( (szScanExclusiveLocal1 < WORKGROUP_SIZE) || (szScanExclusiveLocal2 < WORKGROUP_SIZE) || (szUniformUpdate < WORKGROUP_SIZE) ) {
        shrLog("ERROR: Minimum work-group size %u required by this application is not supported on this device.\n", WORKGROUP_SIZE);
        exit(0);
    }

    shrLog(" ...allocating internal buffers\n");
    d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, (MAX_BATCH_ELEMENTS / (4 * WORKGROUP_SIZE)) * sizeof(uint), NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Discard temp storage
    free(cScan);
}
extern "C" size_t scanExclusiveLarge(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
) {
    //Check power-of-two factorization
    uint log2L;
    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
    oclCheckError( factorizationRemainder == 1, shrTRUE);

    //Check supported size range
    oclCheckError( (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE), shrTRUE );

    //Check total batch size limit
    oclCheckError( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS, shrTRUE );

    scanExclusiveLocal1(
        cqCommandQueue,
        d_Dst,
        d_Src,
        (batchSize * arrayLength) / (4 * WORKGROUP_SIZE),
        4 * WORKGROUP_SIZE
    );

    scanExclusiveLocal2(
        cqCommandQueue,
        d_Buffer,
        d_Dst,
        d_Src,
        batchSize,
        arrayLength / (4 * WORKGROUP_SIZE)
    );

    return uniformUpdate(
               cqCommandQueue,
               d_Dst,
               d_Buffer,
               (batchSize * arrayLength) / (4 * WORKGROUP_SIZE)
           );
}
static size_t uniformUpdate(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Buffer,
    uint n
) {
    cl_int ciErrNum;
    size_t localWorkSize, globalWorkSize;

    ciErrNum  = clSetKernelArg(ckUniformUpdate, 0, sizeof(cl_mem), (void *)&d_Dst);
    ciErrNum |= clSetKernelArg(ckUniformUpdate, 1, sizeof(cl_mem), (void *)&d_Buffer);
    oclCheckError(ciErrNum, CL_SUCCESS);

    localWorkSize = WORKGROUP_SIZE;
    globalWorkSize = n * WORKGROUP_SIZE;

    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckUniformUpdate, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    return localWorkSize;
}
void BodySystemCPU::_finalize()
{
    oclCheckError(m_bInitialized, shrTRUE);

    delete [] m_pos[0];
    delete [] m_pos[1];
    delete [] m_vel[0];
    delete [] m_vel[1];
    delete [] m_edge[0];
    delete [] m_edge[1];
    delete [] m_force;
}
extern "C" void initConvolutionSeparable(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog("Loading ConvolutionSeparable.cl...\n");
        char *cPathAndName = shrFindFilePath("ConvolutionSeparable.cl", argv[0]);
        oclCheckError(cPathAndName != NULL, shrTRUE);
        char *cConvolutionSeparable = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
        oclCheckError(cConvolutionSeparable != NULL, shrTRUE);

    shrLog("Creating convolutionSeparable program...\n");
        cpConvolutionSeparable = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cConvolutionSeparable, &kernelLength, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Building convolutionSeparable program...\n");
        char compileOptions[2048];
        #ifdef _WIN32
            sprintf_s(compileOptions, 2048, "\
                -cl-fast-relaxed-math                                  \
                -D KERNEL_RADIUS=%u\
                -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\
                -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\
                -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\
                -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\
                ",
                KERNEL_RADIUS,
                ROWS_BLOCKDIM_X,   COLUMNS_BLOCKDIM_X,
                ROWS_BLOCKDIM_Y,   COLUMNS_BLOCKDIM_Y,
                ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS,
                ROWS_HALO_STEPS,   COLUMNS_HALO_STEPS
            );
        #else
            sprintf(compileOptions, "\
                -cl-fast-relaxed-math                                  \
                -D KERNEL_RADIUS=%u\
                -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\
                -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\
                -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\
                -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\
                ",
                KERNEL_RADIUS,
                ROWS_BLOCKDIM_X,   COLUMNS_BLOCKDIM_X,
                ROWS_BLOCKDIM_Y,   COLUMNS_BLOCKDIM_Y,
                ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS,
                ROWS_HALO_STEPS,   COLUMNS_HALO_STEPS
            );
        #endif
        ciErrNum = clBuildProgram(cpConvolutionSeparable, 0, NULL, compileOptions, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ckConvolutionRows = clCreateKernel(cpConvolutionSeparable, "convolutionRows", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ckConvolutionColumns = clCreateKernel(cpConvolutionSeparable, "convolutionColumns", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cConvolutionSeparable);
}
////////////////////////////////////////////////////////////////////////////////
// Short scan launcher
////////////////////////////////////////////////////////////////////////////////
static size_t scanExclusiveLocal1(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint n,
    uint size
) {
    cl_int ciErrNum;
    size_t localWorkSize, globalWorkSize;

    ciErrNum  = clSetKernelArg(ckScanExclusiveLocal1, 0, sizeof(cl_mem), (void *)&d_Dst);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal1, 1, sizeof(cl_mem), (void *)&d_Src);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal1, 2, 2 * WORKGROUP_SIZE * sizeof(uint), NULL);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal1, 3, sizeof(uint), (void *)&size);
    oclCheckError(ciErrNum, CL_SUCCESS);

    localWorkSize = WORKGROUP_SIZE;
    globalWorkSize = (n * size) / 4;

    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    return localWorkSize;
}
void BodySystemCPU::setArray(BodyArray array, const float* data)
{
    oclCheckError(m_bInitialized, shrTRUE);
    float* target = 0;

    switch (array)
    {
    default:
    case BODYSYSTEM_POSITION:
        target = m_pos[m_currentRead];
        break;
    case BODYSYSTEM_VELOCITY:
        target = m_vel[m_currentRead];
        break;
    case BODYSYSTEM_EDGE:
    	target = m_edge[m_currentRead];
    	break;
    }

    memcpy(target, data, m_numBodies*4*sizeof(float));
}
float* BodySystemCPU::getArray(BodyArray array)
{
    oclCheckError(m_bInitialized, shrTRUE);

    float* data = 0;
    switch (array)
    {
    default:
    case BODYSYSTEM_POSITION:
        data = m_pos[m_currentRead];
        break;
    case BODYSYSTEM_VELOCITY:
        data = m_vel[m_currentRead];
        break;
    case BODYSYSTEM_EDGE:
    	data = m_edge[m_currentRead];
    	break;
    }

    return data;
}
Beispiel #21
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for 
////////////////////////////////////////////////////////////////////////////////
int runTest(int argc, const char** argv)
{
    cl_platform_id cpPlatform = NULL;
    cl_uint ciDeviceCount = 0;
    cl_device_id *cdDevices = NULL;
    cl_int ciErrNum = CL_SUCCESS;

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        // User specified GPUs
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);

        #ifdef WIN32
            deviceStr = strtok_s (deviceList," ,.-", &next_token);
        #else
            deviceStr = strtok (deviceList," ,.-");
        #endif   
        ciDeviceCount = 0;
        while(deviceStr != NULL) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
			if( device == (cl_device_id) -1  ) {
				shrLog(" Device %s does not exist!\n", deviceStr);
				return -1;
			}
			
			shrLog("Device %s: ", deviceStr);
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");
           
            // create command queue
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
                
            ++ciDeviceCount;

            #ifdef WIN32
                deviceStr = strtok_s (NULL," ,.-", &next_token);
            #else            
                deviceStr = strtok (NULL," ,.-");
            #endif
        }

        free(deviceList);
    } 
    else 
    {
        // Find out how many GPU's to compute on all available GPUs
	    size_t nDeviceBytes;
	    ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
	    ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

        if (ciErrNum != CL_SUCCESS)
        {
            shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
            return ciErrNum;
        }
        else if (ciDeviceCount == 0)
        {
            shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
            return -1;
        } 

        // create command-queues
        for(unsigned int i = 0; i < ciDeviceCount; ++i) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, i);
            shrLog("Device %d: ", i);
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");

            // create command queue
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
        }
    }

    // Optional Command-line multiplier for matrix sizes
    shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple); 
    iSizeMultiple = CLAMP(iSizeMultiple, 1, 10);
    uiWA = WA * iSizeMultiple;
    uiHA = HA * iSizeMultiple;
    uiWB = WB * iSizeMultiple;
    uiHB = HB * iSizeMultiple;
    uiWC = WC * iSizeMultiple;
    uiHC = HC * iSizeMultiple;
    shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n", 
            uiWA, uiHA, uiWB, uiHB, uiWC, uiHC);

    // allocate host memory for matrices A and B
    unsigned int size_A = uiWA * uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A_data = (float*)malloc(mem_size_A);
    unsigned int size_B = uiWB * uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B_data = (float*)malloc(mem_size_B);

    // initialize host memory
    srand(2006);
    shrFillArray(h_A_data, size_A);
    shrFillArray(h_B_data, size_B);

    // allocate host memory for result
    unsigned int size_C = uiWC * uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // create OpenCL buffer pointing to the host memory
    cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
				    mem_size_A, h_A_data, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: clCreateBuffer\n");
        return ciErrNum;
    }

    // Program Setup
    size_t program_length;
    const char* header_path = shrFindFilePath("matrixMul.h", argv[0]);
    oclCheckError(header_path != NULL, shrTRUE);
    char* header = oclLoadProgSource(header_path, "", &program_length);
    if(!header)
    {
        shrLog("Error: Failed to load the header %s!\n", header_path);
        return -1000;
    }
    const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]);
    oclCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, header, &program_length);
    if(!source)
    {
        shrLog("Error: Failed to load compute program %s!\n", source_path);
        return -2000;
    }

    // create the program
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, 
                                                    &program_length, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create program\n");
        return ciErrNum;
    }
    free(header);
    free(source);
    
    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then return error
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
        return ciErrNum;
    }

    // write out PTX if requested on the command line
    if(shrCheckCmdLineFlag(argc, argv, "dump-ptx") )
    {
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
    }

    // Create Kernel
    for(unsigned int i = 0; i < ciDeviceCount; ++i) {
        multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum);
        if (ciErrNum != CL_SUCCESS)
        {
            shrLog("Error: Failed to create kernel\n");
            return ciErrNum;
        }
    }
        
    // Run multiplication on 1..deviceCount GPUs to compare improvement
    shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount);
    for(unsigned int k = 1; k <= ciDeviceCount; ++k) 
    {
        matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C);
    }

    // compute reference solution
    shrLog("Comparing results with CPU computation... \n\n");
    float* reference = (float*) malloc(mem_size_C);
    computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB);

    // check result
    shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f);
    if (res != shrTRUE) 
    {
        printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f);
    }

    // clean up OCL resources
    ciErrNum = clReleaseMemObject(h_A);
    for(unsigned int k = 0; k < ciDeviceCount; ++k) 
    {
        ciErrNum |= clReleaseKernel( multiplicationKernel[k] );
        ciErrNum |= clReleaseCommandQueue( commandQueue[k] );
    }
    ciErrNum |= clReleaseProgram(cpProgram);
    ciErrNum |= clReleaseContext(cxGPUContext);
    if(ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum);
        return ciErrNum;
    }

    // clean up memory
    free(h_A_data);
    free(h_B_data);
    free(h_C);
    free(reference);
    
    return ((shrTRUE == res) ? CL_SUCCESS : -3000);
}
Beispiel #22
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, const char** argv) {

  const char *my_name = "[oclAvcDisc]";

  int bPassed = 1;
  char filename[500], cBuffer[1024], sProfileString[2048];
  FILE *log_ofs=NULL;
  time_t g_the_time;

  /* OpenCL variables */
  cl_int ciErrNum;
  cl_platform_id clSelectedPlatformID = NULL; 
  cl_uint ciDeviceCount;
  cl_device_id *devices;

  sprintf(filename, "oclAvcDisc.txt");
  if( (log_ofs=fopen(filename, "a"))== NULL )  {
    fprintf(stderr, "[oclAvcDisc] Error, could not open file %s\n", filename);
    exit(1);
  }

  g_the_time = time(NULL);

  _write_log(log_ofs, "%s oclDeviceQuery.exe Starting...\n", my_name); 

  /* Get OpenCL platform ID for NVIDIA if avaiable, otherwise default */
  _write_log(log_ofs, "%s OpenCL SW Info:\n", my_name);
  ciErrNum = oclGetPlatformID (&clSelectedPlatformID);
  oclCheckError(ciErrNum, CL_SUCCESS);

  /* Get OpenCL platform name and version */
  ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
  if (ciErrNum == CL_SUCCESS) {
    _write_log(log_ofs, "%s CL_PLATFORM_NAME: \t%s\n", my_name, cBuffer);
  } else {
    _write_log(log_ofs, "%s Error %i in clGetPlatformInfo Call !!!\n\n", my_name, ciErrNum);
    bPassed = 0;
  }

  ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_VERSION, sizeof(cBuffer), cBuffer, NULL);
  if (ciErrNum == CL_SUCCESS) {
    _write_log(log_ofs, "%s CL_PLATFORM_VERSION: \t%s\n", my_name, cBuffer);
  } else {
    _write_log(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum);
    bPassed = 0;
  }

  // Get and log OpenCL device info 
  _write_log(log_ofs, "%s OpenCL Device Info:\n\n", my_name);
  ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, 0, NULL, &ciDeviceCount);

  // check for 0 devices found or errors... 
  if (ciDeviceCount == 0) {
    _write_log(log_ofs, "%s No devices found supporting OpenCL (return code %i)\n\n", my_name, ciErrNum);
    bPassed = false;
  } else if (ciErrNum != CL_SUCCESS) {
    _write_log(log_ofs, "%s Error %i in clGetDeviceIDs call !!!\n\n", my_name, ciErrNum);
    bPassed = false;
  } else {
    // Get and log the OpenCL device ID's
     char cTemp[2];
    _write_log(log_ofs, "%s %u devices found supporting OpenCL:\n\n", my_name , ciDeviceCount);
    sprintf(cTemp, "%u", ciDeviceCount);
    if ((devices = (cl_device_id*)malloc(sizeof(cl_device_id) * ciDeviceCount)) == NULL) {
      _write_log(log_ofs, "%s Failed to allocate memory for devices !!!\n\n", my_name);
      bPassed = false;
    }
    ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, &ciDeviceCount);
    if (ciErrNum == CL_SUCCESS) {
      //Create a context for the devices
      cl_context cxGPUContext = clCreateContext(0, ciDeviceCount, devices, NULL, NULL, &ciErrNum);
      if (ciErrNum != CL_SUCCESS) {
        _write_log(log_ofs, "%s Error %i in clCreateContext call !!!\n\n", my_name, ciErrNum);
        bPassed = false;
      } else {
        // show info for each device in the context
        for(unsigned int i = 0; i < ciDeviceCount; ++i ) {  
          _write_log(log_ofs, "%s ---------------------------------\n", my_name);
          clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
          _write_log(log_ofs, "%s Device %s\n", my_name, cBuffer);
          _write_log(log_ofs, "%s ---------------------------------\n", my_name);
          oclPrintDevInfo(LOGBOTH, devices[i]);
        }
        // Determine and show image format support 
        cl_uint uiNumSupportedFormats = 0;

        // 2D
        clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, NULL, NULL, &uiNumSupportedFormats);
        cl_image_format *ImageFormats = NULL;
        ImageFormats = (cl_image_format*)malloc(uiNumSupportedFormats*sizeof(cl_image_format));
        if(ImageFormats==NULL) {
          _write_log(log_ofs, "%s Error, could not alloc ImageFormats\n", my_name);
          exit(2);
        }

        clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, uiNumSupportedFormats, ImageFormats, NULL);
        _write_log(log_ofs, "%s  ---------------------------------\n", my_name);
        _write_log(log_ofs, "%s  2D Image Formats Supported (%u)\n", my_name, uiNumSupportedFormats); 
        _write_log(log_ofs, "%s  ---------------------------------\n", my_name);
        _write_log(log_ofs, "%s  %-6s%-16s%-22s\n\n", my_name, "#", "Channel Order", "Channel Type"); 
        for(unsigned int i = 0; i < uiNumSupportedFormats; i++) {  
           _write_log(log_ofs, "%s  %-6u%-16s%-22s\n", my_name, (i + 1), 
               oclImageFormatString(ImageFormats[i].image_channel_order), 
               oclImageFormatString(ImageFormats[i].image_channel_data_type));
        }
        _write_log(log_ofs, "%s\n", my_name); 
        free(ImageFormats); ImageFormats = NULL;

        // 3D
        clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, NULL, NULL, &uiNumSupportedFormats);
        ImageFormats = (cl_image_format*)malloc(uiNumSupportedFormats*sizeof(cl_image_format));
        if(ImageFormats==NULL) {
          _write_log(log_ofs, "%s Error, could not alloc ImageFormats\n", my_name);
          exit(3);
        }
        clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, uiNumSupportedFormats, ImageFormats, NULL);
        _write_log(log_ofs, "%s  ---------------------------------\n", my_name);
        _write_log(log_ofs, "%s  3D Image Formats Supported (%u)\n", my_name, uiNumSupportedFormats); 
        _write_log(log_ofs, "%s  ---------------------------------\n", my_name);
        _write_log(log_ofs, "%s  %-6s%-16s%-22s\n\n", my_name, "#", "Channel Order", "Channel Type"); 
        for(unsigned int i = 0; i < uiNumSupportedFormats; i++) {  
          _write_log(log_ofs, "%s  %-6u%-16s%-22s\n", my_name, (i + 1),
              oclImageFormatString(ImageFormats[i].image_channel_order), 
              oclImageFormatString(ImageFormats[i].image_channel_data_type));
        }
        write_log(log_ofs, "%s\n", my_name); 
        free(ImageFormats); ImageFormats=NULL;
      }
    } else {
      write_log(log_ofs, "%s Error %i in clGetDeviceIDs call !!!\n\n", my_name, ciErrNum);
      bPassed = 0;
    }
  }

  // finish
  _write_log(log_ofs, "%s %s\n\n", my_name, bPassed==1 ? "PASSED" : "FAILED"); 
  fflush(log_ofs);
  fclose(log_ofs);
  return(0);
}
Beispiel #23
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;
    cl_device_id cdDevice;
    cl_context cxGPUContext;                        //OpenCL context
    cl_command_queue cqCommandQueue;                //OpenCL command queue
    cl_mem c_Kernel, d_Input, d_Buffer, d_Output;   //OpenCL memory buffer objects
    cl_float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU;

    cl_int ciErrNum;

    const unsigned int imageW = 3072;
    const unsigned int imageH = 3072;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclConvolutionSeparable.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Allocating and initializing host memory...\n");
        h_Kernel    = (cl_float *)malloc(KERNEL_LENGTH * sizeof(cl_float));
        h_Input     = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
        h_Buffer    = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
        h_OutputCPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
        h_OutputGPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));

        srand(2009);
        for(unsigned int i = 0; i < KERNEL_LENGTH; i++)
            h_Kernel[i] = (cl_float)(rand() % 16);

        for(unsigned int i = 0; i < imageW * imageH; i++)
            h_Input[i] = (cl_float)(rand() % 16);

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get the devices
        ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL separable convolution...\n");
        initConvolutionSeparable(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
        c_Kernel = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, KERNEL_LENGTH * sizeof(cl_float), h_Kernel, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageW * imageH * sizeof(cl_float), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Applying separable convolution to %u x %u image...\n\n", imageW, imageH);
        //Just a single run or a warmup iteration
        convolutionRows(
            NULL,
            d_Buffer,
            d_Input,
            c_Kernel,
            imageW,
            imageH
        );

        convolutionColumns(
            NULL,
            d_Output,
            d_Buffer,
            c_Kernel,
            imageW,
            imageH
        );

#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++){
        convolutionRows(
            cqCommandQueue,
            d_Buffer,
            d_Input,
            c_Kernel,
            imageW,
            imageH
        );

        convolutionColumns(
            cqCommandQueue,
            d_Output,
            d_Buffer,
            c_Kernel,
            imageW,
            imageH
        );
    }
    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclConvolutionSeparable, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n",
            (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0);

    //Get OpenCL profiler  info
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime)/ (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageW * imageH * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n"); 
        convolutionRowHost(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS);
        convolutionColumnHost(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS);
        double sum = 0, delta = 0;
        double L2norm;
        for(unsigned int i = 0; i < imageW * imageH; i++){
            delta += (h_OutputCPU[i] - h_OutputGPU[i]) * (h_OutputCPU[i] - h_OutputGPU[i]);
            sum += h_OutputCPU[i] * h_OutputCPU[i];
        }
        L2norm = sqrt(delta / sum);
        shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    // cleanup
    closeConvolutionSeparable();
    ciErrNum  = clReleaseMemObject(d_Output);
    ciErrNum |= clReleaseMemObject(d_Buffer);
    ciErrNum |= clReleaseMemObject(d_Input);
    ciErrNum |= clReleaseMemObject(c_Kernel);
    ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
    ciErrNum |= clReleaseContext(cxGPUContext);
    oclCheckError(ciErrNum, CL_SUCCESS);

    free(h_OutputGPU);
    free(h_OutputCPU);
    free(h_Buffer);
    free(h_Input);
    free(h_Kernel);

    // finish
    shrQAFinishExit(argc, (const char **)argv, (L2norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    shrQAStart(argc, argv);
    // start logs 
    shrSetLogFileName ("oclSimpleMultiGPU.txt");
    shrLog("%s Starting, Array = %u float values...\n\n", argv[0], DATA_N); 

    // OpenCL
    cl_platform_id cpPlatform;
    cl_uint ciDeviceCount;
    cl_device_id* cdDevices;
    cl_context cxGPUContext;
    cl_device_id cdDevice;                          // GPU device
    int deviceNr[MAX_GPU_COUNT];
    cl_command_queue commandQueue[MAX_GPU_COUNT];
    cl_mem d_Data[MAX_GPU_COUNT];
    cl_mem d_Result[MAX_GPU_COUNT];
    cl_program cpProgram; 
    cl_kernel reduceKernel[MAX_GPU_COUNT];
    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];
    size_t programLength;
    cl_int ciErrNum;			               
    char cDeviceName [256];
    cl_mem h_DataBuffer;

    // Vars for reduction results
    float h_SumGPU[MAX_GPU_COUNT * ACCUM_N];   
    float *h_Data;
    double sumGPU;
    double sumCPU, dRelError;

    // allocate and init host buffer with with some random generated input data
    h_Data = (float *)malloc(DATA_N * sizeof(float));
    shrFillArray(h_Data, DATA_N);

    // start timer & logs 
    shrLog("Setting up OpenCL on the Host...\n\n"); 
    shrDeltaT(1);

    // Annotate profiling state
    #ifdef GPU_PROFILING
        shrLog("OpenCL Profiling is enabled...\n\n"); 
    #endif

     //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clGetPlatformID...\n"); 

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clGetDeviceIDs...\n"); 

    //Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateContext...\n");

    // Set up command queue(s) for GPU's specified on the command line or all GPU's
    if(shrCheckCmdLineFlag(argc, (const char **)argv, "device"))
    {
        // User specified GPUs
        int ciMaxDeviceID = ciDeviceCount-1;

        ciDeviceCount = 0;
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char **)argv, "device", &deviceList);

        #ifdef WIN32
            deviceStr = strtok_s (deviceList," ,.-", &next_token);
        #else
            deviceStr = strtok (deviceList," ,.-");
        #endif   

        // Create command queues for all Requested GPU's
        while(deviceStr != NULL) 
        {
            // get & log device index # and name
            deviceNr[ciDeviceCount] = atoi(deviceStr);
            if( deviceNr[ciDeviceCount] > ciMaxDeviceID ) {
                shrLog(" Invalid user specified device ID: %d\n", deviceNr[ciDeviceCount]);
                return 1;
            }

            cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]);
            ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog(" Device %i: %s\n\n", deviceNr[ciDeviceCount], cDeviceName);

            // create a command que
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog("clCreateCommandQueue\n"); 

            ++ciDeviceCount;

            #ifdef WIN32
                deviceStr = strtok_s (NULL," ,.-", &next_token);
            #else            
                deviceStr = strtok (NULL," ,.-");
            #endif
        }

        free(deviceList);
    } 
    else 
    {
        // Find out how many GPU's to compute on all available GPUs
        size_t nDeviceBytes;
        ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

        for(unsigned int i = 0; i < ciDeviceCount; ++i ) 
        {
            // get & log device index # and name
            deviceNr[i] = i;
            cdDevice = oclGetDev(cxGPUContext, i);
            ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog(" Device %i: %s\n", i, cDeviceName);

            // create a command que
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog("clCreateCommandQueue\n\n"); 
        }
    }

    // Load the OpenCL source code from the .cl file 
    const char* source_path = shrFindFilePath("simpleMultiGPU.cl", argv[0]);
    char *source = oclLoadProgSource(source_path, "", &programLength);
    oclCheckError(source != NULL, shrTRUE);
    shrLog("oclLoadProgSource\n"); 

    // Create the program for all GPUs in the context
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &programLength, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateProgramWithSource\n"); 
    
    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSimpleMultiGPU.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS); 
    }
    shrLog("clBuildProgram\n"); 

    // Create host buffer with page-locked memory
    h_DataBuffer = clCreateBuffer(cxGPUContext, CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR,
                                  DATA_N * sizeof(float), h_Data, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateBuffer (Page-locked Host)\n\n"); 

    // Create buffers for each GPU, with data divided evenly among GPU's
    int sizePerGPU = DATA_N / ciDeviceCount;
    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];
    workOffset[0] = 0;
    for(unsigned int i = 0; i < ciDeviceCount; ++i ) 
    {
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (DATA_N - workOffset[i]);        

        // Input buffer
        d_Data[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateBuffer (Input)\t\tDev %i\n", i); 

        // Copy data from host to device
        ciErrNum = clEnqueueCopyBuffer(commandQueue[i], h_DataBuffer, d_Data[i], workOffset[i] * sizeof(float), 
                                      0, workSize[i] * sizeof(float), 0, NULL, NULL);        
        shrLog("clEnqueueCopyBuffer (Input)\tDev %i\n", i);

        // Output buffer
        d_Result[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, ACCUM_N * sizeof(float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateBuffer (Output)\t\tDev %i\n", i);
        
        // Create kernel
        reduceKernel[i] = clCreateKernel(cpProgram, "reduce", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateKernel\t\t\tDev %i\n", i); 
        
        // Set the args values and check for errors
        ciErrNum |= clSetKernelArg(reduceKernel[i], 0, sizeof(cl_mem), &d_Result[i]);
        ciErrNum |= clSetKernelArg(reduceKernel[i], 1, sizeof(cl_mem), &d_Data[i]);
        ciErrNum |= clSetKernelArg(reduceKernel[i], 2, sizeof(int), &workSize[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clSetKernelArg\t\t\tDev %i\n\n", i);

        workOffset[i + 1] = workOffset[i] + workSize[i];
    }

    // Set # of work items in work group and total in 1 dimensional range
    size_t localWorkSize[] = {THREAD_N};        
    size_t globalWorkSize[] = {ACCUM_N};        

    // Start timer and launch reduction kernel on each GPU, with data split between them 
    shrLog("Launching Kernels on GPU(s)...\n\n");
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {        
        ciErrNum = clEnqueueNDRangeKernel(commandQueue[i], reduceKernel[i], 1, 0, globalWorkSize, localWorkSize,
                                         0, NULL, &GPUExecution[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }
    
    // Copy result from device to host for each device
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {
        ciErrNum = clEnqueueReadBuffer(commandQueue[i], d_Result[i], CL_FALSE, 0, ACCUM_N * sizeof(float), 
                            h_SumGPU + i *  ACCUM_N, 0, NULL, &GPUDone[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // Synchronize with the GPUs and do accumulated error check
    clWaitForEvents(ciDeviceCount, GPUDone);
    shrLog("clWaitForEvents complete...\n\n"); 

    // Aggregate results for multiple GPU's and stop/log processing time
    sumGPU = 0;
    for(unsigned int i = 0; i < ciDeviceCount * ACCUM_N; i++)
    {
         sumGPU += h_SumGPU[i];
    }

    // Print Execution Times for each GPU
    #ifdef GPU_PROFILING
        shrLog("Profiling Information for GPU Processing:\n\n");
        for(unsigned int i = 0; i < ciDeviceCount; i++) 
        {
            cdDevice = oclGetDev(cxGPUContext, deviceNr[i]);
            clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            shrLog("Device %i : %s\n", deviceNr[i], cDeviceName);
            shrLog("  Reduce Kernel     : %.5f s\n", executionTime(GPUExecution[i]));
            shrLog("  Copy Device->Host : %.5f s\n\n\n", executionTime(GPUDone[i]));
        }
    #endif

    // Run the computation on the Host CPU and log processing time 
    shrLog("Launching Host/CPU C++ Computation...\n\n");
    sumCPU = 0;
    for(unsigned int i = 0; i < DATA_N; i++)
    {
        sumCPU += h_Data[i];
    }

    // Check GPU result against CPU result 
    dRelError = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU);
    shrLog("Comparing against Host/C++ computation...\n"); 
    shrLog(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);
    shrLog(" Relative Error (100.0 * Error / Golden) = %f \n\n", dRelError);

    // cleanup 
    free(source);
    free(h_Data);
    for(unsigned int i = 0; i < ciDeviceCount; ++i ) 
    {
        clReleaseKernel(reduceKernel[i]);
        clReleaseCommandQueue(commandQueue[i]);
    }
    clReleaseProgram(cpProgram);
    clReleaseContext(cxGPUContext);

    // finish
    shrQAFinishExit(argc, (const char **)argv, (dRelError < 1e-4) ? QA_PASSED : QA_FAILED);
  }
// Main function
// *********************************************************************
int main(int argc, char** argv) 
{
    shrQAStart(argc, argv);
    
    int use_gpu = 0;
    for(int i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;
          
        if(strstr(argv[i], "cpu"))
            use_gpu = 0;        

        else if(strstr(argv[i], "gpu"))
            use_gpu = 1;
    }

    // start logs
    shrSetLogFileName ("oclDXTCompression.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    cl_platform_id cpPlatform = NULL;
    cl_uint uiNumDevices = 0;
    cl_device_id *cdDevices = NULL;
    cl_context cxGPUContext;
    cl_command_queue cqCommandQueue;
    cl_program cpProgram;
    cl_kernel ckKernel;
    cl_mem cmMemObjs[3];
    cl_mem cmAlphaTable4, cmProds4;
    cl_mem cmAlphaTable3, cmProds3;
    size_t szGlobalWorkSize[1];
    size_t szLocalWorkSize[1];
    cl_int ciErrNum;

    // Get the path of the filename
    char *filename;
    if (shrGetCmdLineArgumentstr(argc, (const char **)argv, "image", &filename)) {
        image_filename = filename;
    }
    // load image
    const char* image_path = shrFindFilePath(image_filename, argv[0]);
    oclCheckError(image_path != NULL, shrTRUE);
    shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height);
    oclCheckError(h_img != NULL, shrTRUE);
    shrLog("Loaded '%s', %d x %d pixels\n\n", image_path, width, height);

    // Convert linear image to block linear. 
    const uint memSize = width * height * sizeof(cl_uint);
    uint* block_image = (uint*)malloc(memSize);

    // Convert linear image to block linear. 
    for(uint by = 0; by < height/4; by++) {
        for(uint bx = 0; bx < width/4; bx++) {
            for (int i = 0; i < 16; i++) {
                const int x = i & 3;
                const int y = i / 4;
                block_image[(by * width/4 + bx) * 16 + i] = 
                    ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x];
            }
        }
    }

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Get the platform's GPU devices
    ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 0, NULL, &uiNumDevices);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, uiNumDevices, cdDevices, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Create the context
    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // get and log device
    cl_device_id device;
    if( shrCheckCmdLineFlag(argc, (const char **)argv, "device") ) {
      int device_nr = 0;
      shrGetCmdLineArgumenti(argc, (const char **)argv, "device", &device_nr);
      device = oclGetDev(cxGPUContext, device_nr);
      if( device == (cl_device_id)-1 ) {
          shrLog(" Invalid GPU Device: devID=%d.  %d valid GPU devices detected\n\n", device_nr, uiNumDevices);
		  shrLog(" exiting...\n");
          return -1;
      }
    } else {
      device = oclGetMaxFlopsDev(cxGPUContext);
    }

    oclPrintDevName(LOGBOTH, device);
    shrLog("\n");

    // create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Memory Setup

    // Constants
    cmAlphaTable4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable4[0], &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cmProds4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods4[0], &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cmAlphaTable3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable3[0], &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cmProds3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods3[0], &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Compute permutations.
    cl_uint permutations[1024];
    computePermutations(permutations);

    // Upload permutations.
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  sizeof(cl_uint) * 1024, permutations, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Image
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    
    // Result
    const uint compressedSize = (width / 4) * (height / 4) * 8;
    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    
    unsigned int * h_result = (uint*)malloc(compressedSize);

    // Program Setup
    size_t program_length;
    const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]);
    oclCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, "", &program_length);
    oclCheckError(source != NULL, shrTRUE);

    // create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
        (const char **) &source, &program_length, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS); 
    }

    // create the kernel
    ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // set the args values
    ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]);
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]);
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmAlphaTable4);
    ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&cmProds4);
    ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&cmAlphaTable3);
    ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&cmProds3);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Copy input data host to device
    clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0);

    // Determine launch configuration and run timed computation numIterations times
	int blocks = ((width + 3) / 4) * ((height + 3) / 4); // rounds up by 1 block in each dim if %4 != 0

	// Restrict the numbers of blocks to launch on low end GPUs to avoid kernel timeout
	cl_uint compute_units;
    clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
	int blocksPerLaunch = MIN(blocks, 768 * (int)compute_units);

    // set work-item dimensions
    szGlobalWorkSize[0] = blocksPerLaunch * NUM_THREADS;
    szLocalWorkSize[0]= NUM_THREADS;

#ifdef GPU_PROFILING
    shrLog("\nRunning DXT Compression on %u x %u image...\n", width, height);
    shrLog("\n%u Workgroups, %u Work Items per Workgroup, %u Work Items in NDRange...\n\n", 
           blocks, NUM_THREADS, blocks * NUM_THREADS);

    int numIterations = 50;
    for (int i = -1; i < numIterations; ++i) {
        if (i == 0) { // start timing only after the first warmup iteration
            clFinish(cqCommandQueue); // flush command queue
            shrDeltaT(0); // start timer
        }
#endif
        // execute kernel
		for( int j=0; j<blocks; j+= blocksPerLaunch ) {
			clSetKernelArg(ckKernel, 7, sizeof(int), &j);
			szGlobalWorkSize[0] = MIN( blocksPerLaunch, blocks-j ) * NUM_THREADS;
			ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL,
				                              szGlobalWorkSize, szLocalWorkSize, 
					                          0, NULL, NULL);
			oclCheckError(ciErrNum, CL_SUCCESS);
		}

#ifdef GPU_PROFILING
    }
    clFinish(cqCommandQueue);
    double dAvgTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %d\n", 
           (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1, szLocalWorkSize[0]); 
#endif

    // blocking read output
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0,
                                   compressedSize, h_result, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Write DDS file.
    FILE* fp = NULL;
    char output_filename[1024];
    #ifdef WIN32
        strcpy_s(output_filename, 1024, image_path);
        strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds");
        fopen_s(&fp, output_filename, "wb");
    #else
        strcpy(output_filename, image_path);
        strcpy(output_filename + strlen(image_path) - 3, "dds");
        fp = fopen(output_filename, "wb");
    #endif
    oclCheckError(fp != NULL, shrTRUE);

    DDSHeader header;
    header.fourcc = FOURCC_DDS;
    header.size = 124;
    header.flags  = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE);
    header.height = height;
    header.width = width;
    header.pitch = compressedSize;
    header.depth = 0;
    header.mipmapcount = 0;
    memset(header.reserved, 0, sizeof(header.reserved));
    header.pf.size = 32;
    header.pf.flags = DDPF_FOURCC;
    header.pf.fourcc = FOURCC_DXT1;
    header.pf.bitcount = 0;
    header.pf.rmask = 0;
    header.pf.gmask = 0;
    header.pf.bmask = 0;
    header.pf.amask = 0;
    header.caps.caps1 = DDSCAPS_TEXTURE;
    header.caps.caps2 = 0;
    header.caps.caps3 = 0;
    header.caps.caps4 = 0;
    header.notused = 0;

    fwrite(&header, sizeof(DDSHeader), 1, fp);
    fwrite(h_result, compressedSize, 1, fp);

    fclose(fp);

    // Make sure the generated image matches the reference image (regression check)
    shrLog("\nComparing against Host/C++ computation...\n");     
    const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]);
    oclCheckError(reference_image_path != NULL, shrTRUE);

    // read in the reference image from file
    #ifdef WIN32
        fopen_s(&fp, reference_image_path, "rb");
    #else
        fp = fopen(reference_image_path, "rb");
    #endif
    oclCheckError(fp != NULL, shrTRUE);
    fseek(fp, sizeof(DDSHeader), SEEK_SET);
    uint referenceSize = (width / 4) * (height / 4) * 8;
    uint * reference = (uint *)malloc(referenceSize);
    fread(reference, referenceSize, 1, fp);
    fclose(fp);

    // compare the reference image data to the sample/generated image
    float rms = 0;
    for (uint y = 0; y < height; y += 4)
    {
        for (uint x = 0; x < width; x += 4)
        {
            // binary comparison of data
            uint referenceBlockIdx = ((y/4) * (width/4) + (x/4));
            uint resultBlockIdx = ((y/4) * (width/4) + (x/4));
            int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);

            // log deviations, if any
            if (cmp != 0.0f) 
            {
                compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);
                shrLog("Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3);
            }
            rms += cmp;
        }
    }
    rms /= width * height * 3;
    shrLog("RMS(reference, result) = %f\n\n", rms);

    // Free OpenCL resources
    oclDeleteMemObjs(cmMemObjs, 3);
    clReleaseMemObject(cmAlphaTable4);
    clReleaseMemObject(cmProds4);
    clReleaseMemObject(cmAlphaTable3);
    clReleaseMemObject(cmProds3);
    clReleaseKernel(ckKernel);
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(cxGPUContext);

    // Free host memory
    free(source);
    free(h_img);

    // finish
    shrQAFinishExit(argc, (const char **)argv, (rms <= ERROR_THRESHOLD) ? QA_PASSED : QA_FAILED);
}
    // Function to read in kernel from uncompiled source, create the OCL program and build the OCL program 
    // **************************************************************************************************
    int CreateProgramAndKernel(cl_context cxGPUContext, cl_device_id* cdDevices, const char *kernel_name, cl_kernel *kernel, bool bDouble)
    {
        cl_program cpProgram;
        size_t szSourceLen;
        cl_int ciErrNum = CL_SUCCESS; 

        // Read the kernel in from file
        shrLog("\nLoading Uncompiled kernel from .cl file, using %s\n", clSourcefile);
        char* cPathAndFile = shrFindFilePath(clSourcefile, cExecutablePath);
        oclCheckError(cPathAndFile != NULL, shrTRUE);
        char* pcSource = oclLoadProgSource(cPathAndFile, "", &szSourceLen);
        oclCheckError(pcSource != NULL, shrTRUE);

	// Check OpenCL version -> vec3 types are supported only from version 1.1 and above
	char cOCLVersion[32];
	clGetDeviceInfo(cdDevices[0], CL_DEVICE_VERSION, sizeof(cOCLVersion), &cOCLVersion, 0);

	int iVec3Length = 3;
	if( strncmp("OpenCL 1.0", cOCLVersion, 10) == 0 ) {
		iVec3Length = 4;
	}


		//for double precision
		char *pcSourceForDouble;
		std::stringstream header;
		if (bDouble)
		{
			header << "#define REAL double";
			header << std::endl;
			header << "#define REAL4 double4";
			header << std::endl;
			header << "#define REAL3 double" << iVec3Length;
			header << std::endl;
			header << "#define ZERO3 {0.0, 0.0, 0.0" << ((iVec3Length == 4) ? ", 0.0}" : "}");
			header << std::endl;
		}
		else
		{
			header << "#define REAL float";
			header << std::endl;
			header << "#define REAL4 float4";
			header << std::endl;
			header << "#define REAL3 float" << iVec3Length;
			header << std::endl;
			header << "#define ZERO3 {0.0f, 0.0f, 0.0f" << ((iVec3Length == 4) ? ", 0.0f}" : "}");
			header << std::endl;
		}
		
		header << pcSource;
		pcSourceForDouble = (char *)malloc(header.str().size() + 1);
		szSourceLen = header.str().size();
#ifdef WIN32
        strcpy_s(pcSourceForDouble, szSourceLen + 1, header.str().c_str());
#else
        strcpy(pcSourceForDouble, header.str().c_str());
#endif

        // create the program 
        cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&pcSourceForDouble, &szSourceLen, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateProgramWithSource\n"); 

        // Build the program with 'mad' Optimization option
#ifdef MAC
	char *flags = "-cl-fast-relaxed-math -DMAC";
#else
	char *flags = "-cl-fast-relaxed-math";
#endif
        ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
            // write out standard error, Build Log and PTX, then cleanup and exit
            shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
            oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
            oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclNbody.ptx");
            oclCheckError(ciErrNum, CL_SUCCESS); 
        }
        shrLog("clBuildProgram\n"); 

        // create the kernel
        *kernel = clCreateKernel(cpProgram, kernel_name, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS); 
        shrLog("clCreateKernel\n"); 

		size_t wgSize;
		ciErrNum = clGetKernelWorkGroupInfo(*kernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
		if (wgSize == 64) {
		  shrLog(
			 "ERROR: Minimum work-group size 256 required by this application is not supported on this device.\n");
		  exit(0);
		}
	
		free(pcSourceForDouble);

        return 0;
    }
Beispiel #27
0
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;       //OpenCL platform
    cl_device_id cdDevice;           //OpenCL device
    cl_context       cxGPUContext;   //OpenCL context
    cl_command_queue cqCommandQueue; //OpenCL command que
    cl_mem      d_Input, d_Output;   //OpenCL memory buffer objects

    cl_int ciErrNum;

    float *h_Input, *h_OutputCPU, *h_OutputGPU;

    const uint
        imageW = 2048,
        imageH = 2048,
        stride = 2048;

    const int dir = DCT_FORWARD;

    shrQAStart(argc, argv);


    int use_gpu = 0;
    for(int i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;
          
        if(strstr(argv[i], "cpu"))
            use_gpu = 0;        

        else if(strstr(argv[i], "gpu"))
            use_gpu = 1;
    }

    // set logfile name and start logs
    shrSetLogFileName ("oclDCT8x8.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Allocating and initializing host memory...\n");
        h_Input     = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float));
        srand(2009);
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++)
                h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX;

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get a GPU device
        ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 1, &cdDevice, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL DCT 8x8...\n");
        initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride *  sizeof(cl_float), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW);
        //Just a single iteration or a warmup iteration
        DCT8x8(
            cqCommandQueue,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++)
        DCT8x8(
            NULL,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", 
            (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); 

    //Get profiler time
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n"); 
        DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir);
        double sum = 0, delta = 0;
        double L2norm;
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++){
                sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j];
                delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]);
            }
        L2norm = sqrt(delta / sum);
        shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    shrLog("Shutting down...\n");
        //Release kernels and program
        closeDCT8x8();

        //Release other OpenCL objects
        ciErrNum  = clReleaseMemObject(d_Output);
        ciErrNum |= clReleaseMemObject(d_Input);
        ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
        ciErrNum |= clReleaseContext(cxGPUContext);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Release host buffers
        free(h_OutputGPU);
        free(h_OutputCPU);
        free(h_Input);

        //Finish
        shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-6) ? QA_PASSED : QA_FAILED);
}
Beispiel #28
0
/**
 * Helper function that will load the OpenCL source program, build and return a handle to that OpenCL kernel
 * @param context - the OpenCL context
 * @param device - the device to compile for
 * @param program - the program that is being built (in/out)
 * @param sourcePath - full path to the source file
 * @param options - build options, e.g. define flags("-D name=value")
 * @return an error code on failure, 0 on success
 */
cl_program CLState::compileOCLProgram(const char* sourcePath,
		const std::string& options) {
	cl_int errNum;

	size_t program_length;
	oclCheckError(sourcePath != NULL, shrTRUE);
	char *source = oclLoadProgSource(sourcePath, "", &program_length);
	if (!source) {
		shrLog("Error: Failed to load compute program %s!\n", sourcePath);
		BOOST_THROW_EXCEPTION(
				runtime_error()
						<< error_message(
								(std::string(
										"Error: Failed to load cl program source from ")
										+ sourcePath).c_str()));
	}

	// create the simple increment OpenCL program
	cl_program program = clCreateProgramWithSource(context, 1,
			(const char **) &source, &program_length, &errNum);
	if (errNum != CL_SUCCESS) {
		shrLog("Error: Failed to create program\n");
		BOOST_THROW_EXCEPTION(runtime_error() << cl_error_code(errNum));
	} else {
		shrLog("clCreateProgramWithSource <%s> succeeded, program_length=%d\n",
				sourcePath, program_length);
	}
	free(source);

	// build the program
	cl_build_status build_status = CL_SUCCESS;

	errNum =
			clBuildProgram(program, 0, NULL,
					std::string(
							"-cl-fast-relaxed-math -cl-nv-verbose" + options).c_str(),
					NULL, NULL);
	if (errNum != CL_SUCCESS) {
		// write out standard error, Build Log and PTX, then return error
		shrLogEx(LOGBOTH | ERRORMSG, errNum, STDERROR);
		oclLogBuildInfo(program, oclGetFirstDev(context));
		oclLogPtx(program, oclGetFirstDev(context), "build_error.ptx");
		BOOST_THROW_EXCEPTION(runtime_error() << cl_error_code(errNum));
	} else {
		shrLog("clBuildProgram <%s> succeeded\n", sourcePath);
		if (this->execDevices != NULL) {
			for (uint iDevice = 0;
					iDevice < this->execDeviceCount
							&& build_status == CL_SUCCESS
							&& errNum == CL_SUCCESS; iDevice++) {
				cl_device_id device = this->execDevices[iDevice];
				errNum = clGetProgramBuildInfo(program, device,
				CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status,
				NULL);
				shrLog("clGetProgramBuildInfo returned: ");
				if (build_status == CL_SUCCESS) {
					shrLog("CL_SUCCESS\n");
				} else {
					shrLog("CLErrorNumber = %d\n", errNum);
				}
				// print out the build log, note in the case where there is nothing shown, some OpenCL PTX->SASS caching has happened
				{
					char *build_log;
					size_t ret_val_size;

					errNum = clGetProgramBuildInfo(program, device,
					CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);

					if (errNum != CL_SUCCESS) {
						shrLog(
								"clGetProgramBuildInfo device %d, failed to get the log size at line %d\n",
								device, __LINE__);
					}
					build_log = (char *) malloc(ret_val_size + 1);

					errNum = clGetProgramBuildInfo(program, device,
					CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);

					if (errNum != CL_SUCCESS) {
						shrLog(
								"clGetProgramBuildInfo device %d, failed to get the build log at line %d\n",
								device, __LINE__);
					}
					// to be carefully, terminate with \0
					// there's no information in the reference whether the string is 0 terminated or not
					build_log[ret_val_size] = '\0';
					shrLog("%s\n", build_log);
				}
			}
		}
	}
	this->programs.push_back(program);
	return program;
}
//////////////////////////////////////////////////////////////////////////////
//! Releases the global context and command queue
//!
//! @param clContext     the context to release
//! @param clQueue       the command queue to release
//////////////////////////////////////////////////////////////////////////////
void oclShutdown(cl_context clContext, cl_command_queue clQueue) {
  cl_int ciErrNum;
  ciErrNum  = clReleaseCommandQueue(clQueue);
  ciErrNum |= clReleaseContext(clContext);
  oclCheckError(ciErrNum, CL_SUCCESS);
}
    void IntegrateNbodySystem(cl_command_queue cqCommandQueue,
                              cl_kernel MT_kernel, cl_kernel noMT_kernel,
                              cl_mem newPos, cl_mem newVel,
                              cl_mem oldPos, cl_mem oldVel,
                              cl_mem pboCLOldPos, cl_mem pboCLNewPos,
                              float deltaTime, float damping, float softSq,
                              int numBodies, int p, int q,
                              int bUsePBO, bool bDouble)
    {
        int sharedMemSize;

		//for double precision
		if (bDouble)
		{
			sharedMemSize = p * q * sizeof(cl_double4); // 4 doubles for pos
		}
		else
		{
			sharedMemSize = p * q * sizeof(cl_float4); // 4 floats for pos
		}

        size_t global_work_size[2];
        size_t local_work_size[2];
        cl_int ciErrNum = CL_SUCCESS;
        cl_kernel kernel;

        // When the numBodies / thread block size is < # multiprocessors 
        // (16 on G80), the GPU is underutilized. For example, with 256 threads per
        // block and 1024 bodies, there will only be 4 thread blocks, so the 
        // GPU will only be 25% utilized.  To improve this, we use multiple threads
        // per body.  We still can use blocks of 256 threads, but they are arranged
        // in q rows of p threads each.  Each thread processes 1/q of the forces 
        // that affect each body, and then 1/q of the threads (those with 
        // threadIdx.y==0) add up the partial sums from the other threads for that 
        // body.  To enable this, use the "--p=" and "--q=" command line options to
        // this example.  e.g.: "nbody.exe --n=1024 --p=64 --q=4" will use 4 
        // threads per body and 256 threads per block. There will be n/p = 16 
        // blocks, so a G80 GPU will be 100% utilized.

        if (q == 1)
        {
            kernel = MT_kernel;
        }
        else
        {
            kernel = noMT_kernel;
        }

        if (bUsePBO)
        {
            ciErrNum = clEnqueueAcquireGLObjects(cqCommandQueue, 1, &pboCLOldPos, 0, NULL, NULL);
            ciErrNum |= clEnqueueAcquireGLObjects(cqCommandQueue, 1, &pboCLNewPos, 0, NULL, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
        }

	    ciErrNum |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&newPos);
        ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&newVel);
        ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&oldPos);
        ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&oldVel);
		if (bDouble)
		{
			double ddeltaTime = (double)deltaTime;
			double ddamping = (double)damping;
			double dsoftSq = (double)softSq;
			ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_double), (void *)&ddeltaTime);
			ciErrNum |= clSetKernelArg(kernel, 5, sizeof(cl_double), (void *)&ddamping);
			ciErrNum |= clSetKernelArg(kernel, 6, sizeof(cl_double), (void *)&dsoftSq);
		}
		else
		{
			ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&deltaTime);
			ciErrNum |= clSetKernelArg(kernel, 5, sizeof(cl_float), (void *)&damping);
			ciErrNum |= clSetKernelArg(kernel, 6, sizeof(cl_float), (void *)&softSq);
		}
        ciErrNum |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&numBodies);
        ciErrNum |= clSetKernelArg(kernel, 8, sharedMemSize, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

        // set work-item dimensions
        local_work_size[0] = p;
        local_work_size[1] = q;
        global_work_size[0]= numBodies;
        global_work_size[1]= q;

        // execute the kernel:
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

        if (bUsePBO)
        {
            ciErrNum = clEnqueueReleaseGLObjects(cqCommandQueue, 1, &pboCLNewPos, 0, NULL, NULL);
            ciErrNum |= clEnqueueReleaseGLObjects(cqCommandQueue, 1, &pboCLOldPos,  0, NULL, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
        }
    }