////////////////////////////////////////////////////////////////////////////////
// OpenCL Black-Scholes kernel launcher
////////////////////////////////////////////////////////////////////////////////
extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call, //Call option price
    cl_mem d_Put,  //Put option price
    cl_mem d_S,    //Current stock price
    cl_mem d_X,    //Option strike price
    cl_mem d_T,    //Option years
    cl_float R,    //Riskless rate of return
    cl_float V,    //Stock volatility
    cl_uint optionCount
){
    cl_int ciErrNum;

    if(!cqCommandQueue)
        cqCommandQueue = cqDefaultCommandQueue;

    ciErrNum  = clSetKernelArg(ckBlackScholes, 0, sizeof(cl_mem),   (void *)&d_Call);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 1, sizeof(cl_mem),   (void *)&d_Put);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 2, sizeof(cl_mem),   (void *)&d_S);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 3, sizeof(cl_mem),   (void *)&d_X);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 4, sizeof(cl_mem),   (void *)&d_T);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 5, sizeof(cl_float), (void *)&R);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 6, sizeof(cl_float), (void *)&V);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 7, sizeof(cl_uint),  (void *)&optionCount);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Run the kernel
    size_t globalWorkSize = 60 * 1024;
	size_t localWorkSize = 128;
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
}
void BodySystemCPU::update(float deltaTime)
{
    shrCheckError(m_bInitialized, shrTRUE);

    _integrateNBodySystem(deltaTime);
    std::swap(m_currentRead, m_currentWrite);
}
extern "C" void closeHistogram64(void){
    cl_int ciErrNum;

    ciErrNum  = clReleaseMemObject(d_PartialHistograms);
    ciErrNum |= clReleaseKernel(ckMergeHistogram64);
    ciErrNum |= clReleaseKernel(ckHistogram64);
    ciErrNum |= clReleaseProgram(cpHistogram64);
    shrCheckError(ciErrNum, CL_SUCCESS);
}
void BodySystemCPU::_finalize()
{
    shrCheckError(m_bInitialized, shrTRUE);

    delete [] m_pos[0];
    delete [] m_pos[1];
    delete [] m_vel[0];
    delete [] m_vel[1];
    delete [] m_force;
}
extern "C" void initHistogram64(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog("...loading Histogram64.cl from file\n");
        char *cHistogram64 = oclLoadProgSource(shrFindFilePath("Histogram64.cl", argv[0]), "// My comment\n", &kernelLength);
        shrCheckError(cHistogram64 != NULL, shrTRUE);

    shrLog("...creating histogram64 program\n");
         cpHistogram64 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cHistogram64, &kernelLength, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building histogram64 program\n");
        ciErrNum = clBuildProgram(cpHistogram64, 0, NULL, compileOptions, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...creating histogram64 kernels\n");
        ckHistogram64 = clCreateKernel(cpHistogram64, "histogram64", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);
        ckMergeHistogram64 = clCreateKernel(cpHistogram64, "mergeHistogram64", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...allocating internal histogram64 buffer\n");
        d_PartialHistograms = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, MAX_PARTIAL_HISTOGRAM64_COUNT * HISTOGRAM64_BIN_COUNT * sizeof(uint), NULL, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    //Save default command queue
    cqDefaultCommandQue = cqParamCommandQue;

    //Discard temp storage
    free(cHistogram64);

    //Save ptx code to separate file
    oclLogPtx(cpHistogram64, oclGetFirstDev(cxGPUContext), "Histogram64.ptx");
}
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog(LOGBOTH, 0, "...loading BlackScholes.cl\n");
        char *cBlackScholes = oclLoadProgSource(shrFindFilePath("BlackScholes.cl", argv[0]), "// My comment\n", &kernelLength);
        shrCheckError(cBlackScholes != NULL, shrTRUE);

    shrLog(LOGBOTH, 0, "...creating BlackScholes program\n");
        cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog(LOGBOTH, 0, "...building BlackScholes program\n");
        ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, NULL, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog(LOGBOTH, 0, "...creating BlackScholes kernels\n");
        ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cBlackScholes);
}
extern "C" size_t histogram256(cl_command_queue cqCommandQueue, cl_mem d_Histogram, cl_mem d_Data, uint byteCount){
    cl_int ciErrNum;
    size_t localWorkSize, globalWorkSize;

    if(!cqCommandQueue)
        cqCommandQueue = cqDefaultCommandQue;

    {
        shrCheckError( ((byteCount % 4) == 0), shrTRUE );
        uint dataCount = byteCount / 4;
        ciErrNum  = clSetKernelArg(ckHistogram256, 0, sizeof(cl_mem),  (void *)&d_PartialHistograms);
        ciErrNum |= clSetKernelArg(ckHistogram256, 1, sizeof(cl_mem),  (void *)&d_Data);
        ciErrNum |= clSetKernelArg(ckHistogram256, 2, sizeof(cl_uint), (void *)&dataCount);
        shrCheckError(ciErrNum, CL_SUCCESS);

        localWorkSize  = WARP_SIZE * WARP_COUNT;
        globalWorkSize = PARTIAL_HISTOGRAM256_COUNT * localWorkSize;

        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckHistogram256, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
    }

    {
        ciErrNum  = clSetKernelArg(ckMergeHistogram256, 0, sizeof(cl_mem),  (void *)&d_Histogram);
        ciErrNum |= clSetKernelArg(ckMergeHistogram256, 1, sizeof(cl_mem),  (void *)&d_PartialHistograms);
        ciErrNum |= clSetKernelArg(ckMergeHistogram256, 2, sizeof(cl_uint), (void *)&PARTIAL_HISTOGRAM256_COUNT);
        shrCheckError(ciErrNum, CL_SUCCESS);

        localWorkSize  = MERGE_WORKGROUP_SIZE;
        globalWorkSize = HISTOGRAM256_BIN_COUNT * localWorkSize;

        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckMergeHistogram256, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        return (WARP_SIZE * WARP_COUNT);
    }
}
void BodySystemCPU::setArray(BodyArray array, const float* data)
{
    shrCheckError(m_bInitialized, shrTRUE);
    float* target = 0;

    switch (array)
    {
    default:
    case BODYSYSTEM_POSITION:
        target = m_pos[m_currentRead];
        break;
    case BODYSYSTEM_VELOCITY:
        target = m_vel[m_currentRead];
        break;
    }

    memcpy(target, data, m_numBodies*4*sizeof(float));
}
float* BodySystemCPU::getArray(BodyArray array)
{
    shrCheckError(m_bInitialized, shrTRUE);

    float* data = 0;
    switch (array)
    {
    default:
    case BODYSYSTEM_POSITION:
        data = m_pos[m_currentRead];
        break;
    case BODYSYSTEM_VELOCITY:
        data = m_vel[m_currentRead];
        break;
    }

    return data;
}
void BodySystemCPU::_initialize(int numBodies)
{
    shrCheckError(m_bInitialized, shrFALSE);

    m_numBodies = numBodies;

    m_pos[0] = new float[m_numBodies*4];
    m_pos[1] = new float[m_numBodies*4];
    m_vel[0] = new float[m_numBodies*4];
    m_vel[1] = new float[m_numBodies*4];
    m_force  = new float[m_numBodies*4];

    memset(m_pos[0], 0, m_numBodies*4*sizeof(float));
    memset(m_pos[1], 0, m_numBodies*4*sizeof(float));
    memset(m_vel[0], 0, m_numBodies*4*sizeof(float));
    memset(m_vel[1], 0, m_numBodies*4*sizeof(float));
    memset(m_force, 0, m_numBodies*4*sizeof(float));

    m_bInitialized = true;
}
extern "C" size_t histogram64(
    cl_command_queue cqCommandQueue,
    cl_mem d_Histogram,
    cl_mem d_Data,
    uint byteCount
){
    cl_int ciErrNum;
    uint histogramCount;
    size_t localWorkSize, globalWorkSize;

    if(!cqCommandQueue)
        cqCommandQueue = cqDefaultCommandQue;

    {
        histogramCount = iDivUp(byteCount, HISTOGRAM64_WORKGROUP_SIZE * iSnapDown(255, 16));
        shrCheckError( (byteCount % 16 == 0), shrTRUE );
        shrCheckError( (histogramCount <= MAX_PARTIAL_HISTOGRAM64_COUNT), shrTRUE );
        cl_uint dataCount = byteCount / 16;

        ciErrNum  = clSetKernelArg(ckHistogram64, 0, sizeof(cl_mem),  (void *)&d_PartialHistograms);
        ciErrNum |= clSetKernelArg(ckHistogram64, 1, sizeof(cl_mem),  (void *)&d_Data);
        ciErrNum |= clSetKernelArg(ckHistogram64, 2, sizeof(cl_uint), (void *)&dataCount);
        shrCheckError(ciErrNum, CL_SUCCESS);

        localWorkSize = HISTOGRAM64_WORKGROUP_SIZE;
        globalWorkSize = histogramCount * HISTOGRAM64_WORKGROUP_SIZE;

        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckHistogram64, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
    }

    {
        ciErrNum  = clSetKernelArg(ckMergeHistogram64, 0, sizeof(cl_mem),  (void *)&d_Histogram);
        ciErrNum |= clSetKernelArg(ckMergeHistogram64, 1, sizeof(cl_mem),  (void *)&d_PartialHistograms);
        ciErrNum |= clSetKernelArg(ckMergeHistogram64, 2, sizeof(cl_uint), (void *)&histogramCount);
        shrCheckError(ciErrNum, CL_SUCCESS);

        localWorkSize = MERGE_WORKGROUP_SIZE;
        globalWorkSize = HISTOGRAM64_BIN_COUNT * MERGE_WORKGROUP_SIZE;

        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckMergeHistogram64, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        return HISTOGRAM64_WORKGROUP_SIZE;
    }
}
Exemple #12
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;
    cl_device_id cdDevice;
    cl_context cxGPUContext;                        //OpenCL context
    cl_command_queue cqCommandQueue;                //OpenCL command queue
    cl_mem c_Kernel, d_Input, d_Buffer, d_Output;   //OpenCL memory buffer objects
    cl_float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU;

    cl_int ciErrNum;

    const unsigned int imageW = 3072;
    const unsigned int imageH = 3072;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclConvolutionSeparable.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Allocating and initializing host memory...\n");
        h_Kernel    = (cl_float *)malloc(KERNEL_LENGTH * sizeof(cl_float));
        h_Input     = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
        h_Buffer    = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
        h_OutputCPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
        h_OutputGPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));

        srand(2009);
        for(unsigned int i = 0; i < KERNEL_LENGTH; i++)
            h_Kernel[i] = (cl_float)(rand() % 16);

        for(unsigned int i = 0; i < imageW * imageH; i++)
            h_Input[i] = (cl_float)(rand() % 16);

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get the devices
        ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL separable convolution...\n");
        initConvolutionSeparable(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
        c_Kernel = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, KERNEL_LENGTH * sizeof(cl_float), h_Kernel, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageW * imageH * sizeof(cl_float), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Applying separable convolution to %u x %u image...\n\n", imageW, imageH);
        //Just a single run or a warmup iteration
        convolutionRows(
            NULL,
            d_Buffer,
            d_Input,
            c_Kernel,
            imageW,
            imageH
        );

        convolutionColumns(
            NULL,
            d_Output,
            d_Buffer,
            c_Kernel,
            imageW,
            imageH
        );

#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++){
        convolutionRows(
            cqCommandQueue,
            d_Buffer,
            d_Input,
            c_Kernel,
            imageW,
            imageH
        );

        convolutionColumns(
            cqCommandQueue,
            d_Output,
            d_Buffer,
            c_Kernel,
            imageW,
            imageH
        );
    }
    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclConvolutionSeparable, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n",
            (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0);

    //Get OpenCL profiler  info
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime)/ (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageW * imageH * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n"); 
        convolutionRowHost(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS);
        convolutionColumnHost(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS);
        double sum = 0, delta = 0;
        double L2norm;
        for(unsigned int i = 0; i < imageW * imageH; i++){
            delta += (h_OutputCPU[i] - h_OutputGPU[i]) * (h_OutputCPU[i] - h_OutputGPU[i]);
            sum += h_OutputCPU[i] * h_OutputCPU[i];
        }
        L2norm = sqrt(delta / sum);
        shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    // cleanup
    closeConvolutionSeparable();
    ciErrNum  = clReleaseMemObject(d_Output);
    ciErrNum |= clReleaseMemObject(d_Buffer);
    ciErrNum |= clReleaseMemObject(d_Input);
    ciErrNum |= clReleaseMemObject(c_Kernel);
    ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
    ciErrNum |= clReleaseContext(cxGPUContext);
    oclCheckError(ciErrNum, CL_SUCCESS);

    free(h_OutputGPU);
    free(h_OutputCPU);
    free(h_Buffer);
    free(h_Input);
    free(h_Kernel);

    // finish
    shrQAFinishExit(argc, (const char **)argv, (L2norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
extern "C" void closeBlackScholes(void){
    cl_int ciErrNum;
    ciErrNum  = clReleaseKernel(ckBlackScholes);
    ciErrNum |= clReleaseProgram(cpBlackScholes);
    shrCheckError(ciErrNum, CL_SUCCESS);
}
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog("...loading BlackScholes.cl\n");
        char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
        shrCheckError(cPathAndName != NULL, shrTRUE);
        char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
        shrCheckError(cBlackScholes != NULL, shrTRUE);

    shrLog("...creating BlackScholes program\n");
        cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building BlackScholes program\n");
        ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);

        if(ciErrNum != CL_BUILD_SUCCESS){
            shrLog("*** Compilation failure ***\n");

            size_t deviceNum;
            cl_device_id *cdDevices;
            ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &deviceNum);
            shrCheckError(ciErrNum, CL_SUCCESS);

            cdDevices = (cl_device_id *)malloc(deviceNum * sizeof(cl_device_id));
            shrCheckError(cdDevices != NULL, shrTRUE);

            ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, deviceNum * sizeof(cl_device_id), cdDevices, NULL);
            shrCheckError(ciErrNum, CL_SUCCESS);

            size_t logSize;
            char *logTxt;

            ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
            shrCheckError(ciErrNum, CL_SUCCESS);

            logTxt = (char *)malloc(logSize);
            shrCheckError(logTxt != NULL, shrTRUE);

            ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, logSize, logTxt, NULL);
            shrCheckError(ciErrNum, CL_SUCCESS);

            shrLog("%s\n", logTxt);
            shrLog("*** Exiting ***\n");
            free(logTxt);
            free(cdDevices);
            exit(666);
        }

    //Save ptx code to separate file
    oclLogPtx(cpBlackScholes, oclGetFirstDev(cxGPUContext), "BlackScholes.ptx");

    shrLog("...creating BlackScholes kernels\n");
        ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cBlackScholes);
    free(cPathAndName);
}
Exemple #15
0
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;       //OpenCL platform
    cl_device_id cdDevice;           //OpenCL device
    cl_context       cxGPUContext;   //OpenCL context
    cl_command_queue cqCommandQueue; //OpenCL command que
    cl_mem      d_Input, d_Output;   //OpenCL memory buffer objects

    cl_int ciErrNum;

    float *h_Input, *h_OutputCPU, *h_OutputGPU;

    const uint
        imageW = 2048,
        imageH = 2048,
        stride = 2048;

    const int dir = DCT_FORWARD;

    shrQAStart(argc, argv);


    int use_gpu = 0;
    for(int i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;
          
        if(strstr(argv[i], "cpu"))
            use_gpu = 0;        

        else if(strstr(argv[i], "gpu"))
            use_gpu = 1;
    }

    // set logfile name and start logs
    shrSetLogFileName ("oclDCT8x8.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Allocating and initializing host memory...\n");
        h_Input     = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float));
        srand(2009);
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++)
                h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX;

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get a GPU device
        ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 1, &cdDevice, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL DCT 8x8...\n");
        initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride *  sizeof(cl_float), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW);
        //Just a single iteration or a warmup iteration
        DCT8x8(
            cqCommandQueue,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++)
        DCT8x8(
            NULL,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", 
            (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); 

    //Get profiler time
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n"); 
        DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir);
        double sum = 0, delta = 0;
        double L2norm;
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++){
                sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j];
                delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]);
            }
        L2norm = sqrt(delta / sum);
        shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    shrLog("Shutting down...\n");
        //Release kernels and program
        closeDCT8x8();

        //Release other OpenCL objects
        ciErrNum  = clReleaseMemObject(d_Output);
        ciErrNum |= clReleaseMemObject(d_Input);
        ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
        ciErrNum |= clReleaseContext(cxGPUContext);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Release host buffers
        free(h_OutputGPU);
        free(h_OutputCPU);
        free(h_Input);

        //Finish
        shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-6) ? QA_PASSED : QA_FAILED);
}
// Main function
// *********************************************************************
int main(const int argc, const char** argv) 
{
    // start logs
    shrSetLogFileName ("oclDXTCompression.txt");
    shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); 

    cl_context cxGPUContext;
    cl_command_queue cqCommandQueue;
    cl_program cpProgram;
    cl_kernel ckKernel;
    cl_mem cmMemObjs[3];
    size_t szGlobalWorkSize[1];
    size_t szLocalWorkSize[1];
    cl_int ciErrNum;

    // Get the path of the filename
    char *filename;
    if (shrGetCmdLineArgumentstr(argc, argv, "image", &filename)) {
        image_filename = filename;
    }
    // load image
    const char* image_path = shrFindFilePath(image_filename, argv[0]);
    shrCheckError(image_path != NULL, shrTRUE);
    shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height);
    shrCheckError(h_img != NULL, shrTRUE);
    shrLog(LOGBOTH, 0, "Loaded '%s', %d x %d pixels\n", image_path, width, height);

    // Convert linear image to block linear. 
    uint * block_image = (uint *) malloc(width * height * 4);

    // Convert linear image to block linear. 
    for(uint by = 0; by < height/4; by++) {
        for(uint bx = 0; bx < width/4; bx++) {
            for (int i = 0; i < 16; i++) {
                const int x = i & 3;
                const int y = i / 4;
                block_image[(by * width/4 + bx) * 16 + i] = 
                    ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x];
            }
        }
    }

    // create the OpenCL context on a GPU device
    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // get and log device
    cl_device_id device;
    if( shrCheckCmdLineFlag(argc, argv, "device") ) {
      int device_nr = 0;
      shrGetCmdLineArgumenti(argc, argv, "device", &device_nr);
      device = oclGetDev(cxGPUContext, device_nr);
    } else {
      device = oclGetMaxFlopsDev(cxGPUContext);
    }
    oclPrintDevInfo(LOGBOTH, device);

    // create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // Memory Setup

    // Compute permutations.
    cl_uint permutations[1024];
    computePermutations(permutations);

    // Upload permutations.
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  sizeof(cl_uint) * 1024, permutations, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // Image
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,
                                  sizeof(cl_uint) * width * height, NULL, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);
    
    // Result
    const uint compressedSize = (width / 4) * (height / 4) * 8;

    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
                                  compressedSize, NULL , &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);
    
    unsigned int * h_result = (uint *)malloc(compressedSize);

    // Program Setup
    size_t program_length;
    const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]);
    shrCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, "", &program_length);
    shrCheckError(source != NULL, shrTRUE);

    // create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
        (const char **) &source, &program_length, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-mad-enable", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLog(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx");
        shrCheckError(ciErrNum, CL_SUCCESS); 
    }

    // create the kernel
    ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // set the args values
    ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]);
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]);
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(float) * 4 * 16, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(float) * 4 * 16, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int) * 64, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(float) * 16 * 6, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(unsigned int) * 160, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(int) * 16, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog(LOGBOTH, 0, "Running DXT Compression on %u x %u image...\n\n", width, height);

    // Upload the image
    clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0);

    // set work-item dimensions
    szGlobalWorkSize[0] = width * height * (NUM_THREADS/16);
    szLocalWorkSize[0]= NUM_THREADS;
    
#ifdef GPU_PROFILING
    int numIterations = 100;
    for (int i = -1; i < numIterations; ++i) {
        if (i == 0) { // start timing only after the first warmup iteration
            clFinish(cqCommandQueue); // flush command queue
            shrDeltaT(0); // start timer
        }
#endif
        // execute kernel
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL,
                                          szGlobalWorkSize, szLocalWorkSize, 
                                          0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
#ifdef GPU_PROFILING
    }
    clFinish(cqCommandQueue);
    double dAvgTime = shrDeltaT(0) / (double)numIterations;
    shrLog(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %i\n", 
        (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1); 

#endif

    // blocking read output
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0,
                                   compressedSize, h_result, 0, NULL, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // Write DDS file.
    FILE* fp = NULL;
    char output_filename[1024];
    #ifdef WIN32
        strcpy_s(output_filename, 1024, image_path);
        strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds");
        fopen_s(&fp, output_filename, "wb");
    #else
        strcpy(output_filename, image_path);
        strcpy(output_filename + strlen(image_path) - 3, "dds");
        fp = fopen(output_filename, "wb");
    #endif
    shrCheckError(fp != NULL, shrTRUE);

    DDSHeader header;
    header.fourcc = FOURCC_DDS;
    header.size = 124;
    header.flags  = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE);
    header.height = height;
    header.width = width;
    header.pitch = compressedSize;
    header.depth = 0;
    header.mipmapcount = 0;
    memset(header.reserved, 0, sizeof(header.reserved));
    header.pf.size = 32;
    header.pf.flags = DDPF_FOURCC;
    header.pf.fourcc = FOURCC_DXT1;
    header.pf.bitcount = 0;
    header.pf.rmask = 0;
    header.pf.gmask = 0;
    header.pf.bmask = 0;
    header.pf.amask = 0;
    header.caps.caps1 = DDSCAPS_TEXTURE;
    header.caps.caps2 = 0;
    header.caps.caps3 = 0;
    header.caps.caps4 = 0;
    header.notused = 0;

    fwrite(&header, sizeof(DDSHeader), 1, fp);
    fwrite(h_result, compressedSize, 1, fp);

    fclose(fp);

    // Make sure the generated image matches the reference image (regression check)
    shrLog(LOGBOTH, 0, "\nComparing against Host/C++ computation...\n");     
    const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]);
    shrCheckError(reference_image_path != NULL, shrTRUE);

    // read in the reference image from file
    #ifdef WIN32
        fopen_s(&fp, reference_image_path, "rb");
    #else
        fp = fopen(reference_image_path, "rb");
    #endif
    shrCheckError(fp != NULL, shrTRUE);
    fseek(fp, sizeof(DDSHeader), SEEK_SET);
    uint referenceSize = (width / 4) * (height / 4) * 8;
    uint * reference = (uint *)malloc(referenceSize);
    fread(reference, referenceSize, 1, fp);
    fclose(fp);

    // compare the reference image data to the sample/generated image
    float rms = 0;
    for (uint y = 0; y < height; y += 4)
    {
        for (uint x = 0; x < width; x += 4)
        {
            // binary comparison of data
            uint referenceBlockIdx = ((y/4) * (width/4) + (x/4));
            uint resultBlockIdx = ((y/4) * (width/4) + (x/4));
            int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);

            // log deviations, if any
            if (cmp != 0.0f) 
            {
                compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);
                shrLog(LOGBOTH, 0, "Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3);
            }
            rms += cmp;
        }
    }
    rms /= width * height * 3;
    shrLog(LOGBOTH, 0, "RMS(reference, result) = %f\n\n", rms);
    shrLog(LOGBOTH, 0, "TEST %s\n\n", (rms <= ERROR_THRESHOLD) ? "PASSED" : "FAILED !!!");

    // Free OpenCL resources
    oclDeleteMemObjs(cmMemObjs, 3);
    clReleaseKernel(ckKernel);
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(cxGPUContext);

    // Free host memory
    free(source);
    free(h_img);

    // finish
    shrEXIT(argc, argv);
}