Example #1
void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&kernel_timer);

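    // Release the host image and device buffers; pointers are nulled so a
    // repeated cleanup() call cannot free them twice.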
    if (h_img)
    {
        free(h_img);
        h_img=NULL;
    }

    if (d_img)
    {
        cudaFree(d_img);
        d_img=NULL;
    }

    if (d_temp)
    {
        cudaFree(d_temp);
        d_temp=NULL;
    }

    // Refer to boxFilter_kernel.cu for implementation
    freeTextures();

    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glDeleteBuffersARB(1, &pbo);
    glDeleteTextures(1, &texid);
    glDeleteProgramsARB(1, &shader);
}
Example #2
void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&kernel_timer);

    if (hImage)
    {
        free(hImage);
    }

    freeTextures();

    //DEPRECATED: checkCudaErrors(cudaGLUnregisterBufferObject(pbo));
    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glDeleteBuffersARB(1, &pbo);
    glDeleteTextures(1, &texid);
    glDeleteProgramsARB(1, &shader);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
}
Example #3
void copy_image(PPM_IMG img_in)
{
    StopWatchInterface *timer=NULL;

    PPM_IMG host_img;
    PPM_IMG device_img;

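    // One color plane is w * h single-byte samples; R, G and B are handled
    // as three separate planes of this size.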
    int size = img_in.w * img_in.h * sizeof(unsigned char);

    host_img.w = img_in.w;
    host_img.h = img_in.h;
    host_img.img_r = (unsigned char *)malloc(size);
    host_img.img_g = (unsigned char *)malloc(size);
    host_img.img_b = (unsigned char *)malloc(size);

    device_img.w = img_in.w;
    device_img.h = img_in.h;
    cudaMalloc((void **)&(device_img.img_r), size);
    cudaMalloc((void **)&(device_img.img_g), size);
    cudaMalloc((void **)&(device_img.img_b), size);

    launchEmptyKernel();    // launch an empty kernel (warm-up so CUDA initialization is not included in the timings)
    printf("Starting copy image...\n");

    // CPU to GPU
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    cudaMemcpy(device_img.img_r, img_in.img_r, size, cudaMemcpyHostToDevice);
    cudaMemcpy(device_img.img_g, img_in.img_g, size, cudaMemcpyHostToDevice);
    cudaMemcpy(device_img.img_b, img_in.img_b, size, cudaMemcpyHostToDevice);
    sdkStopTimer(&timer);
    printf("Time of copy image from CPU to GPU: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // GPU to CPU
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    cudaMemcpy(host_img.img_r, device_img.img_r, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_img.img_g, device_img.img_g, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_img.img_b, device_img.img_b, size, cudaMemcpyDeviceToHost);
    sdkStopTimer(&timer);
    printf("Time of copy image from GPU to CPU: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    cudaFree(device_img.img_r);
    cudaFree(device_img.img_g);
    cudaFree(device_img.img_b);

    free(host_img.img_r);
    free(host_img.img_g);
    free(host_img.img_b);
}
Example #4
void runAutoTest(const char *ref_file, char *exec_path)
{
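    // Allocate the device framebuffer: width * height RGBA8 pixels (4 bytes per pixel)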
    checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(GLubyte)*4));

    // render the volumeData
    render_kernel(gridSize, blockSize, d_output, width, height, w);

    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("render_kernel failed");

    void *h_output = malloc(width*height*sizeof(GLubyte)*4);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*sizeof(GLubyte)*4, cudaMemcpyDeviceToHost));
    sdkDumpBin(h_output, width*height*sizeof(GLubyte)*4, "simpleTexture3D.bin");

    bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), width*height,
                                              MAX_EPSILON_ERROR, THRESHOLD, exec_path);

    checkCudaErrors(cudaFree(d_output));
    free(h_output);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    sdkStopTimer(&timer);
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #5
template <class T>
void siTest(T *d_ptclA, T *d_ptclA_new, T *d_wghtA, unsigned int size, int stateDim)
{
    int blocks, threads;
    float elapsedTimeInMs = 0.0f;
    threads = BLOCK_SIZE;
    blocks = (size + threads - 1) / threads;
#ifdef NVS
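    // Shrink the grid (and grow the block) until the block count fits under GRID_LIMIT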
    while (blocks > GRID_LIMIT){
        blocks >>= 1;
        threads <<= 1;
    }
#endif
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

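    // Each timed iteration is bracketed by cudaDeviceSynchronize() so only the
    // SI<T> launch itself is measured, not earlier asynchronous work.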
    for (int i = 0 ; i < TEST_ITERATIONS ; i ++){
        cudaDeviceSynchronize();
        sdkStartTimer(&timer);

        SI<T>(blocks, threads, d_ptclA, d_ptclA_new, d_wghtA, size, stateDim);

        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timer);
    }
    elapsedTimeInMs = sdkGetAverageTimerValue(&timer);
    printf("%f\t", elapsedTimeInMs);
    printf("size=%u, stateDim=%d, blocks=%d, threads=%d\n",size, stateDim, blocks, threads);

    sdkDeleteTimer(&timer);
}
Example #6
        ~NBodyDemo()
        {
            if (m_nbodyCpu)
            {
                delete m_nbodyCpu;
            }

            if (m_nbodyCuda)
            {
                delete m_nbodyCuda;
            }

            if (m_hPos)
            {
                delete [] m_hPos;
            }

            if (m_hVel)
            {
                delete [] m_hVel;
            }

            if (m_hColor)
            {
                delete [] m_hColor;
            }

            sdkDeleteTimer(&demoTimer);

            if (!benchmark && !compareToCPU)
                delete m_renderer;
        }
Example #7
void cleanup()
{
    sdkDeleteTimer(&timer);

    checkCudaErrors(cudaFree(d_img));
    checkCudaErrors(cudaFree(d_temp));

    if (!runBenchmark)
    {
        if (pbo)
        {
            checkCudaErrors(cudaGLUnregisterBufferObject(pbo));
            glDeleteBuffersARB(1, &pbo);
        }

        if (texid)
        {
            glDeleteTextures(1, &texid);
        }
    }
	
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
}
Example #8
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_vbo_resource);

    unbindTexture();
    deleteTexture();

    // Free all host and device resources
    free(hvfield);
    free(particles);
    cudaFree(dvfield);
    cudaFree(vxfield);
    cudaFree(vyfield);
    cufftDestroy(planr2c);
    cufftDestroy(planc2r);

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDeleteBuffersARB(1, &vbo);

    sdkDeleteTimer(&timer);

    if (g_bExitESC)
    {
        checkCudaErrors(cudaDeviceReset());
    }
}
Example #9
void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&animationTimer);

    Volume_deinit(&volumeOriginal);
    Volume_deinit(&volumeFilter0);
    Volume_deinit(&volumeFilter1);
    VolumeRender_deinit();

    if (pbo)
    {
        cudaGraphicsUnregisterResource(cuda_pbo_resource);
        glDeleteBuffersARB(1, &pbo);
        glDeleteTextures(1, &volumeTex);
    }
}
Example #10
void cleanup()
{
    sdkDeleteTimer(&timer);

    if (vbo)
    {
        deleteVBO(&vbo, cuda_vbo_resource);
    }
}
Example #11
void cleanup()
{
    sdkDeleteTimer(&timer);

    if (psystem)
    {
        delete psystem;
    }
    return;
}
Example #12
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
    glDeleteBuffers(1, &pbo_buffer);
    glDeleteTextures(1, &texid);
    deleteTexture();

    sdkDeleteTimer(&timer);
}
Example #13
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    // parse arguments
    char *filename;

    printf("Starting bicubicTexture\n");

    if (checkCmdLineFlag(argc, (const char **) argv, "help"))
    {
        printHelp();
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "mode"))
    {
        g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode");

        if (g_FilterMode < MODE_NEAREST || g_FilterMode > MODE_CATMULL_ROM)
        {
            printf("Invalid Mode setting %d\n", g_FilterMode);
            exit(EXIT_FAILURE);
        }
    }

    if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename))
    {
        dumpFilename = filename;
        fpsLimit     = frameCheckNumber;

        // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode);
    }
    else
    {
        // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization
        initialize(argc, argv);
        glutMainLoop();
        sdkDeleteTimer(&timer);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    exit(EXIT_SUCCESS);
}
Example #14
void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&kernel_timer);

    if (h_img)
    {
        free(h_img);
        h_img=NULL;
    }

    if (d_img)
    {
        cudaFree(d_img);
        d_img=NULL;
    }

    if (d_temp)
    {
        cudaFree(d_temp);
        d_temp=NULL;
    }

    // Refer to boxFilter_kernel.cu for implementation
    freeTextures();

    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glDeleteBuffersARB(1, &pbo);
    glDeleteTextures(1, &texid);
    glDeleteProgramsARB(1, &shader);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
}
Example #15
void run_cpu_color_test(PPM_IMG img_in)
{
    StopWatchInterface *timer=NULL;
    printf("Starting CPU processing...\n");

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_yuv_cpu = rgb2yuv(img_in); //Start RGB 2 YUV
    sdkStopTimer(&timer);
    printf("RGB to YUV conversion time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_rgb_cpu = yuv2rgb(img_obuf_yuv_cpu); //Start YUV 2 RGB
    sdkStopTimer(&timer);
    printf("YUV to RGB conversion time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);    

    write_yuv(img_obuf_yuv_cpu, "out_yuv.yuv");
    write_ppm(img_obuf_rgb_cpu, "out_rgb.ppm");
}
Example #16
void run_gpu_color_test(PPM_IMG img_in)
{
    StopWatchInterface *timer=NULL;
    launchEmptyKernel();    // launch an empty kernel (warm-up so CUDA initialization is not included in the timings)
    printf("Starting GPU processing...\n");

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_yuv_gpu = rgb2yuvGPU(img_in); //Start RGB 2 YUV
    sdkStopTimer(&timer);
    printf("RGB to YUV conversion time(GPU): %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_rgb_gpu = yuv2rgbGPU(img_obuf_yuv_gpu); //Start YUV 2 RGB
    sdkStopTimer(&timer);
    printf("YUV to RGB conversion time(GPU): %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);    

    write_ppm(img_obuf_rgb_gpu, "out_rgb.ppm");
    write_yuv(img_obuf_yuv_gpu, "out_yuv.yuv");
}
Example #17
void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&animationTimer);

    Volume_deinit(&volumeOriginal);
    Volume_deinit(&volumeFilter0);
    Volume_deinit(&volumeFilter1);
    VolumeRender_deinit();

    if (pbo)
    {
        cudaGraphicsUnregisterResource(cuda_pbo_resource);
        glDeleteBuffersARB(1, &pbo);
        glDeleteTextures(1, &volumeTex);
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
}
Example #18
void cleanup()
{
    sdkDeleteTimer(&timer);

    if (psystem)
    {
        delete psystem;
    }
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    return;
}
Example #19
void cleanup()
{
    free(h_Src);
    checkCudaErrors(CUDA_FreeArray());
    checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));

    glDeleteProgramsARB(1, &shader);

    sdkDeleteTimer(&timer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
}
Example #20
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
    glDeleteBuffers(1, &pbo_buffer);
    glDeleteTextures(1, &texid);
    deleteTexture();

    sdkDeleteTimer(&timer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
}
Example #21
void cleanup()
{
    sdkDeleteTimer(&timer);

    // add extra check to unmap the resource before unregistering it
    if (g_GraphicsMapFlag)
    {
        cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
        g_GraphicsMapFlag--;
    }

    // unregister this buffer object from CUDA C
    cudaGraphicsUnregisterResource(cuda_pbo_resource);
    glDeleteBuffersARB(1, &pbo);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
}
Example #22
void cleanup()
{
    freeTexture();
    checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));

    glDeleteBuffersARB(1, &pbo);

#if USE_BUFFER_TEX
    glDeleteTextures(1, &bufferTex);
    glDeleteProgramsARB(1, &fprog);
#else
    glDeleteTextures(1, &displayTex);
#endif

    sdkDeleteTimer(&timer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
}
Example #23
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_vbo_resource);

    unbindTexture();
    deleteTexture();

    // Free all host and device resources
    free(hvfield);
    free(particles);
#ifdef BROADCAST
	free(packets);
#endif
    cudaFree(dvfield);
    cudaFree(vxfield);
    cudaFree(vyfield);
    cufftDestroy(planr2c);
    cufftDestroy(planc2r);

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDeleteBuffersARB(1, &vbo);

    sdkDeleteTimer(&timer);
}
Example #24
int main(int argc, char **argv)
{
    printf("%s Starting...\n\n", argv[0]);

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    uint *d_Input, *d_Output;
    uint *h_Input, *h_OutputCPU, *h_OutputGPU;
    StopWatchInterface  *hTimer = NULL;
    const uint N = 13 * 1048576 / 2;

    printf("Allocating and initializing host arrays...\n");
    sdkCreateTimer(&hTimer);
    h_Input     = (uint *)malloc(N * sizeof(uint));
    h_OutputCPU = (uint *)malloc(N * sizeof(uint));
    h_OutputGPU = (uint *)malloc(N * sizeof(uint));
    srand(2009);

    for (uint i = 0; i < N; i++)
    {
        h_Input[i] = rand();
    }

    printf("Allocating and initializing CUDA arrays...\n");
    checkCudaErrors(cudaMalloc((void **)&d_Input, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing CUDA-C scan...\n\n");
    initScan();

    int globalFlag = 1;
    size_t szWorkgroup;
    const int iCycles = 100;
    printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);

    for (uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1)
    {
        printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
        checkCudaErrors(cudaDeviceSynchronize());
        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);

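        // Launch iCycles identical scans back to back; the per-launch time is
        // obtained below by dividing the elapsed time by iCycles.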
        for (int i = 0; i < iCycles; i++)
        {
            szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength);
        }

        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&hTimer);
        double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles;

        printf("Validating the results...\n");
        printf("...reading back GPU results\n");
        checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost));

        printf(" ...scanExclusiveHost()\n");
        scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

        // Compare GPU results with CPU results and accumulate error for this test
        printf(" ...comparing the results\n");
        int localFlag = 1;

        for (uint i = 0; i < N; i++)
        {
            if (h_OutputCPU[i] != h_OutputGPU[i])
            {
                localFlag = 0;
                break;
            }
        }

        // Log message on individual test result, then accumulate to global flag
        printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
        globalFlag = globalFlag && localFlag;

        // Data log
        if (arrayLength == MAX_SHORT_ARRAY_SIZE)
        {
            printf("\n");
            printf("scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                   (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup);
            printf("\n");
        }
    }

    printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles);

    for (uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1)
    {
        printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
        checkCudaErrors(cudaDeviceSynchronize());
        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);

        for (int i = 0; i < iCycles; i++)
        {
            szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength);
        }

        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&hTimer);
        double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles;

        printf("Validating the results...\n");
        printf("...reading back GPU results\n");
        checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost));

        printf("...scanExclusiveHost()\n");
        scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

        // Compare GPU results with CPU results and accumulate error for this test
        printf(" ...comparing the results\n");
        int localFlag = 1;

        for (uint i = 0; i < N; i++)
        {
            if (h_OutputCPU[i] != h_OutputGPU[i])
            {
                localFlag = 0;
                break;
            }
        }

        // Log message on individual test result, then accumulate to global flag
        printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
        globalFlag = globalFlag && localFlag;

        // Data log
        if (arrayLength == MAX_LARGE_ARRAY_SIZE)
        {
            printf("\n");
            printf("scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                   (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup);
            printf("\n");
        }
    }


    printf("Shutting down...\n");
    closeScan();
    checkCudaErrors(cudaFree(d_Output));
    checkCudaErrors(cudaFree(d_Input));

    sdkDeleteTimer(&hTimer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    // pass or fail (cumulative... all tests in the loop)
    exit(globalFlag ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #25
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    StopWatchInterface *hTimer = NULL;

    const uint   N = 4 * 1048576;
    const uint DIR = 1;
    const uint numValues = 65536;
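    // Keys are drawn from [0, numValues); DIR selects the sort direction passed
    // to mergeSort() and the validators.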

    printf("%s Starting...\n\n", argv[0]);

    int dev = findCudaDevice(argc, (const char **) argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_SrcKey = (uint *)malloc(N * sizeof(uint));
    h_SrcVal = (uint *)malloc(N * sizeof(uint));
    h_DstKey = (uint *)malloc(N * sizeof(uint));
    h_DstVal = (uint *)malloc(N * sizeof(uint));

    srand(2009);

    for (uint i = 0; i < N; i++)
    {
        h_SrcKey[i] = rand() % numValues;
    }

    fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
    checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing GPU merge sort...\n");
    initMergeSort();

    printf("Running GPU merge sort...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    mergeSort(
        d_DstKey,
        d_DstVal,
        d_BufKey,
        d_BufVal,
        d_SrcKey,
        d_SrcVal,
        N,
        DIR
    );
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

    printf("Inspecting the results...\n");
    uint keysFlag = validateSortedKeys(
                        h_DstKey,
                        h_SrcKey,
                        1,
                        N,
                        numValues,
                        DIR
                    );

    uint valuesFlag = validateSortedValues(
                          h_DstKey,
                          h_DstVal,
                          h_SrcKey,
                          1,
                          N
                      );

    printf("Shutting down...\n");
    closeMergeSort();
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_SrcVal));
    checkCudaErrors(cudaFree(d_SrcKey));
    checkCudaErrors(cudaFree(d_BufVal));
    checkCudaErrors(cudaFree(d_BufKey));
    checkCudaErrors(cudaFree(d_DstVal));
    checkCudaErrors(cudaFree(d_DstKey));
    free(h_DstVal);
    free(h_DstKey);
    free(h_SrcVal);
    free(h_SrcKey);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #26
/*
void initCellSystem(uint3 gridSize)
{
    csystem = new CellSystem(gridSize);
    //psystem->reset(ParticleSystem::CONFIG_GRID);
}
*/
void cleanup()
{
    sdkDeleteTimer(&timer);
}
Example #27
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    float
    *h_Kernel,
    *h_Input,
    *h_Buffer,
    *h_OutputCPU,
    *h_OutputGPU;

    cudaArray
    *a_Src;

    cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<float>();

    float
    *d_Output;

    float
    gpuTime;

    StopWatchInterface *hTimer = NULL;

    const int imageW = 3072;
    const int imageH = 3072 / 2;
    const unsigned int iterations = 10;

    printf("[%s] - Starting...\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    h_Kernel    = (float *)malloc(KERNEL_LENGTH * sizeof(float));
    h_Input     = (float *)malloc(imageW * imageH * sizeof(float));
    h_Buffer    = (float *)malloc(imageW * imageH * sizeof(float));
    h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float));
    h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float));
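    // The source image lives in a cudaArray so the row/column kernels can read it
    // through a texture; d_Output is a plain linear buffer for the results.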
    checkCudaErrors(cudaMallocArray(&a_Src, &floatTex, imageW, imageH));
    checkCudaErrors(cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)));

    srand(2009);

    for (unsigned int i = 0; i < KERNEL_LENGTH; i++)
    {
        h_Kernel[i] = (float)(rand() % 16);
    }

    for (unsigned int i = 0; i < imageW * imageH; i++)
    {
        h_Input[i] = (float)(rand() % 16);
    }

    setConvolutionKernel(h_Kernel);
    checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice));


    printf("Running GPU rows convolution (%u identical iterations)...\n", iterations);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (unsigned int i = 0; i < iterations; i++)
    {
        convolutionRowsGPU(
            d_Output,
            a_Src,
            imageW,
            imageH
        );
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations;
    printf("Average convolutionRowsGPU() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    // CUDA kernels cannot write to a texture-bound cudaArray directly, so this device-to-device copy back into the array is unavoidable
    printf("Copying convolutionRowGPU() output back to the texture...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer);
    printf("cudaMemcpyToArray() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    printf("Running GPU columns convolution (%i iterations)\n", iterations);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (unsigned int i = 0; i < iterations; i++)
    {
        convolutionColumnsGPU(
            d_Output,
            a_Src,
            imageW,
            imageH
        );
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations;
    printf("Average convolutionColumnsGPU() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    printf("Reading back GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Checking the results...\n");
    printf("...running convolutionRowsCPU()\n");
    convolutionRowsCPU(
        h_Buffer,
        h_Input,
        h_Kernel,
        imageW,
        imageH,
        KERNEL_RADIUS
    );

    printf("...running convolutionColumnsCPU()\n");
    convolutionColumnsCPU(
        h_OutputCPU,
        h_Buffer,
        h_Kernel,
        imageW,
        imageH,
        KERNEL_RADIUS
    );

    double delta = 0;
    double sum = 0;

    for (unsigned int i = 0; i < imageW * imageH; i++)
    {
        sum += h_OutputCPU[i] * h_OutputCPU[i];
        delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]);
    }

    double L2norm = sqrt(delta / sum);
    printf("Relative L2 norm: %E\n", L2norm);
    printf("Shutting down...\n");

    checkCudaErrors(cudaFree(d_Output));
    checkCudaErrors(cudaFreeArray(a_Src));
    free(h_OutputGPU);
    free(h_Buffer);
    free(h_Input);
    free(h_Kernel);

    sdkDeleteTimer(&hTimer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (L2norm > 1e-6)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}
template <class T>
bool
runTest(int argc, char **argv, ReduceType datatype)
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    if (checkCmdLineFlag(argc, (const char **) argv, "n"))
    {
        size = getCmdLineArgumentInt(argc, (const char **) argv, "n");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
    {
        maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "kernel"))
    {
        whichKernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
    {
        maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks");
    }

    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);

    cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");

    if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
    {
        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
    }

    bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo");

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);

        T *h_idata = (T *) malloc(bytes);

        for (int i=0; i<size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
            {
                h_idata[i] = (T)(rand() & 0xFF);
            }
            else
            {
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
            }
        }

        int numBlocks = 0;
        int numThreads = 0;
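        // getNumBlocksAndThreads() derives the launch configuration for the selected
        // kernel from the element count and the block/grid limits given above.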
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

        if (numBlocks == 1)
        {
            cpuFinalThreshold = 1;
        }

        // allocate mem for the result on host side
        T *h_odata = (T *) malloc(numBlocks*sizeof(T));

        printf("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T *d_idata = NULL;
        T *d_odata = NULL;

        checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
        checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T)));

        // copy data directly to device memory
        checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice));

        // warm-up
        reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        int testIterations = 100;

        StopWatchInterface *timer = 0;
        sdkCreateTimer(&timer);

        T gpu_result = 0;

        gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction,
                                        cpuFinalThreshold, timer,
                                        h_odata, d_idata, d_odata);

        double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
        printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = reduceCPU<T>(h_idata, size);

        int precision = 0;
        double threshold = 0;
        double diff = 0;

        if (datatype == REDUCE_INT)
        {
            printf("\nGPU result = %d\n", (int)gpu_result);
            printf("CPU result = %d\n\n", (int)cpu_result);
        }
        else
        {
            if (datatype == REDUCE_FLOAT)
            {
                precision = 8;
                threshold = 1e-8 * size;
            }
            else
            {
                precision = 12;
                threshold = 1e-12 * size;
            }

            printf("\nGPU result = %.*f\n", precision, (double)gpu_result);
            printf("CPU result = %.*f\n\n", precision, (double)cpu_result);

            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        sdkDeleteTimer(&timer);
        free(h_idata);
        free(h_odata);

        checkCudaErrors(cudaFree(d_idata));
        checkCudaErrors(cudaFree(d_odata));

        if (datatype == REDUCE_INT)
        {
            return (gpu_result == cpu_result);
        }
        else
        {
            return (diff < threshold);
        }
    }

    return true;
}
template <class T>
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);

    T *h_idata = (T *) malloc(bytes);

    for (int i = 0; i < maxN; i++)
    {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
        {
            h_idata[i] = (T)(rand() & 0xFF);
        }
        else
        {
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }
    }

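    // Cap the result buffer at MAX_BLOCK_DIM_SIZE blocks; configurations needing
    // more blocks are skipped (reported as -1) in the timing loop below.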
    int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T *h_odata = (T *) malloc(maxNumBlocks*sizeof(T));

    // allocate device memory and data
    T *d_idata = NULL;
    T *d_odata = NULL;

    checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
    checkCudaErrors(cudaMalloc((void **) &d_odata, maxNumBlocks*sizeof(T)));

    // copy data directly to device memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice));

    // warm-up
    for (int kernel = 0; kernel < 7; kernel++)
    {
        reduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata);
    }

    int testIterations = 100;

    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);

    // print headers
    printf("Time in milliseconds for various numbers of elements for each kernel\n\n\n");
    printf("Kernel");

    for (int i = minN; i <= maxN; i *= 2)
    {
        printf(", %d", i);
    }

    for (int kernel = 0; kernel < 7; kernel++)
    {
        printf("\n%d", kernel);

        for (int i = minN; i <= maxN; i *= 2)
        {
            sdkResetTimer(&timer);
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);

            float reduceTime;

            if (numBlocks <= MAX_BLOCK_DIM_SIZE)
            {
                benchmarkReduce(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
                                testIterations, false, 1, timer, h_odata, d_idata, d_odata);
                reduceTime =  sdkGetAverageTimerValue(&timer);
            }
            else
            {
                reduceTime = -1.0;
            }

            printf(", %.5f", reduceTime);
        }
    }

    // cleanup
    sdkDeleteTimer(&timer);
    free(h_idata);
    free(h_odata);

    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
}
int main(int argc, char **argv)
{
    // Start logs
    printf("[%s] - Starting...\n", argv[0]);

    //'h_' prefix - CPU (host) memory space
    float
    //Results calculated by CPU for reference
    *h_CallResultCPU,
    *h_PutResultCPU,
    //CPU copy of GPU results
    *h_CallResultGPU,
    *h_PutResultGPU,
    //CPU instance of input data
    *h_StockPrice,
    *h_OptionStrike,
    *h_OptionYears;

    //'d_' prefix - GPU (device) memory space
    CUdeviceptr
    //Results calculated by GPU
    d_CallResult,
    d_PutResult,

    //GPU instance of input data
    d_StockPrice,
    d_OptionStrike,
    d_OptionYears;

    double
    delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;

    StopWatchInterface *hTimer = NULL;
    int i;

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    printf("...allocating CPU memory for options.\n");

    h_CallResultCPU = (float *)malloc(OPT_SZ);
    h_PutResultCPU  = (float *)malloc(OPT_SZ);
    h_CallResultGPU = (float *)malloc(OPT_SZ);
    h_PutResultGPU  = (float *)malloc(OPT_SZ);
    h_StockPrice    = (float *)malloc(OPT_SZ);
    h_OptionStrike  = (float *)malloc(OPT_SZ);
    h_OptionYears   = (float *)malloc(OPT_SZ);


    char *ptx, *kernel_file;
    size_t ptxSize;
    kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]);

    // Set a Compiler Option to have maximum register to be used by each thread.
    char *compile_options[1];
    compile_options[0] = (char *) malloc(sizeof(char)*(strlen("--maxrregcount=16") + 1));  // +1 for the terminating NUL copied by strcpy
    strcpy((char *)compile_options[0],"--maxrregcount=16");

    // Compile the kernel BlackScholes_kernel.
    compileFileToPTX(kernel_file, 1, (const char **)compile_options, &ptx, &ptxSize);
    CUmodule module = loadPTX(ptx, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU"));

    printf("...allocating GPU memory for options.\n");
    checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_OptionStrike,OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ));

    printf("...generating input data in CPU mem.\n");
    srand(5347);

    //Generate options set
    for (i = 0; i < OPT_N; i++)
    {
        h_CallResultCPU[i] = 0.0f;
        h_PutResultCPU[i]  = -1.0f;
        h_StockPrice[i]    = RandFloat(5.0f, 30.0f);
        h_OptionStrike[i]  = RandFloat(1.0f, 100.0f);
        h_OptionYears[i]   = RandFloat(0.25f, 10.0f);
    }

    printf("...copying input data to GPU mem.\n");
    //Copy options data to GPU memory for further processing
    checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ));
    checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ));
    checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ));

    printf("Data init done.\n\n");
    printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS);

    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

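    // DIV_UP rounds up, so DIV_UP(OPT_N/2, 128) blocks of 128 threads cover all
    // OPT_N/2 work items processed by the kernel.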
    dim3 cudaBlockSize( 128, 1, 1);
    dim3 cudaGridSize(DIV_UP(OPT_N/2, 128),1,1);

    float risk = RISKFREE;
    float volatility = VOLATILITY;
    int optval = OPT_N;

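    // The driver API takes kernel arguments as an array of pointers, one per
    // parameter in the kernel's declaration order.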
    void *arr[] = { (void *)&d_CallResult, (void *)&d_PutResult, (void *)&d_StockPrice,
        (void *)&d_OptionStrike, (void *)&d_OptionYears, (void *)&risk, (void *)&volatility, (void *)&optval };

    for (i = 0; i < NUM_ITERATIONS; i++)
    {

        checkCudaErrors(cuLaunchKernel(kernel_addr,
                                            cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */
                                            cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
                                            0,0, /* shared mem, stream */
                                            &arr[0], /* arguments */
                                            0));

    }

    checkCudaErrors(cuCtxSynchronize());

    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;

    //Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("BlackScholesGPU() time    : %f msec\n", gpuTime);
    printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
    printf("Gigaoptions per second    : %f     \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));
    printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n",
           (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128);

    printf("\nReading back GPU results...\n");

    //Read back GPU results to compare them to CPU results
    checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ));
    checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ));

    printf("Checking the results...\n");
    printf("...running CPU calculations.\n\n");

    //Calculate options values on CPU
    BlackScholesCPU(
        h_CallResultCPU,
        h_PutResultCPU,
        h_StockPrice,
        h_OptionStrike,
        h_OptionYears,
        RISKFREE,
        VOLATILITY,
        OPT_N
    );

    printf("Comparing the results...\n");
    //Calculate max absolute difference and L1 distance
    //between CPU and GPU results
    sum_delta = 0;
    sum_ref   = 0;
    max_delta = 0;

    for (i = 0; i < OPT_N; i++)
    {
        ref   = h_CallResultCPU[i];
        delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);

        if (delta > max_delta)
        {
            max_delta = delta;
        }

        sum_delta += delta;
        sum_ref   += fabs(ref);
    }

    L1norm = sum_delta / sum_ref;
    printf("L1 norm: %E\n", L1norm);
    printf("Max absolute error: %E\n\n", max_delta);

    printf("Shutting down...\n");
    printf("...releasing GPU memory.\n");

    checkCudaErrors(cuMemFree(d_OptionYears));
    checkCudaErrors(cuMemFree(d_OptionStrike));
    checkCudaErrors(cuMemFree(d_StockPrice));
    checkCudaErrors(cuMemFree(d_PutResult));
    checkCudaErrors(cuMemFree(d_CallResult));

    printf("...releasing CPU memory.\n");

    free(h_OptionYears);
    free(h_OptionStrike);
    free(h_StockPrice);
    free(h_PutResultGPU);
    free(h_CallResultGPU);
    free(h_PutResultCPU);
    free(h_CallResultCPU);

    sdkDeleteTimer(&hTimer);
    printf("Shutdown done.\n");

    printf("\n[%s] - Test Summary\n", argv[0]);

    cuProfilerStop();

    if (L1norm > 1e-6)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}