Beispiel #1
0
void cleanup()
{
    cutilCheckError( cutDeleteTimer( timer));
    if(h_img)cutFree(h_img);
    cutilSafeCall(cudaFree(d_img));
    cutilSafeCall(cudaFree(d_temp));

    // Refer to boxFilter_kernel.cu for implementation
    freeTextures();

    //DEPRECATED: cutilSafeCall(cudaGLUnregisterBufferObject(pbo));
    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glDeleteBuffersARB(1, &pbo);
    glDeleteTextures(1, &texid);
    glDeleteProgramsARB(1, &shader);

    if (g_CheckRender) {
        delete g_CheckRender;
        g_CheckRender = NULL;
    }
    if (g_FrameBufferObject) {
        delete g_FrameBufferObject;
        g_FrameBufferObject = NULL;
    }
}
void cleanup()
{
    if (g_bQAReadback) {
        cudaFree(d_pos);
        cudaFree(d_normal);
    } else {
        cutilCheckError( cutDeleteTimer( timer ));

        deleteVBO(&posVbo,    &cuda_posvbo_resource);
        deleteVBO(&normalVbo, &cuda_normalvbo_resource);
    }

    cudppDestroyPlan(scanplan);

    cutilSafeCall(cudaFree(d_edgeTable));
    cutilSafeCall(cudaFree(d_triTable));
    cutilSafeCall(cudaFree(d_numVertsTable));

    cutilSafeCall(cudaFree(d_voxelVerts));
    cutilSafeCall(cudaFree(d_voxelVertsScan));
    cutilSafeCall(cudaFree(d_voxelOccupied));
    cutilSafeCall(cudaFree(d_voxelOccupiedScan));
    cutilSafeCall(cudaFree(d_compVoxelArray));

    if (d_volume) cutilSafeCall(cudaFree(d_volume));

    if (g_CheckRender) {
        delete g_CheckRender; g_CheckRender = NULL;
    }
    if (g_FrameBufferObject) {
        delete g_FrameBufferObject; g_FrameBufferObject = NULL;
    }
}
Beispiel #3
0
void cleanup()
{
    cutilCheckError( cutDeleteTimer( timer));

    deleteVBO(&vbo);

    if (g_CheckRender) {
        delete g_CheckRender; g_CheckRender = NULL;
    }
}
Beispiel #4
0
void cleanup()
{
	cutilCheckError(cutStopTimer(timer));
	cutilCheckError(cutDeleteTimer( timer));

	cudaFree(a_d);cudaFree(b_d);cudaFree(r_d);

	cudaThreadExit();
      
}
void cleanup()
{
    cutilCheckError( cutDeleteTimer( hTimer));

    glDeleteProgramsARB(1, &shader);

    if (g_CheckRender) {
        delete g_CheckRender; g_CheckRender = NULL;
    }
    if (g_FrameBufferObject) {
        delete g_FrameBufferObject; g_FrameBufferObject = NULL;
    }
}
void cleanup()
{
    if (psystem)   delete psystem;
    if (renderer)  delete renderer;
    if (floorProg) delete floorProg;
    cutilCheckError(cutDeleteTimer(timer));
    if (params)    delete params;
    if (floorTex)  glDeleteTextures(1, &floorTex);

    if (g_CheckRender) {
        delete g_CheckRender; g_CheckRender = NULL;
    }
}
void cleanup(void) {
	cutilSafeCall(cudaGraphicsUnregisterResource(cuda_pbo_resource));

    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
    glDeleteBuffers(1, &pbo_buffer);
    glDeleteTextures(1, &texid);
    deleteTexture();

    if (g_CheckRender) {
        delete g_CheckRender; g_CheckRender = NULL;
    }

    cutilCheckError(cutDeleteTimer(timer));  
}
void cleanup()
{
    cutilCheckError( cutDeleteTimer( timer));

	freeCudaBuffers();

    if (pbo) {
	    cutilSafeCall(cudaGraphicsUnregisterResource(cuda_pbo_resource));
        glDeleteBuffersARB(1, &pbo);
        glDeleteTextures(1, &tex);
    }

    if (g_CheckRender) {
        delete g_CheckRender; g_CheckRender = NULL;
    }
}
Beispiel #9
0
void cleanup()
{
	free(a_h);free(b_h);free(r_h);
	free(control);

	cutilCheckError(cutStopTimer(timer));
	cutilCheckError(cutDeleteTimer( timer));

	cudaFree(a_d);cudaFree(b_d);cudaFree(r_d);

	
	cutilSafeCall(release());
	checkCUDAError("release");
	cudaThreadExit();
      
}
Beispiel #10
0
    void _runBenchmark(int iterations)
    {
        // once without timing to prime the device
        if (!useCpu)
            m_nbody->update(activeParams.m_timestep);

        if (useCpu)
        {
            cutCreateTimer(&timer);
            cutStartTimer(timer);
        }
        else
        {
            cutilSafeCall(cudaEventRecord(startEvent, 0));
        }

        for (int i = 0; i < iterations; ++i)
        {
            m_nbody->update(activeParams.m_timestep);
        }

        float milliseconds = 0;

        if (useCpu)
        {
            cutStopTimer(timer);
            milliseconds = cutGetTimerValue(timer);
            cutDeleteTimer(timer);
        }
        else
        {
            cutilSafeCall(cudaEventRecord(stopEvent, 0));  
            cutilSafeCall(cudaEventSynchronize(stopEvent));
            cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));
        }
        
        double interactionsPerSecond = 0;
        double gflops = 0;
        computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations);

        shrLog("%d bodies, total time for %d iterations: %.3f ms\n", 
            numBodies, iterations, milliseconds);
        shrLog("= %.3f billion interactions per second\n", interactionsPerSecond);
        shrLog("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, 
               (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction);   
    }
Beispiel #11
0
    ~NBodyDemo() 
    {
        if (m_nbodyCpu)
            delete m_nbodyCpu;
        if (m_nbodyCuda)
            delete m_nbodyCuda;

        if (m_hPos)
            delete [] m_hPos;
        if (m_hVel)
            delete [] m_hVel;
        if (m_hColor)
            delete [] m_hColor;

        cutilSafeCall(cudaEventDestroy(startEvent));
        cutilSafeCall(cudaEventDestroy(stopEvent));
        cutilSafeCall(cudaEventDestroy(hostMemSyncEvent));
        cutilCheckError(cutDeleteTimer(demoTimer));

        delete m_renderer;
    }
void cleanup()
{
    cutilCheckError( cutDeleteTimer( timer));
    if (!h_img) {
	    free(h_img);
	}

    cutilSafeCall(cudaFree(d_img));
    cutilSafeCall(cudaFree(d_temp));

    if (!runBenchmark) {
        if (pbo) {
            cutilSafeCall(cudaGLUnregisterBufferObject(pbo));    
            glDeleteBuffersARB(1, &pbo);
        }
        if (texid) {
            glDeleteTextures(1, &texid);
        }
    }

    if (g_CheckRender) {
        delete g_CheckRender; g_CheckRender = NULL;
    }
}
Beispiel #13
0
int main(int, char* argv[])
{
    // In assignment 2.1, we will fix filter_radius as 1,
    // and therefore it's filter size is 3x3.
    const unsigned int filter_radius = 1;

    unsigned int width, height;
    unsigned char* h_img = NULL;
    unsigned char* your_result = NULL;
    unsigned char* gold_result = NULL;

    // This is the filename of the image file in PGM format
    // and the file should be put under the subdirectory, "data".
    const char* image_filename = "test.pgm";
    
    loadImage(&h_img, &width, &height, image_filename, argv[0]);

    if (width <= 2*filter_radius || height <= 2*filter_radius) {
        fprintf(stderr, "Filter radius is too large.\n");
        exit(-1);
    }

    your_result = (unsigned char*) malloc(width*height*sizeof(unsigned char));
    gold_result = (unsigned char*) malloc(width*height*sizeof(unsigned char));


    // Run your median filter
    {
        unsigned int timer = 0;
        cutilCheckError(cutCreateTimer(&timer));
        cutilCheckError(cutStartTimer(timer));
        
        // You should implemnt medianFilter() in medianFilter_kernel.cu
        medianFilter(h_img, your_result, width, height, filter_radius);

        cutilCheckError(cutStopTimer(timer));
        printf("[Yours] Processing time: %f (ms) \n", cutGetTimerValue(timer));
        cutilCheckError(cutDeleteTimer(timer));
    }

    // Run the reference median filter
    {
        unsigned int timer = 0;
        cutilCheckError(cutCreateTimer(&timer));
        cutilCheckError(cutStartTimer(timer));
        
        medianFilter_gold(h_img, gold_result, width, height, filter_radius);

        cutilCheckError(cutStopTimer(timer));
        printf("[Gold]  Processing time: %f (ms) \n", cutGetTimerValue(timer));
        cutilCheckError(cutDeleteTimer(timer));
    }

    // You can use saveImage() to save the result.
    // Under Windows, you can use IrfanView (http://www.irfanview.com/) to view PGM files.
    // Or you can convert PGM into JPEG, PNG, and etc. 
    
    // saveImage(your_result, width, height, "output.pgm");
    // saveImage(gold_result, width, height, "gold.pgm");

    // Compare your result and the reference result.
    {
        if (test(your_result, gold_result, width, height, filter_radius)) {
            printf("PASSED\n");
        } else {
            printf("FAILED\n");
        }
    }

    free(gold_result);
    free(your_result);
    return 0;
}
Beispiel #14
0
int main(int argc, char **argv)
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint hTimer;
    int PassFailFlag = 1;
    uint byteCount = 64 * 1048576;
    uint uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

	shrQAStart(argc, argv);

	// set logfile name and start logs
    shrSetLogFileName ("histogram.txt");

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
           printf("No CUDA Capable Devices found, exiting...\n");
           shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );
        cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
    }
    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );

	printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", 
		deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

	int version = deviceProp.major * 0x10 + deviceProp.minor;

	if(version < 0x11) 
    {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
		shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional Command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult))
    {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

    shrLog("Initializing data...\n");
        shrLog("...allocating CPU memory.\n");
            h_Data         = (uchar *)malloc(byteCount);
            h_HistogramCPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
            h_HistogramGPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

        shrLog("...generating input data\n");
            srand(2009);
            for(uint i = 0; i < byteCount; i++) 
                h_Data[i] = rand() % 256;

        shrLog("...allocating GPU memory and copying input data\n\n");
            cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount  ) );
            cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)  ) );
            cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );

    {
        shrLog("Starting up 64-bin histogram...\n\n");
            initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram64(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); 

        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram64CPU()\n");
               histogram64CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results...\n");
                for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 64-bin histogram...\n\n\n");
            closeHistogram64();
    }

    {
        shrLog("Initializing 256-bin histogram...\n");
            initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram256(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); 
                
        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram256CPU()\n");
                histogram256CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results\n");
                for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 256-bin histogram...\n\n\n");
            closeHistogram256();
    }

    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        cutilSafeCall( cudaFree(d_Histogram) );
        cutilSafeCall( cudaFree(d_Data) );
        free(h_HistogramGPU);
        free(h_HistogramCPU);
        free(h_Data);

    cutilDeviceReset();
	shrLog("%s - Test Summary\n", sSDKsample);
    // pass or fail (for both 64 bit and 256 bit histograms)
    shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED));
}
Beispiel #15
0
int main(int argc, char **argv)
{
	GpuProfiling::initProf();
    // Start logs
    shrSetLogFileName ("scan.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    uint *d_Input, *d_Output;
    uint *h_Input, *h_OutputCPU, *h_OutputGPU;
    uint hTimer;
    const uint N = 13 * 1048576 / 2;

    shrLog("Allocating and initializing host arrays...\n");
        cutCreateTimer(&hTimer);
        h_Input     = (uint *)malloc(N * sizeof(uint));
        h_OutputCPU = (uint *)malloc(N * sizeof(uint));
        h_OutputGPU = (uint *)malloc(N * sizeof(uint));
        srand(2009);
        for(uint i = 0; i < N; i++)
            h_Input[i] = rand();

    shrLog("Allocating and initializing CUDA arrays...\n");
        cutilSafeCall( cudaMalloc((void **)&d_Input, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_Output, N * sizeof(uint)) );
        cutilSafeCall( cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice) );

    shrLog("Initializing CUDA-C scan...\n\n");
        initScan();

    int globalFlag = 1;
    size_t szWorkgroup;
    const int iCycles = 100;
    shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);
        for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){
            shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
                cutilSafeCall( cudaThreadSynchronize() );
                cutResetTimer(hTimer);
                cutStartTimer(hTimer);
                for(int i = 0; i < iCycles; i++)
                {
                    szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength);
                }
                cutilSafeCall( cudaThreadSynchronize());
                cutStopTimer(hTimer);
                double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles;

            shrLog("Validating the results...\n");
                shrLog("...reading back GPU results\n");
                    cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) );

                shrLog(" ...scanExclusiveHost()\n");
                    scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

                // Compare GPU results with CPU results and accumulate error for this test
                shrLog(" ...comparing the results\n");
                    int localFlag = 1;
                    for(uint i = 0; i < N; i++)
                    {
                        if(h_OutputCPU[i] != h_OutputGPU[i])
                        {
                            localFlag = 0;
                            break;
                        }
                    }

                // Log message on individual test result, then accumulate to global flag
                shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
                globalFlag = globalFlag && localFlag;

                // Data log
                if (arrayLength == MAX_SHORT_ARRAY_SIZE)
                {
                    shrLog("\n");
                    shrLogEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                           (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
                    shrLog("\n");
                }
        }

    shrLog("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles);
        for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){
            shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
                cutilSafeCall( cudaThreadSynchronize() );
                cutResetTimer(hTimer);
                cutStartTimer(hTimer);
                for(int i = 0; i < iCycles; i++)
                {
                    szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength);
                }
                cutilSafeCall( cudaThreadSynchronize() );
                cutStopTimer(hTimer);
                double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles;

            shrLog("Validating the results...\n");
                shrLog("...reading back GPU results\n");
                    cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) );

                shrLog("...scanExclusiveHost()\n");
                    scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

                // Compare GPU results with CPU results and accumulate error for this test
                shrLog(" ...comparing the results\n");
                    int localFlag = 1;
                    for(uint i = 0; i < N; i++)
                    {
                        if(h_OutputCPU[i] != h_OutputGPU[i])
                        {
                            localFlag = 0;
                            break;
                        }
                    }

                // Log message on individual test result, then accumulate to global flag
                shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
                globalFlag = globalFlag && localFlag;

                // Data log
                if (arrayLength == MAX_LARGE_ARRAY_SIZE)
                {
                    shrLog("\n");
                    shrLogEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                           (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
                    shrLog("\n");
                }
        }

    // pass or fail (cumulative... all tests in the loop)
    shrLog(globalFlag ? "PASSED\n\n" : "FAILED\n\n");
	GpuProfiling::printResults();

    shrLog("Shutting down...\n");
        closeScan();
        cutilSafeCall( cudaFree(d_Output));
        cutilSafeCall( cudaFree(d_Input));

        cutilCheckError( cutDeleteTimer(hTimer) );

        cudaThreadExit();
		exit(0);
        shrEXIT(argc, (const char**)argv);
}
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv){
    const unsigned int OPT_N_MAX = 512;
    unsigned int useDoublePrecision;

    printf("[binomialOptions]\n");

    int devID = cutilDeviceInit(argc, argv);
    if (devID < 0) {
       printf("exiting...\n");
       cutilExit(argc, argv);
       exit(0);
    }

    cutilSafeCall(cudaGetDevice(&devID));
    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devID));

    char *precisionChoice;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL) {
        useDoublePrecision = 0;
    } else {
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }
    printf(useDoublePrecision ? "Using double precision...\n" : "Using single precision...\n");
    const int OPT_N = deviceEmulation() ? 1 : OPT_N_MAX;

    TOptionData optionData[OPT_N_MAX];
    float
        callValueBS[OPT_N_MAX],
        callValueGPU[OPT_N_MAX],
        callValueCPU[OPT_N_MAX];

    double
        sumDelta, sumRef, gpuTime, errorVal;

    unsigned int hTimer;
    int i;

    cutilCheckError( cutCreateTimer(&hTimer) );

    int version = deviceProp.major * 10 + deviceProp.minor;
    if(useDoublePrecision && version < 13){
        printf("Double precision is not supported.\n");
        return 0;
    }

    printf("Generating input data...\n");
        //Generate options set
        srand(123);
        for(i = 0; i < OPT_N; i++){
            optionData[i].S = randData(5.0f, 30.0f);
            optionData[i].X = randData(1.0f, 100.0f);
            optionData[i].T = randData(0.25f, 10.0f);
            optionData[i].R = 0.06f;
            optionData[i].V = 0.10f;
            BlackScholesCall(callValueBS[i], optionData[i]);
        }

    printf("Running GPU binomial tree...\n");
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError( cutResetTimer(hTimer) );
        cutilCheckError( cutStartTimer(hTimer) );

        if(useDoublePrecision)
            binomialOptions_SM13(callValueGPU, optionData, OPT_N);
        else
            binomialOptions_SM10(callValueGPU, optionData, OPT_N);

        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError( cutStopTimer(hTimer) );
        gpuTime = cutGetTimerValue(hTimer);
    printf("Options count            : %i     \n", OPT_N);
    printf("Time steps               : %i     \n", NUM_STEPS);
    printf("binomialOptionsGPU() time: %f msec\n", gpuTime);
    printf("Options per second       : %f     \n", OPT_N / (gpuTime * 0.001));

    printf("Running CPU binomial tree...\n");
        for(i = 0; i < OPT_N; i++)
            binomialOptionsCPU(callValueCPU[i], optionData[i]);

    printf("Comparing the results...\n");
    sumDelta = 0;
    sumRef   = 0;
    printf("GPU binomial vs. Black-Scholes\n");
    for(i = 0; i < OPT_N; i++){
        sumDelta += fabs(callValueBS[i] - callValueGPU[i]);
        sumRef += fabs(callValueBS[i]);
    }
    if(sumRef >1E-5)
        printf("L1 norm: %E\n", sumDelta / sumRef);
    else
        printf("Avg. diff: %E\n", sumDelta / (double)OPT_N);

    printf("CPU binomial vs. Black-Scholes\n");
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++){
        sumDelta += fabs(callValueBS[i]- callValueCPU[i]);
        sumRef += fabs(callValueBS[i]);
    }
    if(sumRef >1E-5)
        printf("L1 norm: %E\n", sumDelta / sumRef);
    else
        printf("Avg. diff: %E\n", sumDelta / (double)OPT_N);

    printf("CPU binomial vs. GPU binomial\n");
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++){
        sumDelta += fabs(callValueGPU[i] - callValueCPU[i]);
        sumRef += callValueCPU[i];
    }
    if(sumRef > 1E-5)
        printf("L1 norm: %E\n", errorVal = sumDelta / sumRef);
    else
        printf("Avg. diff: %E\n", errorVal = sumDelta / (double)OPT_N);

    printf("Shutting down...\n");

	printf("\n[binomialOptions] - Test Summary:\n");
    printf((errorVal < 5e-4) ? "PASSED\n" : "FAILED\n");

    cutilCheckError( cutDeleteTimer(hTimer) );

    cudaThreadExit();

    cutilExit(argc, argv);
}
Beispiel #17
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv){
    uint 
        *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint 
        *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    uint hTimer;

    const uint   N = 4 * 1048576;
    const uint DIR = 1;

    const uint numValues = 65536;


    printf("Allocating and initializing host arrays...\n\n");
        cutCreateTimer(&hTimer);
        h_SrcKey = (uint *)malloc(N * sizeof(uint));
        h_SrcVal = (uint *)malloc(N * sizeof(uint));
        h_DstKey = (uint *)malloc(N * sizeof(uint));
        h_DstVal = (uint *)malloc(N * sizeof(uint));

        srand(2009);
        for(uint i = 0; i < N; i++)
            h_SrcKey[i] = rand() % numValues;
        fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
        cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) );
        cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) );

    printf("Initializing GPU merge sort...\n");
        initMergeSort();

    printf("Running GPU merge sort...\n");
        cutilSafeCall( cudaThreadSynchronize() );
        cutResetTimer(hTimer);
        cutStartTimer(hTimer);
            mergeSort(
                d_DstKey,
                d_DstVal,
                d_BufKey,
                d_BufVal,
                d_SrcKey,
                d_SrcVal,
                N,
                DIR
            );
        cutilSafeCall( cudaThreadSynchronize() );
        cutStopTimer(hTimer);
    printf("Time: %f ms\n", cutGetTimerValue(hTimer));

    printf("Reading back GPU merge sort results...\n");
        cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) );
        cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) );

    printf("Inspecting the results...\n");
        uint keysFlag = validateSortedKeys(
            h_DstKey,
            h_SrcKey,
            1,
            N,
            numValues,
            DIR
        );

        uint valuesFlag = validateSortedValues(
            h_DstKey,
            h_DstVal,
            h_SrcKey,
            1,
            N
        );

    printf( (keysFlag && valuesFlag) ? "TEST PASSED\n" : "TEST FAILED\n");

    printf("Shutting down...\n");
        closeMergeSort();
        cutilCheckError( cutDeleteTimer(hTimer) );
        cutilSafeCall( cudaFree(d_SrcVal) );
        cutilSafeCall( cudaFree(d_SrcKey) );
        cutilSafeCall( cudaFree(d_BufVal) );
        cutilSafeCall( cudaFree(d_BufKey) );
        cutilSafeCall( cudaFree(d_DstVal) );
        cutilSafeCall( cudaFree(d_DstKey) );
        free(h_DstVal);
        free(h_DstKey);
        free(h_SrcVal);
        free(h_SrcKey);
        cudaThreadExit();
        cutilExit(argc, argv);
}
void shutDown(unsigned char k, int /*x*/, int /*y*/)
{
    switch (k){
        case '\033':
        case 'q':
        case 'Q':
            printf("Shutting down...\n");
                cutilCheckError( cutStopTimer(hTimer)   );
                cutilCheckError( cutDeleteTimer(hTimer) );
				// DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(gl_PBO) );
				cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, gl_PBO, 
									   cudaGraphicsMapFlagsWriteDiscard));
                glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
                glDeleteBuffers(1, &gl_PBO);
                glDeleteTextures(1, &gl_Tex);

                cutilSafeCall( CUDA_FreeArray() );
                free(h_Src);
            printf("Shutdown done.\n");

            cutilDeviceReset();
            exit(0);
        break;

        case '1':
            printf("Passthrough.\n");
            g_Kernel = 0;
        break;

        case '2':
            printf("KNN method \n");
            g_Kernel = 1;
        break;

        case '3':
            printf("NLM method\n");
            g_Kernel = 2;
        break;

        case '4':
            printf("Quick NLM(NLM2) method\n");
            g_Kernel = 3;
        break;

        case ' ':
            printf(g_Diag ? "LERP highlighting mode.\n" : "Normal mode.\n");
            g_Diag = !g_Diag;
        break;

        case 'n':
            printf("Decrease noise level.\n");
            knnNoise -= noiseStep;
            nlmNoise -= noiseStep;
        break;

        case 'N':
            printf("Increase noise level.\n");
            knnNoise += noiseStep;
            nlmNoise += noiseStep;
        break;

        case 'l':
            printf("Decrease LERP quotent.\n");
            lerpC = MAX(lerpC - lerpStep, 0.0f);
        break;

        case 'L':
            printf("Increase LERP quotent.\n");
            lerpC = MIN(lerpC + lerpStep, 1.0f);
        break;

        case 'f' : case 'F':
            g_FPS = true;
        break;

        case '?':
            printf("lerpC = %5.5f\n", lerpC);
            printf("knnNoise = %5.5f\n", knnNoise);
            printf("nlmNoise = %5.5f\n", nlmNoise);
        break;
    }
}
int main(int argc, char **argv) {
    char *precisionChoice;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL)
        useDoublePrecision = 0;
    else {
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }

    const int MAX_GPU_COUNT = 8;
    const int         OPT_N = 256;
    const int        PATH_N = 1 << 18;
    const unsigned int SEED = 777;

    //Input data array
    TOptionData optionData[OPT_N];
    //Final GPU MC results
    TOptionValue callValueGPU[OPT_N];
    //"Theoretical" call values by Black-Scholes formula
    float callValueBS[OPT_N];
    //Solver config
    TOptionPlan optionSolver[MAX_GPU_COUNT];
    //OS thread ID
    CUTThread threadID[MAX_GPU_COUNT];


    //GPU number present in the system
    int GPU_N;
    int gpuBase, gpuIndex;
    int i;

    //Timer
    unsigned int hTimer;
    float time;

    double
    delta, ref, sumDelta, sumRef, sumReserve;

    cutilSafeCall( cudaGetDeviceCount(&GPU_N) );
    cutilCheckError( cutCreateTimer(&hTimer) );

#ifdef _EMU
    GPU_N = 1;
#endif
    printf("main(): generating input data...\n");
    srand(123);
    for(i = 0; i < OPT_N; i++) {
        optionData[i].S = randFloat(5.0f, 50.0f);
        optionData[i].X = randFloat(10.0f, 25.0f);
        optionData[i].T = randFloat(1.0f, 5.0f);
        optionData[i].R = 0.06f;
        optionData[i].V = 0.10f;
        callValueGPU[i].Expected   = -1.0f;
        callValueGPU[i].Confidence = -1.0f;
    }

    printf("main(): starting %i host threads...\n", GPU_N);
    //Get option count for each GPU
    for(i = 0; i < GPU_N; i++)
        optionSolver[i].optionCount = OPT_N / GPU_N;
    //Take into account cases with "odd" option counts
    for(i = 0; i < (OPT_N % GPU_N); i++)
        optionSolver[i].optionCount++;

    //Assign GPU option ranges
    gpuBase = 0;
    for(i = 0; i < GPU_N; i++) {
        optionSolver[i].device     = i;
        optionSolver[i].optionData = optionData   + gpuBase;
        optionSolver[i].callValue  = callValueGPU + gpuBase;
        optionSolver[i].seed       = SEED;
        optionSolver[i].pathN      = PATH_N;
        gpuBase += optionSolver[i].optionCount;
    }

    //Start the timer
    cutilCheckError( cutResetTimer(hTimer) );
    cutilCheckError( cutStartTimer(hTimer) );

    //Start CPU thread for each GPU
    for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
        threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);

    //Stop the timer
    cutilCheckError( cutStopTimer(hTimer) );
    time = cutGetTimerValue(hTimer);

    printf("main(): waiting for GPU results...\n");
    cutWaitForThreads(threadID, GPU_N);

    printf("main(): GPU statistics\n");
    for(i = 0; i < GPU_N; i++) {
        printf("GPU #%i\n", optionSolver[i].device);
        printf("Options         : %i\n", optionSolver[i].optionCount);
        printf("Simulation paths: %i\n", optionSolver[i].pathN);
    }
    printf("\nTotal time (ms.): %f\n", time);
    printf("Options per sec.: %f\n", OPT_N / (time * 0.001));

#ifdef DO_CPU
    printf("main(): running CPU MonteCarlo...\n");
    TOptionValue callValueCPU;
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++) {
        MonteCarloCPU(
            callValueCPU,
            optionData[i],
            NULL,
            PATH_N
        );
        delta     = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
        ref       = callValueCPU.Expected;
        sumDelta += delta;
        sumRef   += fabs(ref);
        printf("Exp : %f | %f\t", callValueCPU.Expected,   callValueGPU[i].Expected);
        printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
    }
    printf("L1 norm: %E\n", sumDelta / sumRef);
#endif

    printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
    sumDelta   = 0;
    sumRef     = 0;
    sumReserve = 0;
    for(i = 0; i < OPT_N; i++) {
        BlackScholesCall(
            callValueBS[i],
            optionData[i]
        );
        delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
        ref       = callValueBS[i];
        sumDelta += delta;
        sumRef   += fabs(ref);
        if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta;
#ifdef PRINT_RESULTS
        printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
    }
    sumReserve /= OPT_N;
    printf("L1 norm        : %E\n", sumDelta / sumRef);
    printf("Average reserve: %f\n", sumReserve);
    printf((sumReserve > 1.0f) ? "PASSED\n" : "FAILED.\n");

    printf("Shutting down...\n");

    cutilCheckError( cutDeleteTimer(hTimer) );
    cutilExit(argc, argv);
}
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{ 
		fprintf(stderr, "Shmoo wasn't implemented in this modified kernel!\n");
		exit(1);
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);

    T *h_idata = (T*) malloc(bytes);

    for(int i = 0; i < maxN; i++) {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));

    // allocate device memory and data
    T* d_idata = NULL;
    T* d_odata = NULL;

    cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
    cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, maxNumBlocks*sizeof(T)) );

    // copy data directly to device memory
    cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
    cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice) );

    // warm-up
    for (int kernel = 0; kernel < 7; kernel++)
    {
        sumreduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata);
    }
    int testIterations = 100;

    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    
    // print headers
    shrLog("Time in milliseconds for various numbers of elements for each kernel\n\n\n");
    shrLog("Kernel");
    for (int i = minN; i <= maxN; i *= 2)
    {
        shrLog(", %d", i);
    }
   
    for (int kernel = 0; kernel < 7; kernel++)
    {
        shrLog("\n%d", kernel);
        for (int i = minN; i <= maxN; i *= 2)
        {
            cutResetTimer(timer);
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);
            
            float reduceTime;
            if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
                benchmarkReduceSum(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, 
                                testIterations, false, 1, timer, h_odata, d_idata, d_odata);
                reduceTime =  cutGetAverageTimerValue(timer);
            } else {                
                reduceTime = -1.0;
            }
            shrLog(", %.5f", reduceTime);
        }
    }

    // cleanup
    cutilCheckError(cutDeleteTimer(timer));
    free(h_idata);
    free(h_odata);

    cutilSafeCallNoSync(cudaFree(d_idata));
    cutilSafeCallNoSync(cudaFree(d_odata));    
}
Beispiel #21
0
//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    printf("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n\n");
    
    bool benchmark = 
        (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0);

    bool compareToCPU = 
        ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) ||
        !(cutCheckCmdLineFlag(argc, (const char**) argv, "noqatest")  != 0));

    bool regression = 
        (cutCheckCmdLineFlag(argc, (const char**) argv, "regression") != 0);

    int devID;
    cudaDeviceProp props;

    // nBody has a mode that allows it to be run without using GL interop
    if (benchmark || compareToCPU || regression) {
				/*
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilDeviceInit(argc, argv);
        } else {
            devID = cutGetMaxGflopsDeviceId();
            cudaSetDevice( devID );
        } */
    } 
    else 
    { 
        // This mode shows the OpenGL results rendered
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        glutInit(&argc, argv);
        glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
        glutInitWindowSize(720, 480);
        glutCreateWindow("CUDA n-body system");

        GLenum err = glewInit();
        if (GLEW_OK != err)
        {
            printf("GLEW Error: %s\n", glewGetErrorString(err));
        }
        else
        {
#if   defined(WIN32)
            wglSwapIntervalEXT(0);
#elif defined(LINUX)
            glxSwapIntervalSGI(0);
#endif      
        }
    	
        initGL();
        initParameters();
    	
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
        } else {
            devID = cutGetMaxGflopsDeviceId();
            cudaGLSetGLDevice( devID );
        }
    }


    // get number of SMs on this GPU
    cutilSafeCall(cudaGetDevice(&devID));
    cutilSafeCall(cudaGetDeviceProperties(&props, devID));

    numIterations = 0;
    int p = 256;
    int q = 1;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q);


    // default number of bodies is #SMs * 4 * CTA size
    numBodies = compareToCPU ? 4096 : p*q*4*props.multiProcessorCount;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "n", &numBodies);

    switch (numBodies)
    {
    case 1024:
        activeParams.m_clusterScale = 1.52f;
        activeParams.m_velocityScale = 2.f;
        break;
    case 2048:
        activeParams.m_clusterScale = 1.56f;
        activeParams.m_velocityScale = 2.64f;
        break;
    case 4096:
        activeParams.m_clusterScale = 1.68f;
        activeParams.m_velocityScale = 2.98f;
        break;
    case 8192:
        activeParams.m_clusterScale = 1.98f;
        activeParams.m_velocityScale = 2.9f;
        break;
    default:
    case 16384:
        activeParams.m_clusterScale = 1.54f;
        activeParams.m_velocityScale = 8.f;
        break;
    case 32768:
        activeParams.m_clusterScale = 1.44f;
        activeParams.m_velocityScale = 11.f;
        break;
    }

    if (q * p > 256)
    {
        p = 256 / q;
        printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256);
    }

    if (q == 1 && numBodies < p)
    {
        p = numBodies;
    }

    init(numBodies, p, q, !(benchmark || compareToCPU));
    
    reset(nbody, numBodies, NBODY_CONFIG_SHELL, !(benchmark || compareToCPU));


    if (benchmark)
    {
        if (numIterations <= 0) 
            numIterations = 100;
        runBenchmark(numIterations);
    }
    else if (compareToCPU || regression)
    {
        compareResults(regression, numBodies);
    }
    else
    {
        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        cutilSafeCall(cudaEventRecord(startEvent, 0));
        glutMainLoop();
    }

    if (nbodyCPU)
        delete nbodyCPU;
    if (nbodyCUDA)
        delete nbodyCUDA;

    if (hPos)
        delete [] hPos;
    if (hVel)
        delete [] hVel;
    if (hColor)
        delete [] hColor;

    cutilSafeCall(cudaEventDestroy(startEvent));
    cutilSafeCall(cudaEventDestroy(stopEvent));
    cutilCheckError(cutDeleteTimer(demoTimer));

    return 0;
}
int main(int argc, char **argv)
{
    // Start logs
    shrSetLogFileName ("quasirandomGenerator.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    
    unsigned int useDoublePrecision;

    char *precisionChoice;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL)
        useDoublePrecision = 0;
    else{
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }

    unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION];

    float
        *h_OutputGPU;

    float
        *d_Output;

    int
        dim, pos;

    double
        delta, ref, sumDelta, sumRef, L1norm, gpuTime;

    unsigned int hTimer;

    if(sizeof(INT64) != 8){
        shrLog("sizeof(INT64) != 8\n");
        return 0;
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    cutilCheckError(cutCreateTimer(&hTimer));

    int deviceIndex;
    cutilSafeCall(cudaGetDevice(&deviceIndex));
    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, deviceIndex));
    int version = deviceProp.major * 10 + deviceProp.minor;
    if(useDoublePrecision && version < 13){
        shrLog("Double precision not supported.\n");
        cudaThreadExit();
        return 0;
    }

    shrLog("Allocating GPU memory...\n");
        cutilSafeCall( cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)) );

    shrLog("Allocating CPU memory...\n");
        h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float));

    shrLog("Initializing QRNG tables...\n\n");
        initQuasirandomGenerator(tableCPU);
        if(useDoublePrecision)
            initTable_SM13(tableCPU);
        else
            initTable_SM10(tableCPU);

    shrLog("Testing QRNG...\n\n");
        cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) );
		int numIterations = 20;
		for (int i = -1; i < numIterations; i++)
		{
			if (i == 0)
			{
				cutilSafeCall( cudaThreadSynchronize() );
				cutilCheckError( cutResetTimer(hTimer) );
				cutilCheckError( cutStartTimer(hTimer) );
			}
            if(useDoublePrecision)
                quasirandomGenerator_SM13(d_Output, 0, N);
            else
                quasirandomGenerator_SM10(d_Output, 0, N);
		}
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError(cutStopTimer(hTimer));
        gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3;
	shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", 
                (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); 

    shrLog("\nReading GPU results...\n");
        cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) );

    shrLog("Comparing to the CPU results...\n\n");
        sumDelta = 0;
        sumRef = 0;
        for(dim = 0; dim < QRNG_DIMENSIONS; dim++)
            for(pos = 0; pos < N; pos++){
                ref       = getQuasirandomValue63(pos, dim);
                delta     = (double)h_OutputGPU[dim * N + pos] - ref;
                sumDelta += fabs(delta);
                sumRef   += fabs(ref);
            }
    shrLog("L1 norm: %E\n", sumDelta / sumRef);

    shrLog("\nTesting inverseCNDgpu()...\n\n");
        cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) );
		for (int i = -1; i < numIterations; i++)
		{
			if (i == 0) 
			{
				cutilSafeCall( cudaThreadSynchronize() );
				cutilCheckError( cutResetTimer(hTimer) );
				cutilCheckError( cutStartTimer(hTimer) );
			}
            if(useDoublePrecision)
                inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N);
            else
                inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N);
		}
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError(cutStopTimer(hTimer));
        gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3;
	shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", 
                (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); 

    shrLog("Reading GPU results...\n");
        cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) );

    shrLog("\nComparing to the CPU results...\n");
        sumDelta = 0;
        sumRef = 0;
        for(pos = 0; pos < QRNG_DIMENSIONS * N; pos++){
            double  p = (double)(pos + 1) / (double)(QRNG_DIMENSIONS * N + 1);
            ref       = MoroInvCNDcpu(p);
            delta     = (double)h_OutputGPU[pos] - ref;
            sumDelta += fabs(delta);
            sumRef   += fabs(ref);
        }
    shrLog("L1 norm: %E\n\n", L1norm = sumDelta / sumRef);
    shrLog((L1norm < 1E-6) ? "PASSED\n\n" : "FAILED\n\n");

    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        free(h_OutputGPU);
        cutilSafeCall( cudaFree(d_Output) );

    cudaThreadExit();

    shrEXIT(argc, (const char**)argv);
}
bool
runTestMax( int argc, char** argv, ReduceType datatype) 
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    cutGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
    
		shrLog("METHOD: MAX\n");
    shrLog("%d elements\n", size);
    shrLog("%d threads (max)\n", maxThreads);

    cpuFinalReduction = (cutCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == CUTTrue);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue);

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {

        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);

        T *h_idata = (T *) malloc(bytes);

        for(int i=0; i<size; i++) 
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
        if (numBlocks == 1) cpuFinalThreshold = 1;

        // allocate mem for the result on host side
        T* h_odata = (T*) malloc(numBlocks*sizeof(T));

        shrLog("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T* d_idata = NULL;
        T* d_odata = NULL;

        cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
        cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );

        // copy data directly to device memory
        cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
        cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );

        // warm-up
        maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);
        
        int testIterations = 100;

        unsigned int timer = 0;
        cutilCheckError( cutCreateTimer( &timer));
        
        T gpu_result = 0;

        gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction, 
                                        cpuFinalThreshold, timer,
                                        h_odata, d_idata, d_odata);

		double reduceTime = cutGetAverageTimerValue(timer) * 1e-3;
        shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = maxreduceCPU<T>(h_idata, size);

        double threshold = 1e-12;
        double diff = 0;
		
        if (datatype == REDUCE_INT)
        {
            shrLog("\nGPU result = %d\n", gpu_result);
            shrLog("CPU result = %d\n\n", cpu_result);
        }
        else
        {
            shrLog("\nGPU result = %f\n", gpu_result);
            shrLog("CPU result = %f\n\n", cpu_result);

            if (datatype == REDUCE_FLOAT)
                threshold = 1e-8 * size;
            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        cutilCheckError( cutDeleteTimer(timer) );
        free(h_idata);
        free(h_odata);

        cutilSafeCallNoSync(cudaFree(d_idata));
        cutilSafeCallNoSync(cudaFree(d_odata));

		if (datatype == REDUCE_INT) {
			return (gpu_result == cpu_result);
		} else {
			return (diff < threshold);
		}
	}
	return true;
}