Beispiel #1
0
void switchDemoPrecision()
{
    cutilDeviceSynchronize();

    fp64 = !fp64;
    flopsPerInteraction = fp64 ? 30 : 20;
    
    T_old *oldPos = new T_old[numBodies * 4];
    T_old *oldVel = new T_old[numBodies * 4];
    
    NBodyDemo<T_old>::getArrays(oldPos, oldVel);
   
    // convert float to double
    T_new *newPos = new T_new[numBodies * 4];
    T_new *newVel = new T_new[numBodies * 4];

    for (int i = 0; i < numBodies * 4; i++)
    {
        newPos[i] = (T_new)oldPos[i];
        newVel[i] = (T_new)oldVel[i];
    }
    
    NBodyDemo<T_new>::setArrays(newPos, newVel);

    cutilDeviceSynchronize();

    delete [] oldPos;
    delete [] oldVel;
    delete [] newPos;
    delete [] newVel;
}
Beispiel #2
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runBenchmark( int argc, char **argv )
{
    int devID = 0;
    shrLog("[runBenchmark]: [%s]\n", sSDKsample);
    devID = cutilChooseCudaDevice(argc, argv);

    loadImageData(argc, argv);

    initCuda();

    g_CheckRender       = new CheckBackBuffer(width, height, 4, false);
    g_CheckRender->setExecPath(argv[0]);

    unsigned int *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) );

    // warm-up
    boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads);
    cutilSafeCall( cutilDeviceSynchronize() );

    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    shrLog("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles);
    shrDeltaT(2);
    for (int i = 0; i < iCycles; i++)
    {
        dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads);
    }

    // check if kernel execution generated an error and sync host
    cutilCheckMsg("Error: boxFilterRGBA Kernel execution FAILED");
    cutilSafeCall(cutilDeviceSynchronize());

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    shrLogEx(LOGBOTH | MASTER, 0, "boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n",
             (1.0e-6 * width * height)/dProcessingTime, dProcessingTime,
             (width * height), 1, nthreads);
    shrLog("\n");
}
void runAutoTest(int argc, char **argv)
{
	int devID = 0;
    printf("[%s] - (automated testing w/ readback)\n", sSDKsample);

	devID = cutilChooseCudaDevice(argc, argv);

    // First load the image, so we know what the size of the image (imageW and imageH)
    printf("Allocating host and CUDA memory and loading image file...\n");
    const char *image_path = cutFindFilePath("portrait_noise.bmp", argv[0]);
    if (image_path == NULL) {
       printf( "imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n");
       shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
    }
    LoadBMPFile(&h_Src, &imageW, &imageH, image_path);
    printf("Data init done.\n");

    cutilSafeCall( CUDA_MallocArray(&h_Src, imageW, imageH) );

    g_CheckRender       = new CheckBackBuffer(imageW, imageH, sizeof(TColor), false);
    g_CheckRender->setExecPath(argv[0]);

    TColor *d_dst = NULL;
    cutilSafeCall( cudaMalloc( (void **)&d_dst, imageW*imageH*sizeof(TColor)) );

    while (g_Kernel <= 3) {
        printf("[AutoTest]: %s <%s>\n", sSDKsample, filterMode[g_Kernel]);
        cutilSafeCall( CUDA_Bind2TextureArray()                      );
        runImageFilters(d_dst);
        cutilSafeCall( CUDA_UnbindTexture()     );
        cutilSafeCall( cutilDeviceSynchronize() );
        cudaMemcpy(g_CheckRender->imageData(), d_dst, imageW*imageH*sizeof(TColor), cudaMemcpyDeviceToHost);
        g_CheckRender->savePPM(sOriginal[g_Kernel], true, NULL);

        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Kernel++;
    }

    cutilSafeCall( CUDA_FreeArray() );
    free(h_Src);

    cutilSafeCall( cudaFree( d_dst ) );
    delete g_CheckRender;

	printf("\n[%s] -> Test Results: %d errors\n", sSDKsample, g_TotalErrors);

	cutilDeviceReset();
	shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED));
}
void runAutoTest(int argc, char **argv)
{
    int devID = 0;
    shrLog("[runAutoTest]: [%s] (automated testing w/ readback)\n", sSDKsample);

    devID = cutilChooseCudaDevice(argc, argv);

    loadImageData(argc, argv);

    initCuda();

    g_CheckRender       = new CheckBackBuffer(width, height, 4, false);
    g_CheckRender->setExecPath(argv[0]);

    unsigned int *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) );

    for(int i = 0; i < 4; i++)
    {
        shrLog("[AutoTest]: %s (radius=%d)", sSDKsample, filter_radius );
        bilateralFilterRGBA(d_result, width, height, euclidean_delta, filter_radius, iterations, nthreads);

        // check if kernel execution generated an error
        cutilCheckMsg("Error: bilateralFilterRGBA Kernel execution FAILED");
        cutilSafeCall( cutilDeviceSynchronize() );
        cudaMemcpy(g_CheckRender->imageData(), d_result, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost);

        g_CheckRender->savePPM(sOriginal[i], false, NULL);

        if (!g_CheckRender->PPMvsPPM(sOriginal[i], sReference[i], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        gaussian_delta += 1.0f;
        euclidean_delta *= 1.25f;

		updateGaussian(gaussian_delta, filter_radius);
    }

    cutilSafeCall( cudaFree( d_result ) );
    delete g_CheckRender;
}
Beispiel #5
0
int main(int argc, char **argv)
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint hTimer;
    int PassFailFlag = 1;
    uint byteCount = 64 * 1048576;
    uint uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

	shrQAStart(argc, argv);

	// set logfile name and start logs
    shrSetLogFileName ("histogram.txt");

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
           printf("No CUDA Capable Devices found, exiting...\n");
           shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );
        cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
    }
    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );

	printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", 
		deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

	int version = deviceProp.major * 0x10 + deviceProp.minor;

	if(version < 0x11) 
    {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
		shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional Command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult))
    {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

    shrLog("Initializing data...\n");
        shrLog("...allocating CPU memory.\n");
            h_Data         = (uchar *)malloc(byteCount);
            h_HistogramCPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
            h_HistogramGPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

        shrLog("...generating input data\n");
            srand(2009);
            for(uint i = 0; i < byteCount; i++) 
                h_Data[i] = rand() % 256;

        shrLog("...allocating GPU memory and copying input data\n\n");
            cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount  ) );
            cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)  ) );
            cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );

    {
        shrLog("Starting up 64-bin histogram...\n\n");
            initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram64(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); 

        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram64CPU()\n");
               histogram64CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results...\n");
                for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 64-bin histogram...\n\n\n");
            closeHistogram64();
    }

    {
        shrLog("Initializing 256-bin histogram...\n");
            initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram256(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); 
                
        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram256CPU()\n");
                histogram256CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results\n");
                for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 256-bin histogram...\n\n\n");
            closeHistogram256();
    }

    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        cutilSafeCall( cudaFree(d_Histogram) );
        cutilSafeCall( cudaFree(d_Data) );
        free(h_HistogramGPU);
        free(h_HistogramCPU);
        free(h_Data);

    cutilDeviceReset();
	shrLog("%s - Test Summary\n", sSDKsample);
    // pass or fail (for both 64 bit and 256 bit histograms)
    shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED));
}
int main()
{
	int width = 512;
	int height = 512;

	// Creation du device
	cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) );

	// Creation des buffers sur CPU
	int * a = new int[width*height];
	int * b = new int[width*height];
	int * res = new int[width*height];
	for(int i = 0; i < width*height; i++)
	{
		a[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100);
		b[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100);
	}

	// Allocation des objects on the device
	// *** data
	unsigned int size = width * height * sizeof(int);
	std::cout << "Allocation d'un buffer de taille " << width*height << "\n";
	int* d_a = NULL;
	int* d_b = NULL;
	int* d_res = NULL;
    cutilSafeCall( cudaMalloc( (void**) &d_a, size));
	cutilSafeCall( cudaMalloc( (void**) &d_b, size));
	cutilSafeCall( cudaMalloc( (void**) &d_res, size));

	// Copy des donnees
	cutilSafeCall( cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
	cutilSafeCall( cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // Lancer le calcul
	std::cout << "Lancer le kernel ... \n";
	runKernel(d_a, d_b, d_res, width*height);
	cutilSafeCall( cutilDeviceSynchronize() );

	// Copie DeviceHost
	cutilSafeCall( cudaMemcpy(res, d_res, size, cudaMemcpyDeviceToHost));

	// Verification du test
	int i = 0;
	for(;i < width*height;i++)
		if(res[i] != a[i] + b[i])
			std::cout << "Error : [" << i << "] " << res[i] << " != " << a[i] << " + " << b[i] << std::endl;


	// Liberation des ressources
	// *** Device
	cutilSafeCall(cudaFree(d_a));
	cutilSafeCall(cudaFree(d_b));
	cutilSafeCall(cudaFree(d_res));
	// *** CPU
	delete[] res;
	delete[] a;
	delete[] b;

	// Close device
	cutilDeviceReset();
	std::cout << "Test result : " << ((i == width*height) ? "Succes" : "Error" ) <<  std::endl;
	std::cout.flush();

	return 0;
}
Beispiel #7
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv){
    uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    uint hTimer;

    const uint   N = 4 * 1048576;
    const uint DIR = 1;
    const uint numValues = 65536;

	shrQAStart(argc, argv);

    printf("Allocating and initializing host arrays...\n\n");
        cutCreateTimer(&hTimer);
        h_SrcKey = (uint *)malloc(N * sizeof(uint));
        h_SrcVal = (uint *)malloc(N * sizeof(uint));
        h_DstKey = (uint *)malloc(N * sizeof(uint));
        h_DstVal = (uint *)malloc(N * sizeof(uint));

        srand(2009);
        for(uint i = 0; i < N; i++)
            h_SrcKey[i] = rand() % numValues;
        fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
        cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) );
        cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) );

    printf("Initializing GPU merge sort...\n");
        initMergeSort();

    printf("Running GPU merge sort...\n");
        cutilSafeCall( cutilDeviceSynchronize() );
        cutResetTimer(hTimer);
        cutStartTimer(hTimer);
            mergeSort(
                d_DstKey,
                d_DstVal,
                d_BufKey,
                d_BufVal,
                d_SrcKey,
                d_SrcVal,
                N,
                DIR
            );
        cutilSafeCall( cutilDeviceSynchronize() );
        cutStopTimer(hTimer);
    printf("Time: %f ms\n", cutGetTimerValue(hTimer));

    printf("Reading back GPU merge sort results...\n");
        cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) );
        cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) );

    printf("Inspecting the results...\n");
        uint keysFlag = validateSortedKeys(
            h_DstKey,
            h_SrcKey,
            1,
            N,
            numValues,
            DIR
        );

        uint valuesFlag = validateSortedValues(
            h_DstKey,
            h_DstVal,
            h_SrcKey,
            1,
            N
        );

    printf("Shutting down...\n");
        closeMergeSort();
        cutilCheckError( cutDeleteTimer(hTimer) );
        cutilSafeCall( cudaFree(d_SrcVal) );
        cutilSafeCall( cudaFree(d_SrcKey) );
        cutilSafeCall( cudaFree(d_BufVal) );
        cutilSafeCall( cudaFree(d_BufKey) );
        cutilSafeCall( cudaFree(d_DstVal) );
        cutilSafeCall( cudaFree(d_DstKey) );
        free(h_DstVal);
        free(h_DstKey);
        free(h_SrcVal);
        free(h_SrcKey);
        cutilDeviceReset();

        shrQAFinishExit(argc, (const char **)argv, (keysFlag && valuesFlag) ? QA_PASSED : QA_FAILED);
}
void runAutoTest(int argc, char **argv)
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) 
    {
       int device = cutilDeviceInit(argc, argv);
       if (device < 0) {
            printf("No CUDA Capable devices found, exiting...\n");
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
       }
	   checkDeviceMeetComputeSpec( argc, argv );
    } else {
       int dev = findCapableDevice(argc, argv);
       if( dev != -1 ) 
          cudaSetDevice( dev );
       else {
          cutilDeviceReset();
		  shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED);
       }
    }

    loadDefaultImage( argc, argv );

    if (argc > 1) {
        char *filename;
        if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) {
            initializeData(filename, argc, argv);
        }
    } else {
        loadDefaultImage( argc, argv );
    }

    g_CheckRender       = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false);
    g_CheckRender->setExecPath(argv[0]);

    Pixel *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) );

    while (g_SobelDisplayMode <= 2) 
    {
        printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);

        sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp );

        cutilSafeCall( cutilDeviceSynchronize() );

        cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost);

        g_CheckRender->savePGM(sOriginal[g_Index], false, NULL);

        if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Index++;
        g_SobelDisplayMode = (SobelDisplayMode)g_Index;
    }

    cutilSafeCall( cudaFree( d_result ) );
    delete g_CheckRender;

    shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED) );
}
T benchmarkReduceMax(int  n, 
                  int  numThreads,
                  int  numBlocks,
                  int  maxThreads,
                  int  maxBlocks,
                  int  whichKernel, 
                  int  testIterations,
                  bool cpuFinalReduction,
                  int  cpuFinalThreshold,
                  unsigned int timer,
                  T* h_odata,
                  T* d_idata, 
                  T* d_odata)
{
    T gpu_result = 0;
    bool needReadBack = true;

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;

        cutilDeviceSynchronize();
        cutilCheckError( cutStartTimer( timer));

        // execute the kernel
        maxreduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        // check if kernel execution generated an error
        cutilCheckMsg("Kernel execution failed");

        if (cpuFinalReduction)
        {
            // sum partial sums from each block on CPU        
            // copy result from device to host
            cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost) );

            for(int i=0; i<numBlocks; i++) 
            {
                gpu_result += h_odata[i];
            }

            needReadBack = false;
        }
        else
        {
            // sum partial block sums on GPU
            int s=numBlocks;
            int kernel = whichKernel;
            while(s > cpuFinalThreshold) 
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
                
                maxreduce<T>(s, threads, blocks, kernel, d_odata, d_odata);
                
                if (kernel < 3)
                    s = (s + threads - 1) / threads;
                else
                    s = (s + (threads*2-1)) / (threads*2);
            }
            
            if (s > 1)
            {
                // copy result from device to host
                cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost) );

                for(int i=0; i < s; i++) 
                {
                    gpu_result += h_odata[i];
                }

                needReadBack = false;
            }
        }

        cutilDeviceSynchronize();
        cutilCheckError( cutStopTimer(timer) );      
    }

    if (needReadBack)
    {
        // copy final sum from device to host
        cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) );
    }

    return gpu_result;
}
int main()
{
	int width = 800;
	int height = 600;

	int mesh_width = 256;
	int mesh_height = 256;

	// Creation du device
	cutilSafeCall( cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ) );

	// Creation d'une fenetre
	C3::Window OpenGLWin;
	OpenGLWin.Create(C3::WindowMode(width,height),"CudaC3");
	
	// Glew init
	GLenum err = glewInit();
	if(err != GLEW_OK)
		std::cout << "Error on GLEW initialization.\n";


	// Configuration OpenGL
	glClearColor(0.0, 0.0, 0.0, 1.0);
	glDisable(GL_DEPTH_TEST);
	glViewport(0, 0, width, height);
	glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    gluPerspective(60.0, (GLfloat)width / (GLfloat) height, 0.1, 10.0);

	// VBO
	// *** Create
	GLuint vbo;
	glGenBuffers(1, &vbo);
	glBindBuffer(GL_ARRAY_BUFFER, vbo);
	// *** Initialize
	unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
	glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
	glBindBuffer(GL_ARRAY_BUFFER, 0);
	// *** Register in CUDA
	cudaGraphicsResource *cuda_vbo_resource = NULL;
	cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard));


	float g_fAnim = 0.f;
	int nbFrame = 0;
	float timeFPS = 0.f;
	while(OpenGLWin.IsOpened())
	{
		// Events
		C3::Event event;
		while(OpenGLWin.PoolEvent(event))
		{
                        //std::cout << "Event !" << std::endl;
			if(event.Type == C3::Event::Closed)
			{
				std::cout << "Close ... " << std::endl;
				OpenGLWin.Close();
			}
			else if(event.Type == C3::Event::KeyPressed)
			{
				if(event.Key.Code == C3::Key::Escape)
				{
					std::cout << "Close ... " << std::endl;
					OpenGLWin.Close();
				}
			}
		}

		// Mise a jour du temps
		g_fAnim += OpenGLWin.GetFrameTime() / 1000.f;
		timeFPS += OpenGLWin.GetFrameTime() / 1000.f;
		nbFrame++;
		if(timeFPS > 1.0f)
		{
			std::stringstream ss;
			ss << "CudaC3 [" << (int)ceil( nbFrame / timeFPS ) << " FPS]";
			OpenGLWin.SetTitle(ss.str());
			timeFPS = 0.f;
			nbFrame = 0;
		}
		
		// Draw the scene
		glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );

		// Lancer le calcul CUDA
		// *** map OpenGL buffer object for writing from CUDA
		float4 *dptr;
		cutilSafeCall(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0));
		size_t num_bytes; 
		cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, cuda_vbo_resource));
		// *** Run kernel
		runKernel(dptr, mesh_width, mesh_height,g_fAnim);
		cutilSafeCall( cutilDeviceSynchronize() );
		// *** Unmap
		cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0));
		
		// OpenGL
		// *** Make some transformation
		glMatrixMode(GL_MODELVIEW);
		glLoadIdentity();
		glTranslatef(0.0, 0.0, -3.0);
		glRotatef(0.0, 1.0, 0.0, 0.0);
		glRotatef(0.0, 0.0, 1.0, 0.0);
		// *** Render VBO
		// --- Bind
		glBindBuffer(GL_ARRAY_BUFFER, vbo);
		glVertexPointer(4, GL_FLOAT, 0, 0);
		// --- Draw
		glEnableClientState(GL_VERTEX_ARRAY);
		glColor3f(1.0, 0.0, 0.0);
		glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
		glDisableClientState(GL_VERTEX_ARRAY);

		// Swap buffers
		OpenGLWin.Display();
	}

	// Liberation des ressources
	cudaGraphicsUnregisterResource(cuda_vbo_resource);
	
	glBindBuffer(1, vbo);
	glDeleteBuffers(1, &vbo);

	// Close device
	cutilDeviceReset();

	return 0;
}