void initGL(int *argc, char** argv)
{
    glutInit( argc, argv );    
    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
    glutInitWindowSize(wWidth, wHeight);
    glutCreateWindow("Function Pointers [CUDA Edge Detection]n");

    glewInit();

    if (g_bFBODisplay) {
        if (!glewIsSupported( "GL_VERSION_2_0 GL_ARB_fragment_program GL_EXT_framebuffer_object" )) {
            fprintf(stderr, "Error: failed to get minimal extensions for demo\n");
            fprintf(stderr, "This sample requires:\n");
            fprintf(stderr, "  OpenGL version 2.0\n");
            fprintf(stderr, "  GL_ARB_fragment_program\n");
            fprintf(stderr, "  GL_EXT_framebuffer_object\n");
            cutilDeviceReset();
            shrQAFinishExit(*argc, (const char **)argv, QA_WAIVED);
        } 
    } else {
        if (!glewIsSupported( "GL_VERSION_1_5 GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object" )) {
            fprintf(stderr, "Error: failed to get minimal extensions for demo\n");
            fprintf(stderr, "This sample requires:\n");
            fprintf(stderr, "  OpenGL version 1.5\n");
            fprintf(stderr, "  GL_ARB_vertex_buffer_object\n");
            fprintf(stderr, "  GL_ARB_pixel_buffer_object\n");
            cutilDeviceReset();
            shrQAFinishExit(*argc, (const char **)argv, QA_WAIVED);
        }
    }
}
void initializeData(char *file, int argc, char **argv) {
    GLint bsize;
    unsigned int w, h;
    size_t file_length= strlen(file);

    if (!strcmp(&file[file_length-3], "pgm")) {
        if (cutLoadPGMub(file, &pixels, &w, &h) != CUTTrue) {
            printf("Failed to load image file: %s\n", file);
            exit(-1);
        }
        g_Bpp = 1;
    } else if (!strcmp(&file[file_length-3], "ppm")) {
        if (cutLoadPPM4ub(file, &pixels, &w, &h) != CUTTrue) {
            printf("Failed to load image file: %s\n", file);
            exit(-1);
        }
        g_Bpp = 4;
    } else {
        cutilDeviceReset();
        shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }
    imWidth = (int)w; imHeight = (int)h;
    setupTexture(imWidth, imHeight, pixels, g_Bpp);

    // copy function pointer tables to host side for later use
    setupFunctionTables();

    memset(pixels, 0x0, g_Bpp * sizeof(Pixel) * imWidth * imHeight);

    if (!g_bQAReadback) {
        // use OpenGL Path
        glGenBuffers(1, &pbo_buffer);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); 
        glBufferData(GL_PIXEL_UNPACK_BUFFER, 
                        g_Bpp * sizeof(Pixel) * imWidth * imHeight, 
                        pixels, GL_STREAM_DRAW);  

        glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &bsize); 
        if ((GLuint)bsize != (g_Bpp * sizeof(Pixel) * imWidth * imHeight)) {
            printf("Buffer object (%d) has incorrect size (%d).\n", (unsigned)pbo_buffer, (unsigned)bsize);
            cutilDeviceReset();
            shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
        }

        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

        // register this buffer object with CUDA
        cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo_buffer, cudaGraphicsMapFlagsWriteDiscard));	

        glGenTextures(1, &texid);
        glBindTexture(GL_TEXTURE_2D, texid);
        glTexImage2D(GL_TEXTURE_2D, 0, ((g_Bpp==1) ? GL_LUMINANCE : GL_BGRA), 
                    imWidth, imHeight,  0, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL);
        glBindTexture(GL_TEXTURE_2D, 0);

        glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
        glPixelStorei(GL_PACK_ALIGNMENT, 1);
    }
}
void runAutoTest(int argc, char **argv)
{
	int devID = 0;
    printf("[%s] - (automated testing w/ readback)\n", sSDKsample);

	devID = cutilChooseCudaDevice(argc, argv);

    // First load the image, so we know what the size of the image (imageW and imageH)
    printf("Allocating host and CUDA memory and loading image file...\n");
    const char *image_path = cutFindFilePath("portrait_noise.bmp", argv[0]);
    if (image_path == NULL) {
       printf( "imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n");
       shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
    }
    LoadBMPFile(&h_Src, &imageW, &imageH, image_path);
    printf("Data init done.\n");

    cutilSafeCall( CUDA_MallocArray(&h_Src, imageW, imageH) );

    g_CheckRender       = new CheckBackBuffer(imageW, imageH, sizeof(TColor), false);
    g_CheckRender->setExecPath(argv[0]);

    TColor *d_dst = NULL;
    cutilSafeCall( cudaMalloc( (void **)&d_dst, imageW*imageH*sizeof(TColor)) );

    while (g_Kernel <= 3) {
        printf("[AutoTest]: %s <%s>\n", sSDKsample, filterMode[g_Kernel]);
        cutilSafeCall( CUDA_Bind2TextureArray()                      );
        runImageFilters(d_dst);
        cutilSafeCall( CUDA_UnbindTexture()     );
        cutilSafeCall( cutilDeviceSynchronize() );
        cudaMemcpy(g_CheckRender->imageData(), d_dst, imageW*imageH*sizeof(TColor), cudaMemcpyDeviceToHost);
        g_CheckRender->savePPM(sOriginal[g_Kernel], true, NULL);

        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Kernel++;
    }

    cutilSafeCall( CUDA_FreeArray() );
    free(h_Src);

    cutilSafeCall( cudaFree( d_dst ) );
    delete g_CheckRender;

	printf("\n[%s] -> Test Results: %d errors\n", sSDKsample, g_TotalErrors);

	cutilDeviceReset();
	shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED));
}
void checkDeviceMeetComputeSpec( int argc, char **argv )
{
	int device = 0;
	cudaGetDevice( &device );
	if( checkCUDAProfile( device, MIN_RUNTIME_VERSION, MIN_COMPUTE_VERSION) ) {
	   fprintf(stderr,"\nCUDA Capable Device %d, meets minimum required specs.\n", device );
	} else {
       fprintf(stderr, "\nNo configuration with minimum compute capabilities found.  Exiting...\n");
       fprintf(stderr, "This sample requires:\n");
       fprintf(stderr, "\tCUDA Compute Capability >= %d.%d is required\n", MIN_COMPUTE_VERSION/16, MIN_COMPUTE_VERSION%16);
       fprintf(stderr, "\tCUDA Runtime Version    >= %d.%d is required\n", MIN_RUNTIME_VERSION/1000, (MIN_RUNTIME_VERSION%100)/10);
       cutilDeviceReset();
       shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
	}
}
Example #5
0
void initGL(int *argc, char** argv)
{  
    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    glutInit(argc, argv);
    glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
    glutInitWindowSize(720, 480);
    glutCreateWindow("CUDA n-body system");
    if (bFullscreen)
        glutFullScreen();
    GLenum err = glewInit();

    if (GLEW_OK != err)
    {
        shrLog("GLEW Error: %s\n", glewGetErrorString(err));
        cutilDeviceReset();
        exit(-1);
    }
    else if (!glewIsSupported("GL_VERSION_2_0 "
                         "GL_VERSION_1_5 "
			             "GL_ARB_multitexture "
                         "GL_ARB_vertex_buffer_object")) 
    {
        fprintf(stderr, "Required OpenGL extensions missing.");
        exit(-1);
    }
    else
    {
#if   defined(WIN32)
        wglSwapIntervalEXT(0);
#elif defined(LINUX)
        glxSwapIntervalSGI(0);
#endif      
    }

    glEnable(GL_DEPTH_TEST);
    glClearColor(0.0, 0.0, 0.0, 1.0);
   
    checkGLErrors("initGL");
}
Example #6
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
    pArgc = &argc;
    pArgv = argv;

    shrQAStart(argc, argv);
    // start logs
    shrSetLogFileName ("boxFilter.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (argc > 1) {
        cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads );
        cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius);

        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify"))
        {
            g_bOpenGLQA = true;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) {
            g_bFBODisplay = true;
        }
    }

    // load image to process
    loadImageData(argc, argv);

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
            cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt"))
    {
        // Running CUDA kernel (boxFilter) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv);
        shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
    }
    else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark"))
    {
        // Running CUDA kernels (boxfilter) in Benchmarking mode
        runBenchmark(argc, argv);
        shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
    }
    else
    {
        // Running CUDA kernels (boxFilter) with OpenGL visualization
        if (g_bFBODisplay) shrLog("[FBO Display] ");
        if (g_bOpenGLQA)   shrLog("[OpenGL Readback Comparisons] ");
        shrLog("\n");

        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
            printf("   This SDK does not explicitly support -device=n when running with OpenGL.\n");
            printf("   When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n");
            printf("   See details below to run without OpenGL:\n\n");
            printf(" > %s -device=n -qatest\n\n", argv[0]);
            printf("exiting...\n");
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );
        int dev = findCapableDevice(argc, argv);
        if( dev != -1 ) {
            cudaGLSetGLDevice( dev );
        } else {
            cutilDeviceReset();
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }

        // Now we can create a CUDA context and bind it to the OpenGL context
        initCuda();
        initGLResources();

        if (g_bOpenGLQA) {
            if (g_bFBODisplay) {
                g_CheckRender = new CheckFBO(width, height, 4, g_FrameBufferObject);
            } else {
                g_CheckRender = new CheckBackBuffer(width, height, 4);
            }
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }
    }

    // sets the callback function so it will call cleanup upon exit
    atexit(cleanup);

    shrLog("Running Standard Demonstration with GLUT loop...\n\n");
    shrLog("Press '+' and '-' to change filter width\n"
           "Press ']' and '[' to change number of iterations\n\n");

    // Main OpenGL loop that will run visualization for every vsync
    glutMainLoop();

    cutilDeviceReset();
    shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
}
Example #7
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv){
    uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    uint hTimer;

    const uint   N = 4 * 1048576;
    const uint DIR = 1;
    const uint numValues = 65536;

	shrQAStart(argc, argv);

    printf("Allocating and initializing host arrays...\n\n");
        cutCreateTimer(&hTimer);
        h_SrcKey = (uint *)malloc(N * sizeof(uint));
        h_SrcVal = (uint *)malloc(N * sizeof(uint));
        h_DstKey = (uint *)malloc(N * sizeof(uint));
        h_DstVal = (uint *)malloc(N * sizeof(uint));

        srand(2009);
        for(uint i = 0; i < N; i++)
            h_SrcKey[i] = rand() % numValues;
        fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
        cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) );
        cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) );

    printf("Initializing GPU merge sort...\n");
        initMergeSort();

    printf("Running GPU merge sort...\n");
        cutilSafeCall( cutilDeviceSynchronize() );
        cutResetTimer(hTimer);
        cutStartTimer(hTimer);
            mergeSort(
                d_DstKey,
                d_DstVal,
                d_BufKey,
                d_BufVal,
                d_SrcKey,
                d_SrcVal,
                N,
                DIR
            );
        cutilSafeCall( cutilDeviceSynchronize() );
        cutStopTimer(hTimer);
    printf("Time: %f ms\n", cutGetTimerValue(hTimer));

    printf("Reading back GPU merge sort results...\n");
        cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) );
        cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) );

    printf("Inspecting the results...\n");
        uint keysFlag = validateSortedKeys(
            h_DstKey,
            h_SrcKey,
            1,
            N,
            numValues,
            DIR
        );

        uint valuesFlag = validateSortedValues(
            h_DstKey,
            h_DstVal,
            h_SrcKey,
            1,
            N
        );

    printf("Shutting down...\n");
        closeMergeSort();
        cutilCheckError( cutDeleteTimer(hTimer) );
        cutilSafeCall( cudaFree(d_SrcVal) );
        cutilSafeCall( cudaFree(d_SrcKey) );
        cutilSafeCall( cudaFree(d_BufVal) );
        cutilSafeCall( cudaFree(d_BufKey) );
        cutilSafeCall( cudaFree(d_DstVal) );
        cutilSafeCall( cudaFree(d_DstKey) );
        free(h_DstVal);
        free(h_DstKey);
        free(h_SrcVal);
        free(h_SrcKey);
        cutilDeviceReset();

        shrQAFinishExit(argc, (const char **)argv, (keysFlag && valuesFlag) ? QA_PASSED : QA_FAILED);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv) 
{
    shrQAStart(argc, argv);

    // start logs
    shrSetLogFileName ("bilateralFilter.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads );
    cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius);

	// load image to process
    loadImageData(argc, argv);

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
    {
		// Running CUDA kernel (bilateralFilter) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv);
	    shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
	} 
	else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark")) 
	{
		// Running CUDA kernel (bilateralFilter) in Benchmarking Mode
        runBenchmark(argc, argv);
	    shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
	} 
	else 
	{	
		// Running CUDA kernel (bilateralFilter) in CUDA + OpenGL Visualization Mode
        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
             printf("[%s]\n", argv[0]);
             printf("   Does not explicitly support -device=n in OpenGL mode\n");
             printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
             printf(" > %s -device=n -qatest\n", argv[0]);
             printf("exiting...\n");
             exit(0);
        }

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( argc, argv );

        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() );
        }
        initCuda();
        initOpenGL();
    }

    atexit(cleanup);

    printf("Running Standard Demonstration with GLUT loop...\n\n");
    printf("Press '+' and '-' to change number of iterations\n"
        "Press LEFT and RIGHT change euclidean delta\n"
        "Press UP and DOWN to change gaussian delta\n"
        "Press '1' to show original image\n"
        "Press '2' to show result\n\n");

    glutMainLoop();

    cutilDeviceReset();
	shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
}
int main(int argc, char** argv) 
{
	pArgc = &argc;
	pArgv = argv;

	shrQAStart(argc, argv);

    if (argc > 1) {
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) {
            printHelp();
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
            cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt"))
        {
            g_bQAReadback = true;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify"))
		{
            g_bOpenGLQA = true;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) {
            g_bFBODisplay = true;
            fpsLimit = frameCheckNumber;
        }
    }
	

    if (g_bQAReadback) 
    {
        runAutoTest(argc, argv);
    } 
    else 
    {
        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
             printf("   This SDK does not explicitly support -device=n when running with OpenGL.\n");
             printf("   When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n");
             printf("   See details below to run without OpenGL:\n\n");
             printf(" > %s -device=n -qatest\n\n", argv[0]);
             printf("exiting...\n");
             shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
        }

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

        //cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() );
        int dev = findCapableDevice(argc, argv);
        if( dev != -1 ) {
            cudaGLSetGLDevice( dev );
        } else {
            shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED);
        }

        cutilCheckError(cutCreateTimer(&timer));
        cutilCheckError(cutResetTimer(timer));  
     
        glutDisplayFunc(display);
        glutKeyboardFunc(keyboard);
        glutReshapeFunc(reshape);

        if (g_bOpenGLQA) {
            loadDefaultImage( argc, argv );
        }

        if (argc > 1) {
            char *filename;
            if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) {
                initializeData(filename, argc, argv);
            }
        } else {
            loadDefaultImage( argc, argv );
        }


        // If code is not printing the USage, then we execute this path.
        if (!bQuit) {
            if (g_bOpenGLQA) {
                g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4);
                g_CheckRender->setPixelFormat(GL_BGRA);
                g_CheckRender->setExecPath(argv[0]);
                g_CheckRender->EnableQAReadback(true);
            }

            printf("I: display Image (no filtering)\n");
            printf("T: display Sobel Edge Detection (Using Texture)\n");
            printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n");
            printf("Use the '-' and '=' keys to change the brightness.\n");
			printf("b: switch block filter operation (mean/Sobel)\n");
			printf("p: switch point filter operation (threshold on/off)\n");
            fflush(stdout);
            atexit(cleanup); 
            glutTimerFunc(REFRESH_DELAY, timerEvent,0);
            glutMainLoop();
        }
    }

    cutilDeviceReset();
    shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
void runAutoTest(int argc, char **argv)
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) 
    {
       int device = cutilDeviceInit(argc, argv);
       if (device < 0) {
            printf("No CUDA Capable devices found, exiting...\n");
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
       }
	   checkDeviceMeetComputeSpec( argc, argv );
    } else {
       int dev = findCapableDevice(argc, argv);
       if( dev != -1 ) 
          cudaSetDevice( dev );
       else {
          cutilDeviceReset();
		  shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED);
       }
    }

    loadDefaultImage( argc, argv );

    if (argc > 1) {
        char *filename;
        if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) {
            initializeData(filename, argc, argv);
        }
    } else {
        loadDefaultImage( argc, argv );
    }

    g_CheckRender       = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false);
    g_CheckRender->setExecPath(argv[0]);

    Pixel *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) );

    while (g_SobelDisplayMode <= 2) 
    {
        printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);

        sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp );

        cutilSafeCall( cutilDeviceSynchronize() );

        cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost);

        g_CheckRender->savePGM(sOriginal[g_Index], false, NULL);

        if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Index++;
        g_SobelDisplayMode = (SobelDisplayMode)g_Index;
    }

    cutilSafeCall( cudaFree( d_result ) );
    delete g_CheckRender;

    shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED) );
}
void shutDown(unsigned char k, int /*x*/, int /*y*/)
{
    switch (k){
        case '\033':
        case 'q':
        case 'Q':
            printf("Shutting down...\n");
                cutilCheckError( cutStopTimer(hTimer)   );
                cutilCheckError( cutDeleteTimer(hTimer) );
				// DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(gl_PBO) );
				cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, gl_PBO, 
									   cudaGraphicsMapFlagsWriteDiscard));
                glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
                glDeleteBuffers(1, &gl_PBO);
                glDeleteTextures(1, &gl_Tex);

                cutilSafeCall( CUDA_FreeArray() );
                free(h_Src);
            printf("Shutdown done.\n");

            cutilDeviceReset();
            exit(0);
        break;

        case '1':
            printf("Passthrough.\n");
            g_Kernel = 0;
        break;

        case '2':
            printf("KNN method \n");
            g_Kernel = 1;
        break;

        case '3':
            printf("NLM method\n");
            g_Kernel = 2;
        break;

        case '4':
            printf("Quick NLM(NLM2) method\n");
            g_Kernel = 3;
        break;

        case ' ':
            printf(g_Diag ? "LERP highlighting mode.\n" : "Normal mode.\n");
            g_Diag = !g_Diag;
        break;

        case 'n':
            printf("Decrease noise level.\n");
            knnNoise -= noiseStep;
            nlmNoise -= noiseStep;
        break;

        case 'N':
            printf("Increase noise level.\n");
            knnNoise += noiseStep;
            nlmNoise += noiseStep;
        break;

        case 'l':
            printf("Decrease LERP quotent.\n");
            lerpC = MAX(lerpC - lerpStep, 0.0f);
        break;

        case 'L':
            printf("Increase LERP quotent.\n");
            lerpC = MIN(lerpC + lerpStep, 1.0f);
        break;

        case 'f' : case 'F':
            g_FPS = true;
        break;

        case '?':
            printf("lerpC = %5.5f\n", lerpC);
            printf("knnNoise = %5.5f\n", knnNoise);
            printf("nlmNoise = %5.5f\n", nlmNoise);
        break;
    }
}
Example #12
0
//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    bool bTestResults = true;

    shrQAStart(argc, argv);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "help") ) {
        showHelp();
        return 0;
    }

    shrLog("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n");
	shrLog("\t-fullscreen (run n-body simulation in fullscreen mode)\n");
	shrLog("\t-fp64       (use double precision floating point values for simulation)\n");
    shrLog("\t-numdevices=N (use first N CUDA devices for simulation)\n");
//    shrLog("\t-hostmem  (stores simulation data in host memory)\n");
//    shrLog("\t-cpu      (performs simulation on the host)\n");
    shrLog("\n");

	bFullscreen  = (cutCheckCmdLineFlag(argc, (const char**) argv, "fullscreen") != 0);
    if (bFullscreen)
        bShowSliders = false;

    benchmark    = (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0);

    compareToCPU = ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) ||
                   (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest")  != 0));

    QATest       = (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest")  != 0);

    useHostMem   = (cutCheckCmdLineFlag(argc, (const char**) argv, "hostmem") != 0);

    fp64         = (cutCheckCmdLineFlag(argc, (const char**) argv, "fp64") != 0);

    flopsPerInteraction = fp64 ? 30 : 20;

    useCpu       = (cutCheckCmdLineFlag(argc, (const char**) argv, "cpu") != 0);

    cutGetCmdLineArgumenti(argc, (const char**) argv, "numdevices", &numDevsRequested);

    // for multi-device we currently require using host memory -- the devices share
    // data via the host 
    if (numDevsRequested > 1)
        useHostMem = true;

    int numDevsAvailable = 0;
    bool customGPU = false;
    cudaGetDeviceCount(&numDevsAvailable);

    if (numDevsAvailable < numDevsRequested) {
        shrLog("Error: only %d Devices available, %d requested.  Exiting.\n", numDevsAvailable, numDevsRequested);
        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
    }

	shrLog("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed");
	shrLog("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video" );
	shrLog("> %s precision floating point simulation\n", fp64 ? "Double" : "Single");
    shrLog("> %d Devices used for simulation\n", numDevsRequested);

    int devID;
    cudaDeviceProp props;

    // Initialize GL and GLUT if necessary
    if (!benchmark && !compareToCPU) {
        initGL(&argc, argv);
        initParameters();
    }
    
    if (useCpu) {
        useHostMem = true;
        compareToCPU = false;
        bSupportDouble = true;

#ifdef OPENMP
        shrLog("> Simulation with CPU using OpenMP\n");
#else
        shrLog("> Simulation with CPU\n");
#endif
    }
    else
    {
        // Now choose the CUDA Device
        // Either without GL interop:
        if (benchmark || compareToCPU || useHostMem) 
        {
            // Note if we are using host memory for the body system, we
            // don't use CUDA-GL interop.

            if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) 
            {
                devID = cutilDeviceInit(argc, argv);
                if (devID < 0) {
                   printf("exiting...\n");
                   shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
                }
                customGPU = true;
            } 
            else 
            {
                devID = cutGetMaxGflopsDeviceId();
                cudaSetDevice( devID );
            }
        } 
        else // or with GL interop:
        {    	
            if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
                cutilGLDeviceInit(argc, argv);
                customGPU = true;
            } else {
                devID = cutGetMaxGflopsDeviceId();
                cudaGLSetGLDevice( devID );
            }
        }

        cutilSafeCall(cudaGetDevice(&devID));
        cutilSafeCall(cudaGetDeviceProperties(&props, devID));

        bSupportDouble = true;

#if CUDART_VERSION < 4000
        if (numDevsRequested > 1)
        {
            shrLog("MultiGPU n-body requires CUDA 4.0 or later\n");
            cutilDeviceReset();
            shrQAFinishExit(argc, (const char**)argv, QA_PASSED);
        }
#endif

        // Initialize devices
        if(numDevsRequested > 1 && customGPU)
        {
            printf("You can't use --numdevices and --device at the same time.\n");
            shrQAFinishExit(argc, (const char**)argv, QA_PASSED);
        }

        if(customGPU) { 
            cudaDeviceProp props;
            cutilSafeCall(cudaGetDeviceProperties(&props, devID));
            shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);
        }
        else
        {
            for (int i = 0; i < numDevsRequested; i++)
            {
                cudaDeviceProp props;
                cutilSafeCall(cudaGetDeviceProperties(&props, i));
            
                shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);

                if (useHostMem)
                {
#if CUDART_VERSION >= 2020
                    if(!props.canMapHostMemory)
                    {
                        fprintf(stderr, "Device %d cannot map host memory!\n", devID);
                        cutilDeviceReset();
                        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
                    }
                    if (numDevsRequested > 1)
                        cutilSafeCall(cudaSetDevice(i));
                    cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
                    fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n");
                    cutilDeviceReset();
                    shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
#endif
                }
            }

            // CC 1.2 and earlier do not support double precision
            if (props.major*10 + props.minor <= 12)
                bSupportDouble = false;
        }

        //if(numDevsRequested > 1)
        //    cutilSafeCall(cudaSetDevice(devID));

        if (fp64 && !bSupportDouble) {
            fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n");
            cutilDeviceReset();
            shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
        }
    }
	
    numIterations = 0;
    p = 0;
    q = 1;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q);

    if (p == 0) // p not set on command line
    {
        p = 256;
        if (q * p > 256)
        {
            p = 256 / q;
            shrLog("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256);
        }
    }

    // default number of bodies is #SMs * 4 * CTA size
    if (useCpu)
#ifdef OPENMP
        numBodies = 8192;
#else
        numBodies = 4096;
#endif
    else if (numDevsRequested == 1)
Example #13
0
// commented out to remove unused parameter warnings in Linux
void key(unsigned char key, int /*x*/, int /*y*/)
{
    switch (key) 
    {
    case ' ':
        bPause = !bPause;
        break;
    case 27: // escape
    case 'q':
    case 'Q':
        cutilDeviceReset();
        exit(0); 
        break;
    case 13: // return
        if (bSupportDouble)
        {
            if (fp64)
                switchDemoPrecision<float, double>();
            else
                switchDemoPrecision<double, float>();

			shrLog("> %s precision floating point simulation\n", fp64 ? "Double" : "Single");
        }

		break;
    case '`':
        bShowSliders = !bShowSliders;
        break;
	case 'g':
	case 'G':
		bDispInteractions = !bDispInteractions;
		break;
    case 'p':
    case 'P':
        displayMode = (ParticleRenderer::DisplayMode)
                      ((displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES);
        break;
    case 'c':
    case 'C':
        cycleDemo = !cycleDemo;
        shrLog("Cycle Demo Parameters: %s\n", cycleDemo ? "ON" : "OFF");
        break;
    case '[':
        activeDemo = (activeDemo == 0) ? numDemos - 1 : (activeDemo - 1) % numDemos;
        selectDemo(activeDemo);
        break;
    case ']':
        activeDemo = (activeDemo + 1) % numDemos;
        selectDemo(activeDemo);
        break;
    case 'd':
    case 'D':
        displayEnabled = !displayEnabled;
        break;
    case 'o':
    case 'O':
        activeParams.print();
        break;
    case '1':
        if (fp64)
            NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL);
        else
            NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL);
        break;
    case '2':
        if (fp64)
            NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_RANDOM);
        else
            NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_RANDOM);
        break;
    case '3':
        if (fp64)
            NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_EXPAND);
        else
            NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_EXPAND);
        break;
    }

    glutPostRedisplay();
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    shrQAStart( argc, argv );
    shrSetLogFileName ("reduction.txt");

		char *reduceMethod;
    cutGetCmdLineArgumentstr( argc, (const char**) argv, "method", &reduceMethod);
    
    char *typeChoice;
    cutGetCmdLineArgumentstr( argc, (const char**) argv, "type", &typeChoice);

    if (0 == typeChoice)
    {
        typeChoice = (char*)malloc(4 * sizeof(char));
        strcpy(typeChoice, "int");
    }

    ReduceType datatype = REDUCE_INT;

    if (!strcasecmp(typeChoice, "float"))
        datatype = REDUCE_FLOAT;
    else if (!strcasecmp(typeChoice, "double"))
        datatype = REDUCE_DOUBLE;
    else
        datatype = REDUCE_INT;

    cudaDeviceProp deviceProp;
    deviceProp.major = 1;
    deviceProp.minor = 0;
    int minimumComputeVersion = 10;

    if (datatype == REDUCE_DOUBLE)
    {
        deviceProp.minor = 3;
        minimumComputeVersion = 13;
    }

    int dev;

		if(!cutCheckCmdLineFlag(argc, (const char**)argv, "method") )
		{
				fprintf(stderr, "MISSING --method FLAG.\nYou must provide --method={ SUM | MIN | MAX }.\n");
				exit(1);
		}

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) 
    {
        cutilDeviceInit(argc, argv);
        cutilSafeCallNoSync(cudaGetDevice(&dev));
    } 
    else
    {
        cutilSafeCallNoSync(cudaChooseDevice(&dev, &deviceProp));
        
    }

    cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));

    if((deviceProp.major * 10 + deviceProp.minor) >= minimumComputeVersion)
    {
        shrLog("Using Device %d: %s\n\n", dev, deviceProp.name);
        cutilSafeCallNoSync(cudaSetDevice(dev));
    }
    else 
    {
        shrLog("Error: the selected device does not support the minimum compute capability of %d.%d.\n\n",
            minimumComputeVersion / 10, minimumComputeVersion % 10);

        cutilDeviceReset();
        shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }   

    shrLog("Reducing array of type %s\n\n", typeChoice);

	bool bResult = false;

    switch (datatype)
    {
    default:
    case REDUCE_INT:
				if (strcmp("SUM", reduceMethod) == 0) {
					bResult = runTestSum<int>( argc, argv, datatype);
				} else if ( strcmp("MAX", reduceMethod) == 0 ) {
					bResult = runTestMax<int>( argc, argv, datatype);
				} else if ( strcmp("MIN", reduceMethod) == 0 ) {
					bResult = runTestMin<int>( argc, argv, datatype);
				} else {
					fprintf(stderr, "No --method specified!\n");
					exit(1);
				}
        break;
    case REDUCE_FLOAT:
				if (strcmp("SUM", reduceMethod) == 0) {
					bResult = runTestSum<float>( argc, argv, datatype);
				} else if ( strcmp("MAX", reduceMethod) == 0 ) {
					bResult = runTestMax<float>( argc, argv, datatype);
				} else if ( strcmp("MIN", reduceMethod) == 0 ) {
					bResult = runTestMin<float>( argc, argv, datatype);
				} else {
					fprintf(stderr, "No --method specified!\n");
					exit(1);
				}
        break;
    case REDUCE_DOUBLE:
				if (strcmp("SUM", reduceMethod) == 0) {
					bResult = runTestSum<double>( argc, argv, datatype);
				} else if ( strcmp("MAX", reduceMethod) == 0 ) {
					bResult = runTestMax<double>( argc, argv, datatype);
				} else if ( strcmp("MIN", reduceMethod) == 0 ) {
					bResult = runTestMin<double>( argc, argv, datatype);
				} else {
					fprintf(stderr, "No --method specified!\n");
					exit(1);
				}
        break;
    }
    
    cutilDeviceReset();
	shrQAFinishExit(argc, (const char**)argv, (bResult ? QA_PASSED : QA_FAILED));
}
int main(int argc, char **argv)
{
    shrQAStart(argc, argv);

    if (argc > 1) {
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
            cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
		{
            g_bQAReadback = true;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify"))
		{
            g_bOpenGLQA = true;
            g_bFBODisplay = false;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) {
            g_bFBODisplay = true;
            fpsLimit = frameCheckNumber;
        }
    }

    if (g_bQAReadback) {
        runAutoTest(argc, argv);
    } else {
        printf("[%s] ", sSDKsample);
        if (g_bFBODisplay) printf("[FBO Display] ");
        if (g_bOpenGLQA)   printf("[OpenGL Readback Comparisons] ");
        printf("\n");

		// use command-line specified CUDA device, otherwise use device with highest Gflops/s
		if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
			printf("[%s]\n", argv[0]);
			printf("   Does not explicitly support -device=n in OpenGL mode\n");
			printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
			printf(" > %s -device=n -qatest\n", argv[0]);
			printf("exiting...\n");
            exit(0);
		}

	    // First load the image, so we know what the size of the image (imageW and imageH)
        printf("Allocating host and CUDA memory and loading image file...\n");
        const char *image_path = cutFindFilePath("portrait_noise.bmp", argv[0]);
        if (image_path == NULL) {
           printf( "imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n");
           shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
        }
        LoadBMPFile(&h_Src, &imageW, &imageH, image_path);
        printf("Data init done.\n");

		// First initialize OpenGL context, so we can properly set the GL for CUDA.
		// This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
		initGL( &argc, argv );
	    cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );

        cutilSafeCall( CUDA_MallocArray(&h_Src, imageW, imageH) );

        initOpenGLBuffers();

        // Creating the Auto-Validation Code
        if (g_bOpenGLQA) {
            if (g_bFBODisplay) {
                g_CheckRender = new CheckFBO(imageW, imageH, 4);
            } else {
                g_CheckRender = new CheckBackBuffer(imageW, imageH, 4);
            }
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(g_bOpenGLQA);
        }
    }

    printf("Starting GLUT main loop...\n");
    printf("Press [1] to view noisy image\n");
    printf("Press [2] to view image restored with knn filter\n");
    printf("Press [3] to view image restored with nlm filter\n");
    printf("Press [4] to view image restored with modified nlm filter\n");
    printf("Press [ ] to view smooth/edgy areas [RED/BLUE] Ct's\n");
    printf("Press [f] to print frame rate\n");
    printf("Press [?] to print Noise and Lerp Ct's\n");
    printf("Press [q] to exit\n");

    glutDisplayFunc(displayFunc);
    glutKeyboardFunc(shutDown);
    cutilCheckError( cutCreateTimer(&hTimer) );
    cutilCheckError( cutStartTimer(hTimer)   );
    glutTimerFunc(REFRESH_DELAY, timerEvent,0);
    glutMainLoop();

    cutilDeviceReset();
    shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
int main()
{
	int width = 512;
	int height = 512;

	// Creation du device
	cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) );

	// Creation des buffers sur CPU
	int * a = new int[width*height];
	int * b = new int[width*height];
	int * res = new int[width*height];
	for(int i = 0; i < width*height; i++)
	{
		a[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100);
		b[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100);
	}

	// Allocation des objects on the device
	// *** data
	unsigned int size = width * height * sizeof(int);
	std::cout << "Allocation d'un buffer de taille " << width*height << "\n";
	int* d_a = NULL;
	int* d_b = NULL;
	int* d_res = NULL;
    cutilSafeCall( cudaMalloc( (void**) &d_a, size));
	cutilSafeCall( cudaMalloc( (void**) &d_b, size));
	cutilSafeCall( cudaMalloc( (void**) &d_res, size));

	// Copy des donnees
	cutilSafeCall( cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
	cutilSafeCall( cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // Lancer le calcul
	std::cout << "Lancer le kernel ... \n";
	runKernel(d_a, d_b, d_res, width*height);
	cutilSafeCall( cutilDeviceSynchronize() );

	// Copie DeviceHost
	cutilSafeCall( cudaMemcpy(res, d_res, size, cudaMemcpyDeviceToHost));

	// Verification du test
	int i = 0;
	for(;i < width*height;i++)
		if(res[i] != a[i] + b[i])
			std::cout << "Error : [" << i << "] " << res[i] << " != " << a[i] << " + " << b[i] << std::endl;


	// Liberation des ressources
	// *** Device
	cutilSafeCall(cudaFree(d_a));
	cutilSafeCall(cudaFree(d_b));
	cutilSafeCall(cudaFree(d_res));
	// *** CPU
	delete[] res;
	delete[] a;
	delete[] b;

	// Close device
	cutilDeviceReset();
	std::cout << "Test result : " << ((i == width*height) ? "Succes" : "Error" ) <<  std::endl;
	std::cout.flush();

	return 0;
}
Example #17
0
int main(int argc, char **argv)
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint hTimer;
    int PassFailFlag = 1;
    uint byteCount = 64 * 1048576;
    uint uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

	shrQAStart(argc, argv);

	// set logfile name and start logs
    shrSetLogFileName ("histogram.txt");

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
           printf("No CUDA Capable Devices found, exiting...\n");
           shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );
        cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
    }
    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );

	printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", 
		deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

	int version = deviceProp.major * 0x10 + deviceProp.minor;

	if(version < 0x11) 
    {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
		shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional Command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult))
    {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

    shrLog("Initializing data...\n");
        shrLog("...allocating CPU memory.\n");
            h_Data         = (uchar *)malloc(byteCount);
            h_HistogramCPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
            h_HistogramGPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

        shrLog("...generating input data\n");
            srand(2009);
            for(uint i = 0; i < byteCount; i++) 
                h_Data[i] = rand() % 256;

        shrLog("...allocating GPU memory and copying input data\n\n");
            cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount  ) );
            cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)  ) );
            cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );

    {
        shrLog("Starting up 64-bin histogram...\n\n");
            initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram64(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); 

        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram64CPU()\n");
               histogram64CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results...\n");
                for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 64-bin histogram...\n\n\n");
            closeHistogram64();
    }

    {
        shrLog("Initializing 256-bin histogram...\n");
            initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram256(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); 
                
        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram256CPU()\n");
                histogram256CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results\n");
                for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 256-bin histogram...\n\n\n");
            closeHistogram256();
    }

    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        cutilSafeCall( cudaFree(d_Histogram) );
        cutilSafeCall( cudaFree(d_Data) );
        free(h_HistogramGPU);
        free(h_HistogramCPU);
        free(h_Data);

    cutilDeviceReset();
	shrLog("%s - Test Summary\n", sSDKsample);
    // pass or fail (for both 64 bit and 256 bit histograms)
    shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED));
}
int main()
{
	int width = 800;
	int height = 600;

	int mesh_width = 256;
	int mesh_height = 256;

	// Creation du device
	cutilSafeCall( cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ) );

	// Creation d'une fenetre
	C3::Window OpenGLWin;
	OpenGLWin.Create(C3::WindowMode(width,height),"CudaC3");
	
	// Glew init
	GLenum err = glewInit();
	if(err != GLEW_OK)
		std::cout << "Error on GLEW initialization.\n";


	// Configuration OpenGL
	glClearColor(0.0, 0.0, 0.0, 1.0);
	glDisable(GL_DEPTH_TEST);
	glViewport(0, 0, width, height);
	glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    gluPerspective(60.0, (GLfloat)width / (GLfloat) height, 0.1, 10.0);

	// VBO
	// *** Create
	GLuint vbo;
	glGenBuffers(1, &vbo);
	glBindBuffer(GL_ARRAY_BUFFER, vbo);
	// *** Initialize
	unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
	glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
	glBindBuffer(GL_ARRAY_BUFFER, 0);
	// *** Register in CUDA
	cudaGraphicsResource *cuda_vbo_resource = NULL;
	cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard));


	float g_fAnim = 0.f;
	int nbFrame = 0;
	float timeFPS = 0.f;
	while(OpenGLWin.IsOpened())
	{
		// Events
		C3::Event event;
		while(OpenGLWin.PoolEvent(event))
		{
                        //std::cout << "Event !" << std::endl;
			if(event.Type == C3::Event::Closed)
			{
				std::cout << "Close ... " << std::endl;
				OpenGLWin.Close();
			}
			else if(event.Type == C3::Event::KeyPressed)
			{
				if(event.Key.Code == C3::Key::Escape)
				{
					std::cout << "Close ... " << std::endl;
					OpenGLWin.Close();
				}
			}
		}

		// Mise a jour du temps
		g_fAnim += OpenGLWin.GetFrameTime() / 1000.f;
		timeFPS += OpenGLWin.GetFrameTime() / 1000.f;
		nbFrame++;
		if(timeFPS > 1.0f)
		{
			std::stringstream ss;
			ss << "CudaC3 [" << (int)ceil( nbFrame / timeFPS ) << " FPS]";
			OpenGLWin.SetTitle(ss.str());
			timeFPS = 0.f;
			nbFrame = 0;
		}
		
		// Draw the scene
		glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );

		// Lancer le calcul CUDA
		// *** map OpenGL buffer object for writing from CUDA
		float4 *dptr;
		cutilSafeCall(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0));
		size_t num_bytes; 
		cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, cuda_vbo_resource));
		// *** Run kernel
		runKernel(dptr, mesh_width, mesh_height,g_fAnim);
		cutilSafeCall( cutilDeviceSynchronize() );
		// *** Unmap
		cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0));
		
		// OpenGL
		// *** Make some transformation
		glMatrixMode(GL_MODELVIEW);
		glLoadIdentity();
		glTranslatef(0.0, 0.0, -3.0);
		glRotatef(0.0, 1.0, 0.0, 0.0);
		glRotatef(0.0, 0.0, 1.0, 0.0);
		// *** Render VBO
		// --- Bind
		glBindBuffer(GL_ARRAY_BUFFER, vbo);
		glVertexPointer(4, GL_FLOAT, 0, 0);
		// --- Draw
		glEnableClientState(GL_VERTEX_ARRAY);
		glColor3f(1.0, 0.0, 0.0);
		glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
		glDisableClientState(GL_VERTEX_ARRAY);

		// Swap buffers
		OpenGLWin.Display();
	}

	// Liberation des ressources
	cudaGraphicsUnregisterResource(cuda_vbo_resource);
	
	glBindBuffer(1, vbo);
	glDeleteBuffers(1, &vbo);

	// Close device
	cutilDeviceReset();

	return 0;
}