void initGL(int *argc, char** argv)
{
    glutInit( argc, argv );    
    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
    glutInitWindowSize(wWidth, wHeight);
    glutCreateWindow("Cuda Edge Detection");

    glewInit();

    if (g_bFBODisplay) {
        if (!glewIsSupported( "GL_VERSION_2_0 GL_ARB_fragment_program GL_EXT_framebuffer_object" )) {
            fprintf(stderr, "Error: failed to get minimal extensions for demo\n");
            fprintf(stderr, "This sample requires:\n");
            fprintf(stderr, "  OpenGL version 2.0\n");
            fprintf(stderr, "  GL_ARB_fragment_program\n");
            fprintf(stderr, "  GL_EXT_framebuffer_object\n");
            cudaThreadExit();
            cutilExit(*argc, argv);
        } 
    } else {
        if (!glewIsSupported( "GL_VERSION_1_5 GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object" )) {
            fprintf(stderr, "Error: failed to get minimal extensions for demo\n");
            fprintf(stderr, "This sample requires:\n");
            fprintf(stderr, "  OpenGL version 1.5\n");
            fprintf(stderr, "  GL_ARB_vertex_buffer_object\n");
            fprintf(stderr, "  GL_ARB_pixel_buffer_object\n");
            cudaThreadExit();
            cutilExit(*argc, argv);
        }
    }
}
void initializeData(char *file, int argc, char **argv) {
    GLint bsize;
    unsigned int w, h;
    size_t file_length= strlen(file);

    if (!strcmp(&file[file_length-3], "pgm")) {
        if (cutLoadPGMub(file, &pixels, &w, &h) != CUTTrue) {
            printf("Failed to load image file: %s\n", file);
            exit(-1);
        }
        g_Bpp = 1;
    } else if (!strcmp(&file[file_length-3], "ppm")) {
        if (cutLoadPPM4ub(file, &pixels, &w, &h) != CUTTrue) {
            printf("Failed to load image file: %s\n", file);
            exit(-1);
        }
        g_Bpp = 4;
    } else {
        cudaThreadExit();
		cutilExit(argc, argv);
    }
    imWidth = (int)w; imHeight = (int)h;
    setupTexture(imWidth, imHeight, pixels, g_Bpp);
	// copy function pointer tables to host side for later use
	setupFunctionTables();

    memset(pixels, 0x0, g_Bpp * sizeof(Pixel) * imWidth * imHeight);

    if (!g_bQAReadback) {
        // use OpenGL Path
        glGenBuffers(1, &pbo_buffer);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); 
        glBufferData(GL_PIXEL_UNPACK_BUFFER, 
                        g_Bpp * sizeof(Pixel) * imWidth * imHeight, 
                        pixels, GL_STREAM_DRAW);  

        glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &bsize); 
        if ((GLuint)bsize != (g_Bpp * sizeof(Pixel) * imWidth * imHeight)) {
            printf("Buffer object (%d) has incorrect size (%d).\n", (unsigned)pbo_buffer, (unsigned)bsize);
            cudaThreadExit();
			cutilExit(argc, argv);
        }

        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

		 // register this buffer object with CUDA
	    cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo_buffer, cudaGraphicsMapFlagsWriteDiscard));	

        glGenTextures(1, &texid);
        glBindTexture(GL_TEXTURE_2D, texid);
        glTexImage2D(GL_TEXTURE_2D, 0, ((g_Bpp==1) ? GL_LUMINANCE : GL_BGRA), 
                    imWidth, imHeight,  0, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL);
        glBindTexture(GL_TEXTURE_2D, 0);

        glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
        glPixelStorei(GL_PACK_ALIGNMENT, 1);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{
    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
	{
        g_bQAReadback = true;
        g_bFBODisplay = false;
        fpsLimit = frameCheckNumber;
    }
    if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify"))
    {
        g_bOpenGLQA = true;
    }
    if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo"))
    {
        g_bFBODisplay = true;
    }

    if (g_bQAReadback) {
        runAutoTest(argc, argv);
    } else {
        runGraphicsTest(argc, argv);
    }

    cutilExit(argc, argv);
}
Beispiel #4
0
// Main program
int main(int argc, char** argv)
{
  // Create the CUTIL timer
  cutilCheckError( cutCreateTimer( &timer));
   
  if (CUTFalse == initGL(argc, argv)) {
    return CUTFalse;
  }
 
  initCuda(argc, argv);
  CUT_CHECK_ERROR_GL();
 
  // register callbacks
  glutDisplayFunc(fpsDisplay);
  glutKeyboardFunc(keyboard);
  glutMouseFunc(mouse);
  glutMotionFunc(motion);
   
  // start rendering mainloop
  glutMainLoop();
   
  // clean up
  cudaThreadExit();
  cutilExit(argc, argv);
}
Beispiel #5
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    runTest(argc, argv);

    cudaThreadExit();

    cutilExit(argc, argv);
}
// initialize OpenGL
void initGL(int *argc, char **argv)
{
    glutInit(argc, argv);
    glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
    glutInitWindowSize(winWidth, winHeight);
    glutCreateWindow("CUDA Smoke Particles");

    glewInit();
    if (!glewIsSupported("GL_VERSION_2_0 GL_VERSION_1_5")) {
        fprintf(stderr, "The following required OpenGL extensions missing:\n\tGL_VERSION_2_0\n\tGL_VERSION_1_5\n");
        fprintf(stderr, "  PASSED\n");
        cutilExit(*argc, argv);
        exit(-1);
    }
    if (!glewIsSupported("GL_ARB_multitexture GL_ARB_vertex_buffer_object GL_EXT_geometry_shader4")) {
        fprintf(stderr, "The following required OpenGL extensions missing:\n\tGL_ARB_multitexture\n\tGL_ARB_vertex_buffer_object\n\tGL_EXT_geometry_shader4.\n");
        fprintf(stderr, "  PASSED\n");
        cutilExit(*argc, argv);
        exit(-1);
    }

#if defined (_WIN32)
    if (wglewIsSupported("WGL_EXT_swap_control")) {
        // disable vertical sync
        wglSwapIntervalEXT(0);
    }
#endif

    glEnable(GL_DEPTH_TEST);

    // load floor texture
    char* imagePath = cutFindFilePath("floortile.ppm", argv[0]);
    if (imagePath == 0) {
        fprintf(stderr, "Error finding floor image file\n");
        fprintf(stderr, "  FAILED\n");
        exit(EXIT_FAILURE);
    }
    floorTex = loadTexture(imagePath);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR_MIPMAP_LINEAR);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 16.0f);

    floorProg = new GLSLProgram(floorVS, floorPS);	

    glutReportErrors();
}
int findCapableDevice(int argc, char **argv)
{
    int dev;
    int bestDev = -1;

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n");
        fprintf(stderr, "\nFAILED\n");
        cudaThreadExit();
        cutilExit(argc, argv);
    }
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0) {
            // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
                fprintf(stderr,"There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                fprintf(stderr,"There is 1 device supporting CUDA\n");
            else
                fprintf(stderr,"There are %d devices supporting CUDA\n", deviceCount);
        }

        if( checkCUDAProfile( dev ) ) {
            fprintf(stderr,"\nFound capable device: %d\n", dev );
            if( bestDev == -1 ) { 
                bestDev = dev;
                fprintf(stderr, "Setting active device to %d\n", bestDev );
            }
        }
    }

    if( bestDev == -1 ) {
        fprintf(stderr, "\nNo configuration with available capabilities was found.  Test has been waived.\n");
        fprintf(stderr, "This sample requires:\n");
        fprintf(stderr, "\tGPU Device Compute   >= 2.0 is required\n");
        fprintf(stderr, "\tCUDA Runtime Version >= 3.1 is required\n");
        fprintf(stderr, "PASSED\n");
    }
    return bestDev;
}
Beispiel #8
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // check for command line arguments
    if (argc > 1) {
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
            cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
		{
            g_bQAReadback = true;
            animate       = false;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) {
            g_bOpenGLQA = true;
            animate = false;
            fpsLimit = frameCheckNumber;
        }
    }

    if (g_bQAReadback) 
    {
        // Automated testing 
        runAutoTest(argc, argv);
    } 
    else 
    {
        printf("[%s]\n\n"
               "Left mouse button          - rotate\n"
               "Middle mouse button        - pan\n"
               "Left + middle mouse button - zoom\n"
               "'w' key                    - toggle wireframe\n", sSDKsample);

        runGraphicsTest(argc, argv);
    }

    cutilExit(argc, argv);
}
int main(int argc, char** argv) 
{
    printf("[%s]\n", sSDKsample);
    if (argc > 1) {
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) {
            printHelp();
        }
		if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		    cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt"))
		{
            g_bQAReadback = true;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify"))
		{
            g_bOpenGLQA = true;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) {
            g_bFBODisplay = true;
            fpsLimit = frameCheckNumber;
        }
    }
	

    if (g_bQAReadback) 
    {
        runAutoTest(argc, argv);
    } 
    else 
    {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

        // use command-line specified CUDA device if possible, otherwise search for capable device
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
			int device;
			cudaGetDevice( &device );
			if( checkCUDAProfile( device ) == false ) {
				cudaThreadExit();
				cutilExit(argc, argv);
			}
        } else {
            //cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() );
			int dev = findCapableDevice(argc, argv);
			if( dev != -1 ) 
				cudaGLSetGLDevice( dev );
			else {
				cudaThreadExit();
				cutilExit(argc, argv);
			}
        }

        cutilCheckError(cutCreateTimer(&timer));
        cutilCheckError(cutResetTimer(timer));  
     
        glutDisplayFunc(display);
        glutKeyboardFunc(keyboard);
        glutReshapeFunc(reshape);
        glutIdleFunc(idle);

        if (g_bOpenGLQA) {
            loadDefaultImage( argc, argv );
        }

        if (argc > 1) {
            char *filename;
            if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) {
                initializeData(filename, argc, argv);
            }
        } else {
            loadDefaultImage( argc, argv );
        }


        // If code is not printing the USage, then we execute this path.
        if (!bQuit) {
            if (g_bOpenGLQA) {
                g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4);
                g_CheckRender->setPixelFormat(GL_BGRA);
                g_CheckRender->setExecPath(argv[0]);
                g_CheckRender->EnableQAReadback(true);
            }

            printf("I: display image\n");
            printf("T: display Sobel edge detection (computed with tex)\n");
            printf("S: display Sobel edge detection (computed with tex+shared memory)\n");
            printf("Use the '-' and '=' keys to change the brightness.\n");
			printf("b: switch block filter operation (mean/Sobel)\n");
			printf("p: swtich point filter operation (threshold on/off)\n");
            fflush(stdout);
            atexit(cleanup); 
            glutMainLoop();
        }
    }

    cudaThreadExit();
    cutilExit(argc, argv);
}
void runAutoTest(int argc, char **argv)
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);

	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
		cutilDeviceInit(argc, argv);
		int device;
		cudaGetDevice( &device );
		if( checkCUDAProfile( device ) == false ) {
			cudaThreadExit();
		    cutilExit(argc, argv);
		}
	} else {
		int dev = findCapableDevice(argc, argv);
		if( dev != -1 ) 
			cudaSetDevice( dev );
		else {
			cudaThreadExit();
		    cutilExit(argc, argv);
		}
	}

    loadDefaultImage( argc, argv );

    if (argc > 1) {
        char *filename;
        if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) {
            initializeData(filename, argc, argv);
        }
    } else {
        loadDefaultImage( argc, argv );
    }

    g_CheckRender       = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false);
    g_CheckRender->setExecPath(argv[0]);

    Pixel *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) );

    while (g_SobelDisplayMode <= 2) 
    {
        printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);

        sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp );

        cutilSafeCall( cudaThreadSynchronize() );

        cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost);

        g_CheckRender->savePGM(sOriginal[g_Index], false, NULL);

        if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Index++;
        g_SobelDisplayMode = (SobelDisplayMode)g_Index;
    }

    cutilSafeCall( cudaFree( d_result ) );
    delete g_CheckRender;

    if (!g_TotalErrors) 
        printf("PASSED\n");
    else 
        printf("FAILED\n");
}
Beispiel #11
0
int main(int argc, char** argv) 
{
	// EDISON //////////////////////////////////////////////////////////////////
	
	sigmaS = 7.0f;
	sigmaR = 6.5f;
	edison.minRegion = 20.0f;


	cutLoadPPMub("image.ppm", &edison.inputImage_, &width, &height);	
	edison.meanShift();

	cutSavePPMub("segmimage.ppm", edison.segmImage_, width, height);
	cutSavePPMub("filtimage.ppm", edison.filtImage_, width, height);
	
	unsigned char data[height * width];
	memset(data, 0, height * width * sizeof(unsigned char));

	for(int i = 0; i < edison.numBoundaries_; i++) {
			data[edison.boundaries_[i]] = 255;
	}
	
	cutSavePGMub("bndyimage.pgm", data, width, height);
	//return 0;
	// EDISON //////////////////////////////////////////////////////////////////
	
	
    if (argc > 1) {
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) {
            printHelp();
        }
		if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		    cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt"))
		{
            g_bQAReadback = true;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify"))
		{
            g_bOpenGLQA = true;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) {
            g_bFBODisplay = true;
            fpsLimit = frameCheckNumber;
        }
    }

    if (g_bQAReadback) {
        runAutoTest(argc, argv);
    } else {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( argc, argv );

        // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() );
        }

        int device;
        struct cudaDeviceProp prop;
        cudaGetDevice( &device );
        cudaGetDeviceProperties( &prop, device );
        if(!strncmp( "Tesla", prop.name, 5 )) {
            printf("This sample needs a card capable of OpenGL and display.\n");
            printf("Please choose a different device with the -device=x argument.\n");
            cudaThreadExit();
            cutilExit(argc, argv);
        }

        cutilCheckError(cutCreateTimer(&timer));
        cutilCheckError(cutResetTimer(timer));  
     
        glutDisplayFunc(display);
        glutKeyboardFunc(keyboard);
        glutReshapeFunc(reshape);
        glutIdleFunc(idle);

        if (g_bOpenGLQA) {
            loadDefaultImage( argv[0] );
        }

        if (argc > 1) {
            char *filename;
            if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) {
                initializeData(filename);
            }
        } else {
            loadDefaultImage( argv[0]);
        }

        // If code is not printing the USage, then we execute this path.
        if (!bQuit) {
            if (g_bOpenGLQA) {
                g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4);
                g_CheckRender->setPixelFormat(GL_BGRA);
                g_CheckRender->setExecPath(argv[0]);
                g_CheckRender->EnableQAReadback(true);
            }

            printf("I: display image\n");
            printf("T: display Sobel edge detection (computed with tex)\n");
            printf("S: display Sobel edge detection (computed with tex+shared memory)\n");
            printf("Use the '-' and '=' keys to change the brightness.\n");
            fflush(stdout);
            atexit(cleanup); 
            glutMainLoop();
        }
    }

    cudaThreadExit();
    cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv){
    const unsigned int OPT_N_MAX = 512;
    unsigned int useDoublePrecision;

    printf("[binomialOptions]\n");

    int devID = cutilDeviceInit(argc, argv);
    if (devID < 0) {
       printf("exiting...\n");
       cutilExit(argc, argv);
       exit(0);
    }

    cutilSafeCall(cudaGetDevice(&devID));
    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devID));

    char *precisionChoice;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL) {
        useDoublePrecision = 0;
    } else {
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }
    printf(useDoublePrecision ? "Using double precision...\n" : "Using single precision...\n");
    const int OPT_N = deviceEmulation() ? 1 : OPT_N_MAX;

    TOptionData optionData[OPT_N_MAX];
    float
        callValueBS[OPT_N_MAX],
        callValueGPU[OPT_N_MAX],
        callValueCPU[OPT_N_MAX];

    double
        sumDelta, sumRef, gpuTime, errorVal;

    unsigned int hTimer;
    int i;

    cutilCheckError( cutCreateTimer(&hTimer) );

    int version = deviceProp.major * 10 + deviceProp.minor;
    if(useDoublePrecision && version < 13){
        printf("Double precision is not supported.\n");
        return 0;
    }

    printf("Generating input data...\n");
        //Generate options set
        srand(123);
        for(i = 0; i < OPT_N; i++){
            optionData[i].S = randData(5.0f, 30.0f);
            optionData[i].X = randData(1.0f, 100.0f);
            optionData[i].T = randData(0.25f, 10.0f);
            optionData[i].R = 0.06f;
            optionData[i].V = 0.10f;
            BlackScholesCall(callValueBS[i], optionData[i]);
        }

    printf("Running GPU binomial tree...\n");
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError( cutResetTimer(hTimer) );
        cutilCheckError( cutStartTimer(hTimer) );

        if(useDoublePrecision)
            binomialOptions_SM13(callValueGPU, optionData, OPT_N);
        else
            binomialOptions_SM10(callValueGPU, optionData, OPT_N);

        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError( cutStopTimer(hTimer) );
        gpuTime = cutGetTimerValue(hTimer);
    printf("Options count            : %i     \n", OPT_N);
    printf("Time steps               : %i     \n", NUM_STEPS);
    printf("binomialOptionsGPU() time: %f msec\n", gpuTime);
    printf("Options per second       : %f     \n", OPT_N / (gpuTime * 0.001));

    printf("Running CPU binomial tree...\n");
        for(i = 0; i < OPT_N; i++)
            binomialOptionsCPU(callValueCPU[i], optionData[i]);

    printf("Comparing the results...\n");
    sumDelta = 0;
    sumRef   = 0;
    printf("GPU binomial vs. Black-Scholes\n");
    for(i = 0; i < OPT_N; i++){
        sumDelta += fabs(callValueBS[i] - callValueGPU[i]);
        sumRef += fabs(callValueBS[i]);
    }
    if(sumRef >1E-5)
        printf("L1 norm: %E\n", sumDelta / sumRef);
    else
        printf("Avg. diff: %E\n", sumDelta / (double)OPT_N);

    printf("CPU binomial vs. Black-Scholes\n");
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++){
        sumDelta += fabs(callValueBS[i]- callValueCPU[i]);
        sumRef += fabs(callValueBS[i]);
    }
    if(sumRef >1E-5)
        printf("L1 norm: %E\n", sumDelta / sumRef);
    else
        printf("Avg. diff: %E\n", sumDelta / (double)OPT_N);

    printf("CPU binomial vs. GPU binomial\n");
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++){
        sumDelta += fabs(callValueGPU[i] - callValueCPU[i]);
        sumRef += callValueCPU[i];
    }
    if(sumRef > 1E-5)
        printf("L1 norm: %E\n", errorVal = sumDelta / sumRef);
    else
        printf("Avg. diff: %E\n", errorVal = sumDelta / (double)OPT_N);

    printf("Shutting down...\n");

	printf("\n[binomialOptions] - Test Summary:\n");
    printf((errorVal < 5e-4) ? "PASSED\n" : "FAILED\n");

    cutilCheckError( cutDeleteTimer(hTimer) );

    cudaThreadExit();

    cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv) 
{
	int retVal = 0;

	retVal = xnInit( argc, argv );

    printf("[ %s ]\n", sSDKsample); 

    if (argc > 1) {
        cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int *) &numParticles);
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
			cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")
			) 
		{
            g_bQAReadback = true;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) 
		{
            g_bQAGLVerify = true;
        }
    }

    if (g_bQAReadback) {
        // For Automated testing, we do not use OpenGL/CUDA interop
        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
	        cutilDeviceInit (argc, argv);
        } else {
            cudaSetDevice (cutGetMaxGflopsDeviceId() );
        }

        g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        // This code path is used for Automated Testing
        initParticles(numParticles, false, false);
        initParams();

        if (emitterOn) {
            runEmitter();
        }
        SimParams &params = psystem->getParams();
        params.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z);

        psystem->step(timestep);

        float4 *pos = NULL, *vel = NULL;
        int g_TotalErrors = 0;

        psystem->dumpBin(&pos, &vel);

        g_CheckRender->dumpBin(pos, numParticles*sizeof(float4), "smokeParticles_pos.bin");
        g_CheckRender->dumpBin(vel, numParticles*sizeof(float4), "smokeParticles_vel.bin");

        if (!g_CheckRender->compareBin2BinFloat("smokeParticles_pos.bin", sRefBin[0], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD))
           g_TotalErrors++;

        if (!g_CheckRender->compareBin2BinFloat("smokeParticles_vel.bin", sRefBin[1], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD))
           g_TotalErrors++;


        delete psystem;
        delete g_CheckRender;

        printf("%s\n", (g_TotalErrors > 0) ? "FAILED" : "PASSED");

        cudaThreadExit();
    } else {
        // Normal smokeParticles rendering path
        // 1st initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is needed to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
	        cutilGLDeviceInit (argc, argv);
        } else {
            cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() );
        }

        if (g_bQAGLVerify) {
            g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4);
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }

        // This is the normal code path for SmokeParticles
        initParticles(numParticles, true, true);
        initParams();
        initMenus();

        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutKeyboardUpFunc(keyUp);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        glutMainLoop();
    }

    cutilExit(argc, argv);
	return retVal;
}
Beispiel #14
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv){
    uint 
        *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint 
        *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    uint hTimer;

    const uint   N = 4 * 1048576;
    const uint DIR = 1;

    const uint numValues = 65536;


    printf("Allocating and initializing host arrays...\n\n");
        cutCreateTimer(&hTimer);
        h_SrcKey = (uint *)malloc(N * sizeof(uint));
        h_SrcVal = (uint *)malloc(N * sizeof(uint));
        h_DstKey = (uint *)malloc(N * sizeof(uint));
        h_DstVal = (uint *)malloc(N * sizeof(uint));

        srand(2009);
        for(uint i = 0; i < N; i++)
            h_SrcKey[i] = rand() % numValues;
        fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
        cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) );
        cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) );

    printf("Initializing GPU merge sort...\n");
        initMergeSort();

    printf("Running GPU merge sort...\n");
        cutilSafeCall( cudaThreadSynchronize() );
        cutResetTimer(hTimer);
        cutStartTimer(hTimer);
            mergeSort(
                d_DstKey,
                d_DstVal,
                d_BufKey,
                d_BufVal,
                d_SrcKey,
                d_SrcVal,
                N,
                DIR
            );
        cutilSafeCall( cudaThreadSynchronize() );
        cutStopTimer(hTimer);
    printf("Time: %f ms\n", cutGetTimerValue(hTimer));

    printf("Reading back GPU merge sort results...\n");
        cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) );
        cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) );

    printf("Inspecting the results...\n");
        uint keysFlag = validateSortedKeys(
            h_DstKey,
            h_SrcKey,
            1,
            N,
            numValues,
            DIR
        );

        uint valuesFlag = validateSortedValues(
            h_DstKey,
            h_DstVal,
            h_SrcKey,
            1,
            N
        );

    printf( (keysFlag && valuesFlag) ? "TEST PASSED\n" : "TEST FAILED\n");

    printf("Shutting down...\n");
        closeMergeSort();
        cutilCheckError( cutDeleteTimer(hTimer) );
        cutilSafeCall( cudaFree(d_SrcVal) );
        cutilSafeCall( cudaFree(d_SrcKey) );
        cutilSafeCall( cudaFree(d_BufVal) );
        cutilSafeCall( cudaFree(d_BufKey) );
        cutilSafeCall( cudaFree(d_DstVal) );
        cutilSafeCall( cudaFree(d_DstKey) );
        free(h_DstVal);
        free(h_DstKey);
        free(h_SrcVal);
        free(h_SrcKey);
        cudaThreadExit();
        cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noqatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
	{
        g_bQAReadback = true;
        fpsLimit = frameCheckNumber;
    }
    if (argc > 1) {

        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) {
            g_bOpenGLQA = true;
            fpsLimit = frameCheckNumber;
        }
    }

    printf("[%s] ", sSDKsample);
    if (g_bQAReadback) printf("(Automated Testing)\n");
    if (g_bOpenGLQA)   printf("(OpenGL Readback)\n");

    // Get the path of the filename
    char *filename;
    if (cutGetCmdLineArgumentstr(argc, (const char**) argv, "image", &filename)) {
        image_filename = filename;
    }
    // load image
    char* image_path = cutFindFilePath(image_filename, argv[0]);
    if (image_path == 0) {
        fprintf(stderr, "Error finding image file '%s'\n", image_filename);
        cudaThreadExit();
        exit(EXIT_FAILURE);
    }

    cutilCheckError( cutLoadPPM4ub(image_path, (unsigned char **) &h_img, &width, &height));
    if (!h_img) {
        printf("Error opening file '%s'\n", image_path);
        cudaThreadExit();
        exit(-1);
    }
    printf("Loaded '%s', %d x %d pixels\n", image_path, width, height);

    cutGetCmdLineArgumenti(argc, (const char**) argv, "threads", &nthreads);
    cutGetCmdLineArgumentf(argc, (const char**) argv, "sigma", &sigma);
    runBenchmark = cutCheckCmdLineFlag(argc, (const char**) argv, "bench");

    int device;
    struct cudaDeviceProp prop;
    cudaGetDevice( &device );
    cudaGetDeviceProperties( &prop, device );
    if( !strncmp( "Tesla", prop.name, 5 ) ) {
        printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n");
//        runBenchmark = CUTTrue;
        g_bQAReadback = true;
    }        

    // Benchmark or AutoTest mode detected, no OpenGL
    if (runBenchmark == CUTTrue || g_bQAReadback) {
        if( cutCheckCmdLineFlag( argc, (const char **)argv, "device" ) ) 
            cutilDeviceInit( argc, argv );
        else 
            cudaSetDevice( cutGetMaxGflopsDeviceId() );
    } else {

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(argc, argv);

        if( cutCheckCmdLineFlag( argc, (const char **)argv, "device" ) ) 
            cutilGLDeviceInit( argc, argv );
        else 
            cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
    }

    initCudaBuffers();

    if (g_bOpenGLQA) {
        g_CheckRender = new CheckBackBuffer(width, height, 4);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);
    }

    if (g_bQAReadback) {
        // This is the automated testing path
        g_CheckRender = new CheckBackBuffer(width, height, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        runAutoTest(argc, argv); 
        cleanup();
        cudaThreadExit();
        cutilExit(argc, argv);
    }

    if (runBenchmark) {
        benchmark(100);
        cleanup();
        cudaThreadExit();
        exit(0);
    }

    initGLBuffers();
    
    atexit(cleanup);
    
    glutMainLoop();

    cudaThreadExit();
    cutilExit(argc, argv);
}
Beispiel #16
0
int main( int argc, char* argv[] )
{
	int argc2 = 2;
	char* argv2[2] = { "", "-device=1" };

	cudaDeviceProp deviceProp;
    int devID = cutilChooseCudaDevice( argc2, argv2 );
    if( devID < 0 )
	{
       printf( "exiting...\n" );
       cutilExit( argc, argv );
       exit( 0 );
    }
    cutilSafeCall( cudaGetDeviceProperties( &deviceProp, devID ) );

	//Image4f im( "c:/tmp/tulip.png" );
	//Image4f im( "c:/tmp/tulip_1080.png" ); // Jiawen version
	Image4f im( "../../apps/bilateral_grid/input.png" );
	//Image4f im( "c:/tmp/church_panorama_5097x2889.pfm" );

	im = im.flipUD();

	Array2D< float > data( im.width(), im.height() );
	Array2D< float > output( im.width(), im.height() );

	for( int y = 0; y < im.height(); ++y )
	{
		for( int x = 0; x < im.width(); ++x )
		{
			Vector3f rgb = im.pixel( x, y ).xyz();
            // float lum = ColorUtils::rgb2luminance( rgb );
            // data( x, y ) = lum;
            // jrk: just use red
            data( x, y ) = rgb[0];
		}
	}

	testBilateralFilter( data, 8, 0.1f, output );
	saveArrayAsImage( output, "bf", 8, 0.1f );
	testBilateralFilter( data, 16, 0.1f, output );
	saveArrayAsImage( output, "bf", 16, 0.1f );
	testBilateralFilter( data, 32, 0.2f, output );
	saveArrayAsImage( output, "bf", 32, 0.2f );
	testBilateralFilter( data, 64, 0.4f, output );
	saveArrayAsImage( output, "bf", 64, 0.4f );
    
#if 0
	Image4f edgeImage( "/tmp/step.png" );
	edgeImage.flipUD();

	Array2D< float > edge( im.width(), im.height() );
	for( int y = 0; y < im.height(); ++y )
	{
		for( int x = 0; x < im.width(); ++x )
		{
			edge( x, y ) = edgeImage.pixel( x, y ).x;
		}
	}

	testCrossBilateralFilter( data, edge, 16, 0.1f, output );
	saveArrayAsImage( output, "cbf", 16, 0.1f );
	testCrossBilateralFilter( data, edge, 32, 0.2f, output );
	saveArrayAsImage( output, "cbf", 32, 0.2f );
	testCrossBilateralFilter( data, edge, 64, 0.4f, output );
	saveArrayAsImage( output, "cbf", 64, 0.4f );
#endif

	return 0;
}
int main(int argc, char **argv) {
    char *precisionChoice;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL)
        useDoublePrecision = 0;
    else {
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }

    const int MAX_GPU_COUNT = 8;
    const int         OPT_N = 256;
    const int        PATH_N = 1 << 18;
    const unsigned int SEED = 777;

    //Input data array
    TOptionData optionData[OPT_N];
    //Final GPU MC results
    TOptionValue callValueGPU[OPT_N];
    //"Theoretical" call values by Black-Scholes formula
    float callValueBS[OPT_N];
    //Solver config
    TOptionPlan optionSolver[MAX_GPU_COUNT];
    //OS thread ID
    CUTThread threadID[MAX_GPU_COUNT];


    //GPU number present in the system
    int GPU_N;
    int gpuBase, gpuIndex;
    int i;

    //Timer
    unsigned int hTimer;
    float time;

    double
    delta, ref, sumDelta, sumRef, sumReserve;

    cutilSafeCall( cudaGetDeviceCount(&GPU_N) );
    cutilCheckError( cutCreateTimer(&hTimer) );

#ifdef _EMU
    GPU_N = 1;
#endif
    printf("main(): generating input data...\n");
    srand(123);
    for(i = 0; i < OPT_N; i++) {
        optionData[i].S = randFloat(5.0f, 50.0f);
        optionData[i].X = randFloat(10.0f, 25.0f);
        optionData[i].T = randFloat(1.0f, 5.0f);
        optionData[i].R = 0.06f;
        optionData[i].V = 0.10f;
        callValueGPU[i].Expected   = -1.0f;
        callValueGPU[i].Confidence = -1.0f;
    }

    printf("main(): starting %i host threads...\n", GPU_N);
    //Get option count for each GPU
    for(i = 0; i < GPU_N; i++)
        optionSolver[i].optionCount = OPT_N / GPU_N;
    //Take into account cases with "odd" option counts
    for(i = 0; i < (OPT_N % GPU_N); i++)
        optionSolver[i].optionCount++;

    //Assign GPU option ranges
    gpuBase = 0;
    for(i = 0; i < GPU_N; i++) {
        optionSolver[i].device     = i;
        optionSolver[i].optionData = optionData   + gpuBase;
        optionSolver[i].callValue  = callValueGPU + gpuBase;
        optionSolver[i].seed       = SEED;
        optionSolver[i].pathN      = PATH_N;
        gpuBase += optionSolver[i].optionCount;
    }

    //Start the timer
    cutilCheckError( cutResetTimer(hTimer) );
    cutilCheckError( cutStartTimer(hTimer) );

    //Start CPU thread for each GPU
    for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
        threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);

    //Stop the timer
    cutilCheckError( cutStopTimer(hTimer) );
    time = cutGetTimerValue(hTimer);

    printf("main(): waiting for GPU results...\n");
    cutWaitForThreads(threadID, GPU_N);

    printf("main(): GPU statistics\n");
    for(i = 0; i < GPU_N; i++) {
        printf("GPU #%i\n", optionSolver[i].device);
        printf("Options         : %i\n", optionSolver[i].optionCount);
        printf("Simulation paths: %i\n", optionSolver[i].pathN);
    }
    printf("\nTotal time (ms.): %f\n", time);
    printf("Options per sec.: %f\n", OPT_N / (time * 0.001));

#ifdef DO_CPU
    printf("main(): running CPU MonteCarlo...\n");
    TOptionValue callValueCPU;
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++) {
        MonteCarloCPU(
            callValueCPU,
            optionData[i],
            NULL,
            PATH_N
        );
        delta     = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
        ref       = callValueCPU.Expected;
        sumDelta += delta;
        sumRef   += fabs(ref);
        printf("Exp : %f | %f\t", callValueCPU.Expected,   callValueGPU[i].Expected);
        printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
    }
    printf("L1 norm: %E\n", sumDelta / sumRef);
#endif

    printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
    sumDelta   = 0;
    sumRef     = 0;
    sumReserve = 0;
    for(i = 0; i < OPT_N; i++) {
        BlackScholesCall(
            callValueBS[i],
            optionData[i]
        );
        delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
        ref       = callValueBS[i];
        sumDelta += delta;
        sumRef   += fabs(ref);
        if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta;
#ifdef PRINT_RESULTS
        printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
    }
    sumReserve /= OPT_N;
    printf("L1 norm        : %E\n", sumDelta / sumRef);
    printf("Average reserve: %f\n", sumReserve);
    printf((sumReserve > 1.0f) ? "PASSED\n" : "FAILED.\n");

    printf("Shutting down...\n");

    cutilCheckError( cutDeleteTimer(hTimer) );
    cutilExit(argc, argv);
}
Beispiel #18
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
  po::options_description desc("mephitis options");

  desc.add_options() 
    ("help", "this help")
    ("archive,a", po::value<std::string>(), "archive file")
    ("device,D", po::value<unsigned>(), "device")
    ("frames,f", po::value<unsigned>(), "number frames to display")
    ("voxelsize,s", po::value<float>(), "size of voxels (in centimeters)")
    ("no-processing,n", "display only, no processing")
    ("verbose,v", "verbose")
    ("debug,d", "debug")
    ;

  po::variables_map vm;
  po::store(po::parse_command_line(argc, argv, desc), vm);
  po::notify(vm);

  if (vm.count("help"))
    {
      std::cout << desc << "\n";
      exit(0);
    }
  std::string archivefile = vm.count("archive") ? vm["archive"].as<std::string>() : "ARCHIVE_NOT_SPECIFIED";      

  int device = vm.count("device") ? vm["device"].as<unsigned>() : 0;

  voxelsize = vm.count("voxelsize") ? vm["voxelsize"].as<float>() : 1.0;
  no_processing = vm.count("no-processing"); 

  if (vm.count("frames"))
    nframes = vm["frames"].as<unsigned>();

  ms::verbose = vm.count("verbose");
  ms::debug = vm.count("debug");

  ms::regulator<ms::coalesced_points<ms::host> > regulator(queue, 3);
  
  ms::archive_source<ms::coalesced_points<ms::host> > source(archivefile, boost::ref(regulator), 5);

  boost::thread popperthread(boost::bind(&ms::archive_source<ms::coalesced_points<ms::host> >::run,
                                         boost::ref(source)));

  std::cout << "bag popper thread running, sleeping 1 second\n";
  boost::this_thread::sleep(boost::posix_time::seconds(1));
  std::cout << "done sleeping\n";

  // First initialize OpenGL context, so we can properly set the GL for CUDA.
  // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
  if (CUTFalse == initGL(&argc, argv)) {
    return CUTFalse;
  }

  mephitis::set_device(device, true);

  monitor.run(5);

  // register callbacks
  glutDisplayFunc(display);
  glutKeyboardFunc(keyboard);
  glutMouseFunc(mouse);
  glutMotionFunc(motion);
		
  atexit(cleanup);
  
  // start rendering mainloop
  glutMainLoop();

  cudaThreadExit();

  cutilExit(argc, argv);
}
Beispiel #19
0
int main(int argc, char **argv){
	GpuProfiling::initProf();
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint hTimer;
    int PassFailFlag = 1;
    uint byteCount = 64 * 1048576;
    uint uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

	printf("%s\n", sSDKsample);

	// set logfile name and start logs
    shrSetLogFileName ("histogram.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );

    cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );

	printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n",
		deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

	int version = deviceProp.major * 0x10 + deviceProp.minor;

	if(version < 0x11)
    {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        printf("PASSED");
        cudaThreadExit();
		exit(0);
        cutilExit(argc, argv);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional Command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult))
    {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

    shrLog("Initializing data...\n");
        shrLog("...allocating CPU memory.\n");
            h_Data         = (uchar *)malloc(byteCount);
            h_HistogramCPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
            h_HistogramGPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

        shrLog("...generating input data\n");
            srand(2009);
            for(uint i = 0; i < byteCount; i++)
                h_Data[i] = rand() % 256;

        shrLog("...allocating GPU memory and copying input data\n\n");
            cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount  ) );
            cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)  ) );
            cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );

    {
        shrLog("Starting up 64-bin histogram...\n\n");
            initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cudaThreadSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram64(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cudaThreadSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE);

        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram64CPU()\n");
               histogram64CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results...\n");
                for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 64-bin histogram...\n\n\n");
            closeHistogram64();
    }

    {
        shrLog("Initializing 256-bin histogram...\n");
            initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cudaThreadSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram256(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cudaThreadSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE);

        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram256CPU()\n");
                histogram256CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results\n");
                for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 256-bin histogram...\n\n\n");
            closeHistogram256();
    }

	shrLog("%s - Test Summary\n", sSDKsample);

    // pass or fail (for both 64 bit and 256 bit histograms)
    shrLog("%s\n\n", PassFailFlag ? "PASSED" : "FAILED");
	GpuProfiling::printResults();

    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        cutilSafeCall( cudaFree(d_Histogram) );
        cutilSafeCall( cudaFree(d_Data) );
        free(h_HistogramGPU);
        free(h_HistogramCPU);
        free(h_Data);

    cudaThreadExit();
	exit(0);
    shrEXIT(argc, (const char**)argv);
}