Example #1
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv) 
{
    numParticles = 1024;
    uint gridDim = 64;
    numIterations = 1;

    cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int *) &numParticles);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", (int *) &gridDim);
    gridSize.x = gridSize.y = gridSize.z = gridDim;
    printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z);

    bool benchmark = !cutCheckCmdLineFlag(argc, (const char**) argv, "noqatest") != 0;
    cutGetCmdLineArgumenti( argc, (const char**) argv, "i", &numIterations);
    
    cudaInit(argc, argv);

    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
    glutInitWindowSize(640, 480);
    glutCreateWindow("CUDA particles");

    initGL();
    init(numParticles, gridSize);
    initParams();
    initMenus();

    if (benchmark)
    {
        if (numIterations <= 0) 
            numIterations = 300;
        runBenchmark(numIterations);
    }
    else
    {
        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        glutMainLoop();
    }

    if (psystem)
        delete psystem;

    cudaThreadExit();

    return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    //start logs
    shrSetLogFileName ("volumeRender.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
	{
        g_bQAReadback = true;
        fpsLimit = frameCheckNumber;
    }

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) 
	{
        g_bQAGLVerify = true;
        fpsLimit = frameCheckNumber;
    }

    if (g_bQAReadback) {
	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilDeviceInit(argc, argv);
        } else {
            cudaSetDevice( cutGetMaxGflopsDeviceId() );
        }

    } else {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
        }
/*
        int device;
        struct cudaDeviceProp prop;
        cudaGetDevice( &device );
        cudaGetDeviceProperties( &prop, device );
        if( !strncmp( "Tesla", prop.name, 5 ) ) {
            shrLog("This sample needs a card capable of OpenGL and display.\n");
            shrLog("Please choose a different device with the -device=x argument.\n");
            cutilExit(argc, argv);
        }
*/
	}

    // parse arguments
    char *filename;
    if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) {
        volumeFilename = filename;
    }
    int n;
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) {
        volumeSize.width = volumeSize.height = volumeSize.depth = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) {
        volumeSize.width = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) {
        volumeSize.height = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) {
         volumeSize.depth = n;
    }

    // load volume data
    char* path = shrFindFilePath(volumeFilename, argv[0]);
    if (path == 0) {
        shrLog("Error finding file '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType);
    void *h_volume = loadRawFile(path, size);
    
    initCuda(h_volume, volumeSize);
    free(h_volume);

    cutilCheckError( cutCreateTimer( &timer));

    shrLog("Press '=' and '-' to change density\n"
           "      ']' and '[' to change brightness\n"
           "      ';' and ''' to modify transfer function offset\n"
           "      '.' and ',' to modify transfer function scale\n\n");

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));

    if (g_bQAReadback) {
        g_CheckRender = new CheckBackBuffer(width, height, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        uint *d_output;
        cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint)));
        cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint)));

        float modelView[16] = 
        {
            1.0f, 0.0f, 0.0f, 0.0f,
            0.0f, 1.0f, 0.0f, 0.0f,
            0.0f, 0.0f, 1.0f, 0.0f,
            0.0f, 0.0f, 4.0f, 1.0f
        };

        invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12];
        invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13];
        invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14];

        // call CUDA kernel, writing results to PBO
	    copyInvViewMatrix(invViewMatrix, sizeof(float4)*3);
        
        // Start timer 0 and process n loops on the GPU 
        int nIter = 10;
        for (int i = -1; i < nIter; i++)
        {
            if( i == 0 ) {
                cudaThreadSynchronize();
                cutStartTimer(timer); 
            }
            
            render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale);
        }
        cudaThreadSynchronize();
        cutStopTimer(timer);
        // Get elapsed time and throughput, then log to sample and master logs
        double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0);
        shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", 
               (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); 
        

        cutilCheckMsg("Error: render_kernel() execution FAILED");
        cutilSafeCall( cudaThreadSynchronize() );

        cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) );
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);

        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) {
            shrLog("\nFAILED\n\n");
        } else {
            shrLog("\nPASSED\n\n");
        }

        cudaFree(d_output);
    	freeCudaBuffers();

        if (g_CheckRender) {
            delete g_CheckRender; g_CheckRender = NULL;
        }

    } else {
        // This is the normal rendering path for VolumeRender
        glutDisplayFunc(display);
        glutKeyboardFunc(keyboard);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutReshapeFunc(reshape);
        glutIdleFunc(idle);

        initPixelBuffer();

        if (g_bQAGLVerify) {
            g_CheckRender = new CheckBackBuffer(width, height, 4);
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }
        atexit(cleanup);

        glutMainLoop();
    }

    cudaThreadExit();
    shrEXIT(argc, (const char**)argv);
}
Example #3
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
    pArgc = &argc;
    pArgv = argv;

    shrQAStart(argc, argv);
    // start logs
    shrSetLogFileName ("boxFilter.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (argc > 1) {
        cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads );
        cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius);

        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify"))
        {
            g_bOpenGLQA = true;
            fpsLimit = frameCheckNumber;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) {
            g_bFBODisplay = true;
        }
    }

    // load image to process
    loadImageData(argc, argv);

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
            cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt"))
    {
        // Running CUDA kernel (boxFilter) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv);
        shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
    }
    else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark"))
    {
        // Running CUDA kernels (boxfilter) in Benchmarking mode
        runBenchmark(argc, argv);
        shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
    }
    else
    {
        // Running CUDA kernels (boxFilter) with OpenGL visualization
        if (g_bFBODisplay) shrLog("[FBO Display] ");
        if (g_bOpenGLQA)   shrLog("[OpenGL Readback Comparisons] ");
        shrLog("\n");

        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
            printf("   This SDK does not explicitly support -device=n when running with OpenGL.\n");
            printf("   When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n");
            printf("   See details below to run without OpenGL:\n\n");
            printf(" > %s -device=n -qatest\n\n", argv[0]);
            printf("exiting...\n");
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );
        int dev = findCapableDevice(argc, argv);
        if( dev != -1 ) {
            cudaGLSetGLDevice( dev );
        } else {
            cutilDeviceReset();
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }

        // Now we can create a CUDA context and bind it to the OpenGL context
        initCuda();
        initGLResources();

        if (g_bOpenGLQA) {
            if (g_bFBODisplay) {
                g_CheckRender = new CheckFBO(width, height, 4, g_FrameBufferObject);
            } else {
                g_CheckRender = new CheckBackBuffer(width, height, 4);
            }
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }
    }

    // sets the callback function so it will call cleanup upon exit
    atexit(cleanup);

    shrLog("Running Standard Demonstration with GLUT loop...\n\n");
    shrLog("Press '+' and '-' to change filter width\n"
           "Press ']' and '[' to change number of iterations\n\n");

    // Main OpenGL loop that will run visualization for every vsync
    glutMainLoop();

    cutilDeviceReset();
    shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv) 
{
	int retVal = 0;

	retVal = xnInit( argc, argv );

    printf("[ %s ]\n", sSDKsample); 

    if (argc > 1) {
        cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int *) &numParticles);
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
			cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")
			) 
		{
            g_bQAReadback = true;
        }
        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) 
		{
            g_bQAGLVerify = true;
        }
    }

    if (g_bQAReadback) {
        // For Automated testing, we do not use OpenGL/CUDA interop
        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
	        cutilDeviceInit (argc, argv);
        } else {
            cudaSetDevice (cutGetMaxGflopsDeviceId() );
        }

        g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        // This code path is used for Automated Testing
        initParticles(numParticles, false, false);
        initParams();

        if (emitterOn) {
            runEmitter();
        }
        SimParams &params = psystem->getParams();
        params.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z);

        psystem->step(timestep);

        float4 *pos = NULL, *vel = NULL;
        int g_TotalErrors = 0;

        psystem->dumpBin(&pos, &vel);

        g_CheckRender->dumpBin(pos, numParticles*sizeof(float4), "smokeParticles_pos.bin");
        g_CheckRender->dumpBin(vel, numParticles*sizeof(float4), "smokeParticles_vel.bin");

        if (!g_CheckRender->compareBin2BinFloat("smokeParticles_pos.bin", sRefBin[0], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD))
           g_TotalErrors++;

        if (!g_CheckRender->compareBin2BinFloat("smokeParticles_vel.bin", sRefBin[1], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD))
           g_TotalErrors++;


        delete psystem;
        delete g_CheckRender;

        printf("%s\n", (g_TotalErrors > 0) ? "FAILED" : "PASSED");

        cudaThreadExit();
    } else {
        // Normal smokeParticles rendering path
        // 1st initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is needed to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
	        cutilGLDeviceInit (argc, argv);
        } else {
            cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() );
        }

        if (g_bQAGLVerify) {
            g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4);
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }

        // This is the normal code path for SmokeParticles
        initParticles(numParticles, true, true);
        initParams();
        initMenus();

        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutKeyboardUpFunc(keyUp);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        glutMainLoop();
    }

    cutilExit(argc, argv);
	return retVal;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv) 
{
    shrQAStart(argc, argv);

    // start logs
    shrSetLogFileName ("bilateralFilter.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads );
    cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius);

	// load image to process
    loadImageData(argc, argv);

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
    {
		// Running CUDA kernel (bilateralFilter) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv);
	    shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
	} 
	else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark")) 
	{
		// Running CUDA kernel (bilateralFilter) in Benchmarking Mode
        runBenchmark(argc, argv);
	    shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
	} 
	else 
	{	
		// Running CUDA kernel (bilateralFilter) in CUDA + OpenGL Visualization Mode
        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
             printf("[%s]\n", argv[0]);
             printf("   Does not explicitly support -device=n in OpenGL mode\n");
             printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
             printf(" > %s -device=n -qatest\n", argv[0]);
             printf("exiting...\n");
             exit(0);
        }

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( argc, argv );

        if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() );
        }
        initCuda();
        initOpenGL();
    }

    atexit(cleanup);

    printf("Running Standard Demonstration with GLUT loop...\n\n");
    printf("Press '+' and '-' to change number of iterations\n"
        "Press LEFT and RIGHT change euclidean delta\n"
        "Press UP and DOWN to change gaussian delta\n"
        "Press '1' to show original image\n"
        "Press '2' to show result\n\n");

    glutMainLoop();

    cutilDeviceReset();
	shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED));
}
Example #6
0
//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    bool bTestResults = true;

    shrQAStart(argc, argv);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "help") ) {
        showHelp();
        return 0;
    }

    shrLog("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n");
	shrLog("\t-fullscreen (run n-body simulation in fullscreen mode)\n");
	shrLog("\t-fp64       (use double precision floating point values for simulation)\n");
    shrLog("\t-numdevices=N (use first N CUDA devices for simulation)\n");
//    shrLog("\t-hostmem  (stores simulation data in host memory)\n");
//    shrLog("\t-cpu      (performs simulation on the host)\n");
    shrLog("\n");

	bFullscreen  = (cutCheckCmdLineFlag(argc, (const char**) argv, "fullscreen") != 0);
    if (bFullscreen)
        bShowSliders = false;

    benchmark    = (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0);

    compareToCPU = ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) ||
                   (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest")  != 0));

    QATest       = (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest")  != 0);

    useHostMem   = (cutCheckCmdLineFlag(argc, (const char**) argv, "hostmem") != 0);

    fp64         = (cutCheckCmdLineFlag(argc, (const char**) argv, "fp64") != 0);

    flopsPerInteraction = fp64 ? 30 : 20;

    useCpu       = (cutCheckCmdLineFlag(argc, (const char**) argv, "cpu") != 0);

    cutGetCmdLineArgumenti(argc, (const char**) argv, "numdevices", &numDevsRequested);

    // for multi-device we currently require using host memory -- the devices share
    // data via the host 
    if (numDevsRequested > 1)
        useHostMem = true;

    int numDevsAvailable = 0;
    bool customGPU = false;
    cudaGetDeviceCount(&numDevsAvailable);

    if (numDevsAvailable < numDevsRequested) {
        shrLog("Error: only %d Devices available, %d requested.  Exiting.\n", numDevsAvailable, numDevsRequested);
        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
    }

	shrLog("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed");
	shrLog("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video" );
	shrLog("> %s precision floating point simulation\n", fp64 ? "Double" : "Single");
    shrLog("> %d Devices used for simulation\n", numDevsRequested);

    int devID;
    cudaDeviceProp props;

    // Initialize GL and GLUT if necessary
    if (!benchmark && !compareToCPU) {
        initGL(&argc, argv);
        initParameters();
    }
    
    if (useCpu) {
        useHostMem = true;
        compareToCPU = false;
        bSupportDouble = true;

#ifdef OPENMP
        shrLog("> Simulation with CPU using OpenMP\n");
#else
        shrLog("> Simulation with CPU\n");
#endif
    }
    else
    {
        // Now choose the CUDA Device
        // Either without GL interop:
        if (benchmark || compareToCPU || useHostMem) 
        {
            // Note if we are using host memory for the body system, we
            // don't use CUDA-GL interop.

            if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) 
            {
                devID = cutilDeviceInit(argc, argv);
                if (devID < 0) {
                   printf("exiting...\n");
                   shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
                }
                customGPU = true;
            } 
            else 
            {
                devID = cutGetMaxGflopsDeviceId();
                cudaSetDevice( devID );
            }
        } 
        else // or with GL interop:
        {    	
            if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
                cutilGLDeviceInit(argc, argv);
                customGPU = true;
            } else {
                devID = cutGetMaxGflopsDeviceId();
                cudaGLSetGLDevice( devID );
            }
        }

        cutilSafeCall(cudaGetDevice(&devID));
        cutilSafeCall(cudaGetDeviceProperties(&props, devID));

        bSupportDouble = true;

#if CUDART_VERSION < 4000
        if (numDevsRequested > 1)
        {
            shrLog("MultiGPU n-body requires CUDA 4.0 or later\n");
            cutilDeviceReset();
            shrQAFinishExit(argc, (const char**)argv, QA_PASSED);
        }
#endif

        // Initialize devices
        if(numDevsRequested > 1 && customGPU)
        {
            printf("You can't use --numdevices and --device at the same time.\n");
            shrQAFinishExit(argc, (const char**)argv, QA_PASSED);
        }

        if(customGPU) { 
            cudaDeviceProp props;
            cutilSafeCall(cudaGetDeviceProperties(&props, devID));
            shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);
        }
        else
        {
            for (int i = 0; i < numDevsRequested; i++)
            {
                cudaDeviceProp props;
                cutilSafeCall(cudaGetDeviceProperties(&props, i));
            
                shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);

                if (useHostMem)
                {
#if CUDART_VERSION >= 2020
                    if(!props.canMapHostMemory)
                    {
                        fprintf(stderr, "Device %d cannot map host memory!\n", devID);
                        cutilDeviceReset();
                        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
                    }
                    if (numDevsRequested > 1)
                        cutilSafeCall(cudaSetDevice(i));
                    cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
                    fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n");
                    cutilDeviceReset();
                    shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
#endif
                }
            }

            // CC 1.2 and earlier do not support double precision
            if (props.major*10 + props.minor <= 12)
                bSupportDouble = false;
        }

        //if(numDevsRequested > 1)
        //    cutilSafeCall(cudaSetDevice(devID));

        if (fp64 && !bSupportDouble) {
            fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n");
            cutilDeviceReset();
            shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
        }
    }
	
    numIterations = 0;
    p = 0;
    q = 1;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q);

    if (p == 0) // p not set on command line
    {
        p = 256;
        if (q * p > 256)
        {
            p = 256 / q;
            shrLog("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256);
        }
    }

    // default number of bodies is #SMs * 4 * CTA size
    if (useCpu)
#ifdef OPENMP
        numBodies = 8192;
#else
        numBodies = 4096;
#endif
    else if (numDevsRequested == 1)
Example #7
0
//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    printf("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n\n");
    
    bool benchmark = 
        (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0);

    bool compareToCPU = 
        ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) ||
        !(cutCheckCmdLineFlag(argc, (const char**) argv, "noqatest")  != 0));

    bool regression = 
        (cutCheckCmdLineFlag(argc, (const char**) argv, "regression") != 0);

    int devID;
    cudaDeviceProp props;

    // nBody has a mode that allows it to be run without using GL interop
    if (benchmark || compareToCPU || regression) {
				/*
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilDeviceInit(argc, argv);
        } else {
            devID = cutGetMaxGflopsDeviceId();
            cudaSetDevice( devID );
        } */
    } 
    else 
    { 
        // This mode shows the OpenGL results rendered
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        glutInit(&argc, argv);
        glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
        glutInitWindowSize(720, 480);
        glutCreateWindow("CUDA n-body system");

        GLenum err = glewInit();
        if (GLEW_OK != err)
        {
            printf("GLEW Error: %s\n", glewGetErrorString(err));
        }
        else
        {
#if   defined(WIN32)
            wglSwapIntervalEXT(0);
#elif defined(LINUX)
            glxSwapIntervalSGI(0);
#endif      
        }
    	
        initGL();
        initParameters();
    	
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
        } else {
            devID = cutGetMaxGflopsDeviceId();
            cudaGLSetGLDevice( devID );
        }
    }


    // get number of SMs on this GPU
    cutilSafeCall(cudaGetDevice(&devID));
    cutilSafeCall(cudaGetDeviceProperties(&props, devID));

    numIterations = 0;
    int p = 256;
    int q = 1;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q);


    // default number of bodies is #SMs * 4 * CTA size
    numBodies = compareToCPU ? 4096 : p*q*4*props.multiProcessorCount;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "n", &numBodies);

    switch (numBodies)
    {
    case 1024:
        activeParams.m_clusterScale = 1.52f;
        activeParams.m_velocityScale = 2.f;
        break;
    case 2048:
        activeParams.m_clusterScale = 1.56f;
        activeParams.m_velocityScale = 2.64f;
        break;
    case 4096:
        activeParams.m_clusterScale = 1.68f;
        activeParams.m_velocityScale = 2.98f;
        break;
    case 8192:
        activeParams.m_clusterScale = 1.98f;
        activeParams.m_velocityScale = 2.9f;
        break;
    default:
    case 16384:
        activeParams.m_clusterScale = 1.54f;
        activeParams.m_velocityScale = 8.f;
        break;
    case 32768:
        activeParams.m_clusterScale = 1.44f;
        activeParams.m_velocityScale = 11.f;
        break;
    }

    if (q * p > 256)
    {
        p = 256 / q;
        printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256);
    }

    if (q == 1 && numBodies < p)
    {
        p = numBodies;
    }

    init(numBodies, p, q, !(benchmark || compareToCPU));
    
    reset(nbody, numBodies, NBODY_CONFIG_SHELL, !(benchmark || compareToCPU));


    if (benchmark)
    {
        if (numIterations <= 0) 
            numIterations = 100;
        runBenchmark(numIterations);
    }
    else if (compareToCPU || regression)
    {
        compareResults(regression, numBodies);
    }
    else
    {
        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        cutilSafeCall(cudaEventRecord(startEvent, 0));
        glutMainLoop();
    }

    if (nbodyCPU)
        delete nbodyCPU;
    if (nbodyCUDA)
        delete nbodyCUDA;

    if (hPos)
        delete [] hPos;
    if (hVel)
        delete [] hVel;
    if (hColor)
        delete [] hColor;

    cutilSafeCall(cudaEventDestroy(startEvent));
    cutilSafeCall(cudaEventDestroy(stopEvent));
    cutilCheckError(cutDeleteTimer(demoTimer));

    return 0;
}
bool
runTestMax( int argc, char** argv, ReduceType datatype) 
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    cutGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
    
		shrLog("METHOD: MAX\n");
    shrLog("%d elements\n", size);
    shrLog("%d threads (max)\n", maxThreads);

    cpuFinalReduction = (cutCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == CUTTrue);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue);

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {

        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);

        T *h_idata = (T *) malloc(bytes);

        for(int i=0; i<size; i++) 
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
        if (numBlocks == 1) cpuFinalThreshold = 1;

        // allocate mem for the result on host side
        T* h_odata = (T*) malloc(numBlocks*sizeof(T));

        shrLog("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T* d_idata = NULL;
        T* d_odata = NULL;

        cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
        cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );

        // copy data directly to device memory
        cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
        cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );

        // warm-up
        maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);
        
        int testIterations = 100;

        unsigned int timer = 0;
        cutilCheckError( cutCreateTimer( &timer));
        
        T gpu_result = 0;

        gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction, 
                                        cpuFinalThreshold, timer,
                                        h_odata, d_idata, d_odata);

		double reduceTime = cutGetAverageTimerValue(timer) * 1e-3;
        shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = maxreduceCPU<T>(h_idata, size);

        double threshold = 1e-12;
        double diff = 0;
		
        if (datatype == REDUCE_INT)
        {
            shrLog("\nGPU result = %d\n", gpu_result);
            shrLog("CPU result = %d\n\n", cpu_result);
        }
        else
        {
            shrLog("\nGPU result = %f\n", gpu_result);
            shrLog("CPU result = %f\n\n", cpu_result);

            if (datatype == REDUCE_FLOAT)
                threshold = 1e-8 * size;
            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        cutilCheckError( cutDeleteTimer(timer) );
        free(h_idata);
        free(h_odata);

        cutilSafeCallNoSync(cudaFree(d_idata));
        cutilSafeCallNoSync(cudaFree(d_odata));

		if (datatype == REDUCE_INT) {
			return (gpu_result == cpu_result);
		} else {
			return (diff < threshold);
		}
	}
	return true;
}
////////////////////////////////////////////////////////////////////////////////
// initialize marching cubes
////////////////////////////////////////////////////////////////////////////////
void
initMC(int argc, char** argv)
{
    // parse command line arguments
    int n;
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", &n)) {
        gridSizeLog2.x = gridSizeLog2.y = gridSizeLog2.z = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridx", &n)) {
        gridSizeLog2.x = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridy", &n)) {
        gridSizeLog2.y = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridz", &n)) {
        gridSizeLog2.z = n;
    }

    char *filename;
    if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) {
        volumeFilename = filename;
    }

    gridSize = make_uint3(1<<gridSizeLog2.x, 1<<gridSizeLog2.y, 1<<gridSizeLog2.z);
    gridSizeMask = make_uint3(gridSize.x-1, gridSize.y-1, gridSize.z-1);
    gridSizeShift = make_uint3(0, gridSizeLog2.x, gridSizeLog2.x+gridSizeLog2.y);

    numVoxels = gridSize.x*gridSize.y*gridSize.z;
    voxelSize = make_float3(2.0f / gridSize.x, 2.0f / gridSize.y, 2.0f / gridSize.z);
    maxVerts = gridSize.x*gridSize.y*100;

    printf("grid: %d x %d x %d = %d voxels\n", gridSize.x, gridSize.y, gridSize.z, numVoxels);
    printf("max verts = %d\n", maxVerts);

#if SAMPLE_VOLUME
    // load volume data
    char* path = cutFindFilePath(volumeFilename, argv[0]);
    if (path == 0) {
        fprintf(stderr, "Error finding file '%s'\n", volumeFilename);
        cudaThreadExit();
        exit(EXIT_FAILURE);
    }

    int size = gridSize.x*gridSize.y*gridSize.z*sizeof(uchar);
    uchar *volume = loadRawFile(path, size);
    cutilSafeCall(cudaMalloc((void**) &d_volume, size));
    cutilSafeCall(cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice) );
    free(volume);

	bindVolumeTexture(d_volume);
#endif

    if (g_bQAReadback) {
        cudaMalloc((void **)&(d_pos),    maxVerts*sizeof(float)*4);
        cudaMalloc((void **)&(d_normal), maxVerts*sizeof(float)*4);
    } else {
        // create VBOs
        createVBO(&posVbo, maxVerts*sizeof(float)*4);
		// DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(posVbo) );
		cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_posvbo_resource, posVbo, 
							   cudaGraphicsMapFlagsWriteDiscard));

        createVBO(&normalVbo, maxVerts*sizeof(float)*4);
        // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(normalVbo));
		cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_normalvbo_resource, normalVbo, 
							   cudaGraphicsMapFlagsWriteDiscard));

    }

    // allocate textures
	allocateTextures(	&d_edgeTable, &d_triTable, &d_numVertsTable );

    // allocate device memory
    unsigned int memSize = sizeof(uint) * numVoxels;
    cutilSafeCall(cudaMalloc((void**) &d_voxelVerts,            memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelVertsScan,        memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelOccupied,         memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelOccupiedScan,     memSize));
    cutilSafeCall(cudaMalloc((void**) &d_compVoxelArray,   memSize));

    // initialize CUDPP scan
    CUDPPConfiguration config;
    config.algorithm    = CUDPP_SCAN;
    config.datatype     = CUDPP_UINT;
    config.op           = CUDPP_ADD;
    config.options      = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
    cudppPlan(&scanplan, config, numVoxels, 1, 0);
}
Example #10
0
int
main( int argc,char** argv)
{
	printf("hello world\n");

	if (!InitCUDA())
	{
		return 0;
	}



	int iter = 1000;
	int trainnum = 20;
	bool isProfiler = false;
	int intProfiler = 0;
	int testnum = -1;
	float maxtime = 0.0f;
	cutGetCmdLineArgumenti(argc, (const char**) argv, "train", &trainnum);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "iter", &iter);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "profiler", &intProfiler);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "test", &testnum);
	cutGetCmdLineArgumentf(argc, (const char**) argv, "maxtime", &maxtime);
	printf("%d\n", intProfiler);
	if(intProfiler)
	{
		isProfiler = true;
	}
	if(testnum == -1) testnum = trainnum /2;
	printf("Iter = %d\n", iter);
	printf("TrainNum = %d\n", trainnum);
	printf("TestNum = %d\n", testnum);

	CUT_DEVICE_INIT(argc, argv);


	cublasStatus status;
	status = cublasInit();
	if(status != CUBLAS_STATUS_SUCCESS)
	{
		printf("Can't init cublas\n");
		printf("%s\n", cudaGetErrorString(cudaGetLastError()));
		return -1;
	}


	Image* imageList = new Image[trainnum+testnum];
	read64("my_optdigits.tra", imageList, trainnum + testnum);

	const int warmUpTime = 3;
	if(!isProfiler)
	{
		freopen("verbose.txt", "w", stdout);
		for(int i=0;i< warmUpTime;i++)
		{
			runImage(argc, argv, imageList, trainnum < warmUpTime ? trainnum : warmUpTime, 0, 10, false, 0.0f);
		}
		freopen("CON", "w", stdout);
		printf("Warm-up complete.\n\n\n");
	}
#ifdef _DEBUG
	freopen("out.txt", "w", stdout);
#endif // _DEBUG
	runImage(argc, argv, imageList, trainnum, testnum, iter, true, maxtime);
	freopen("CON", "w", stdout);
	delete[] imageList;
	//TestReduce();
	
	cublasShutdown();
	if(!isProfiler)
	{
		CUT_EXIT(argc, argv);
	}
	//getchar();
	return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noqatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
	{
        g_bQAReadback = true;
        fpsLimit = frameCheckNumber;
    }
    if (argc > 1) {

        if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) {
            g_bOpenGLQA = true;
            fpsLimit = frameCheckNumber;
        }
    }

    printf("[%s] ", sSDKsample);
    if (g_bQAReadback) printf("(Automated Testing)\n");
    if (g_bOpenGLQA)   printf("(OpenGL Readback)\n");

    // Get the path of the filename
    char *filename;
    if (cutGetCmdLineArgumentstr(argc, (const char**) argv, "image", &filename)) {
        image_filename = filename;
    }
    // load image
    char* image_path = cutFindFilePath(image_filename, argv[0]);
    if (image_path == 0) {
        fprintf(stderr, "Error finding image file '%s'\n", image_filename);
        cudaThreadExit();
        exit(EXIT_FAILURE);
    }

    cutilCheckError( cutLoadPPM4ub(image_path, (unsigned char **) &h_img, &width, &height));
    if (!h_img) {
        printf("Error opening file '%s'\n", image_path);
        cudaThreadExit();
        exit(-1);
    }
    printf("Loaded '%s', %d x %d pixels\n", image_path, width, height);

    cutGetCmdLineArgumenti(argc, (const char**) argv, "threads", &nthreads);
    cutGetCmdLineArgumentf(argc, (const char**) argv, "sigma", &sigma);
    runBenchmark = cutCheckCmdLineFlag(argc, (const char**) argv, "bench");

    int device;
    struct cudaDeviceProp prop;
    cudaGetDevice( &device );
    cudaGetDeviceProperties( &prop, device );
    if( !strncmp( "Tesla", prop.name, 5 ) ) {
        printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n");
//        runBenchmark = CUTTrue;
        g_bQAReadback = true;
    }        

    // Benchmark or AutoTest mode detected, no OpenGL
    if (runBenchmark == CUTTrue || g_bQAReadback) {
        if( cutCheckCmdLineFlag( argc, (const char **)argv, "device" ) ) 
            cutilDeviceInit( argc, argv );
        else 
            cudaSetDevice( cutGetMaxGflopsDeviceId() );
    } else {

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(argc, argv);

        if( cutCheckCmdLineFlag( argc, (const char **)argv, "device" ) ) 
            cutilGLDeviceInit( argc, argv );
        else 
            cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
    }

    initCudaBuffers();

    if (g_bOpenGLQA) {
        g_CheckRender = new CheckBackBuffer(width, height, 4);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);
    }

    if (g_bQAReadback) {
        // This is the automated testing path
        g_CheckRender = new CheckBackBuffer(width, height, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        runAutoTest(argc, argv); 
        cleanup();
        cudaThreadExit();
        cutilExit(argc, argv);
    }

    if (runBenchmark) {
        benchmark(100);
        cleanup();
        cudaThreadExit();
        exit(0);
    }

    initGLBuffers();
    
    atexit(cleanup);
    
    glutMainLoop();

    cudaThreadExit();
    cutilExit(argc, argv);
}