/////////////////////////////////////////////////////
// Main program
/////////////////////////////////////////////////////
int main(const int argc, const char **argv)
{
    unsigned long start = START_SIZE;
#ifdef NVS
    unsigned long end = END_NVS;
#else
    /* sizeof(unsigned long) = 8 bytes */
    unsigned long end = END_TITAN;
#endif

    int stateDim = 1;   /* default stateDim = 1 */
    if (checkCmdLineFlag(argc, argv, "dim"))
        stateDim = getCmdLineArgumentInt(argc, argv, "dim");

    char *typeInput = 0;
    getCmdLineArgumentString(argc, (const char**)argv, "type", &typeInput);
    if (0 != typeInput){
    if (!strcasecmp(typeInput, "float"))
        runTest<float>(start, end, stateDim);
    else if (!strcasecmp(typeInput, "int"))
        runTest<int>(start, end, stateDim);
    else if (!strcasecmp(typeInput, "double"))
        runTest<double>(start, end, stateDim);
    }
    else
        runTest<double>(start, end, stateDim);


    exit(EXIT_SUCCESS);
}
Esempio n. 2
0
 // Initialization code to find the best CUDA Device
 int findCudaDevice(int argc, const char **argv)
 {
     cudaDeviceProp deviceProp;
     int devID = 0;
     // If the command-line has a device number specified, use it
     if (checkCmdLineFlag(argc, argv, "device")) {
         devID = getCmdLineArgumentInt(argc, argv, "device=");
         if (devID < 0) {
             printf("Invalid command line parameters\n");
             exit(-1);
         } else {
             devID = gpuDeviceInit(devID);
             if (devID < 0) {
                printf("exiting...\n");
                shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
                exit(-1);
             }
         }
     } else {
         // Otherwise pick the device with highest Gflops/s
         devID = gpuGetMaxGflopsDeviceId();
         checkCudaErrors( cudaSetDevice( devID ) );
         checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );
         printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
     }
     return devID;
 }
Esempio n. 3
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    numParticles = NUM_PARTICLES;
    uint gridDim = GRID_SIZE;
    numIterations = 0;

    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **) argv, "n"))
        {
            numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n");
        }

        if (checkCmdLineFlag(argc, (const char **) argv, "grid"))
        {
            gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid");
        }
    }

    gridSize.x = gridSize.y = gridSize.z = gridDim;
    printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z);
    printf("particles: %d\n", numParticles);


    if (checkCmdLineFlag(argc, (const char **) argv, "i"))
    {
        numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i");
    }

    cudaInit(argc, argv);

    initParticleSystem(numParticles, gridSize);
    initParams();

    if (numIterations <= 0) numIterations = 300;
    runBenchmark(numIterations, argv[0]);

    if (psystem)
    {
        delete psystem;
    }

    exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS);
}
Esempio n. 4
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    // parse arguments
    char *filename;

    printf("Starting bicubicTexture\n");

    if (checkCmdLineFlag(argc, (const char **) argv, "help"))
    {
        printHelp();
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "mode"))
    {
        g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode");

        if (g_FilterMode < MODE_NEAREST && g_FilterMode > MODE_CATMULL_ROM)
        {
            printf("Invalid Mode setting %d\n", g_FilterMode);
            exit(EXIT_FAILURE);
        }
    }

    if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename))
    {
        dumpFilename = filename;
        fpsLimit     = frameCheckNumber;

        // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode);
    }
    else
    {
        // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization
        initialize(argc, argv);
        glutMainLoop();
        sdkDeleteTimer(&timer);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    exit(EXIT_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    // parse arguments
    char *filename;

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    printf("Starting bicubicTexture\n");

    if (checkCmdLineFlag(argc, (const char **) argv, "help"))
    {
        printHelp();
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "mode"))
    {
        g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode");

        if (g_FilterMode < 0 || g_FilterMode >= NUM_MODES)
        {
            printf("Invalid Mode setting %d\n", g_FilterMode);
            exit(EXIT_FAILURE);
        }
    }

    if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename))
    {
        dumpFilename = filename;
        fpsLimit     = frameCheckNumber;

        // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode);
    }
    else
    {
        // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization
        initialize(argc, argv);
        glutMainLoop();
    }

    exit(EXIT_SUCCESS);
}
Esempio n. 6
0
// Initialization code to find the best CUDA Device
inline int findCudaDevice(int argc, const char **argv)
{
    cudaDeviceProp deviceProp;
    int devID = 0;

    // If the command-line has a device number specified, use it
    if (checkCmdLineFlag(argc, argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, argv, "device=");

        if (devID < 0)
        {
            printf("Invalid command line parameter\n ");
            exit(EXIT_FAILURE);
        }
        else
        {
            devID = gpuDeviceInit(devID);

            if (devID < 0)
            {
                printf("exiting...\n");
                exit(EXIT_FAILURE);
            }
        }
    }
    else
    {
        // Otherwise pick the device with highest Gflops/s
        devID = gpuGetMaxGflopsDeviceId();
        checkCudaErrors(cudaSetDevice(devID));
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    }

    return devID;
}
Esempio n. 7
0
//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    bool bTestResults = true;

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printf("\n> Command line options\n");
        showHelp();
        return 0;
    }

    printf("Run \"nbody -benchmark [-numbodies=<numBodies>]\" to measure perfomance.\n");
    showHelp();

    bFullscreen  = (checkCmdLineFlag(argc, (const char **) argv, "fullscreen") != 0);

    if (bFullscreen)
    {
        bShowSliders = false;
    }

    benchmark    = (checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0);

    compareToCPU = ((checkCmdLineFlag(argc, (const char **) argv, "compare") != 0) ||
                    (checkCmdLineFlag(argc, (const char **) argv, "qatest")  != 0));

    QATest       = (checkCmdLineFlag(argc, (const char **) argv, "qatest")  != 0);
    useHostMem   = (checkCmdLineFlag(argc, (const char **) argv, "hostmem") != 0);
    fp64         = (checkCmdLineFlag(argc, (const char **) argv, "fp64") != 0);

    flopsPerInteraction = fp64 ? 30 : 20;

    useCpu       = (checkCmdLineFlag(argc, (const char **) argv, "cpu") != 0);

    if (checkCmdLineFlag(argc, (const char **)argv, "numdevices"))
    {
        numDevsRequested = getCmdLineArgumentInt(argc, (const char **) argv, "numdevices");

        if (numDevsRequested < 1)
        {
            printf("Error: \"number of CUDA devices\" specified %d is invalid.  Value should be >= 1\n", numDevsRequested);
            exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
        }
        else
        {
            printf("number of CUDA devices  = %d\n", numDevsRequested);
        }
    }

    // for multi-device we currently require using host memory -- the devices share
    // data via the host
    if (numDevsRequested > 1)
    {
        useHostMem = true;
    }

    int numDevsAvailable = 0;
    bool customGPU = false;
    cudaGetDeviceCount(&numDevsAvailable);

    if (numDevsAvailable < numDevsRequested)
    {
        printf("Error: only %d Devices available, %d requested.  Exiting.\n", numDevsAvailable, numDevsRequested);
        exit(EXIT_SUCCESS);
    }

    printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed");
    printf("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video");
    printf("> %s precision floating point simulation\n", fp64 ? "Double" : "Single");
    printf("> %d Devices used for simulation\n", numDevsRequested);

    int devID;
    cudaDeviceProp props;

    if (useCpu)
    {
        useHostMem = true;
        compareToCPU = false;
        bSupportDouble = true;

#ifdef OPENMP
        printf("> Simulation with CPU using OpenMP\n");
#else
        printf("> Simulation with CPU\n");
#endif
    }

    // Initialize GL and GLUT if necessary
    if (!benchmark && !compareToCPU)
    {
        initGL(&argc, argv);
        initParameters();
    }

    
    if(!useCpu)
    {
        // Now choose the CUDA Device
        // Either without GL interop:
        if (benchmark || compareToCPU || useHostMem)
        {
            // Note if we are using host memory for the body system, we
            // don't use CUDA-GL interop.

            if (checkCmdLineFlag(argc, (const char **)argv, "device"))
            {
                customGPU = true;
            }

            devID = findCudaDevice(argc, (const char **)argv);
        }
        else   // or with GL interop:
        {
            if (checkCmdLineFlag(argc, (const char **)argv, "device"))
            {
                customGPU = true;
            }

            devID = findCudaGLDevice(argc, (const char **)argv);
        }

        checkCudaErrors(cudaGetDevice(&devID));
        checkCudaErrors(cudaGetDeviceProperties(&props, devID));

        bSupportDouble = true;

#if CUDART_VERSION < 4000

        if (numDevsRequested > 1)
        {
            printf("MultiGPU n-body requires CUDA 4.0 or later\n");
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }

#endif

        // Initialize devices
        if (numDevsRequested > 1 && customGPU)
        {
            printf("You can't use --numdevices and --device at the same time.\n");
            exit(EXIT_SUCCESS);
        }

        if (customGPU)
        {
            cudaDeviceProp props;
            checkCudaErrors(cudaGetDeviceProperties(&props, devID));
            printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);
        }
        else
        {
            for (int i = 0; i < numDevsRequested; i++)
            {
                cudaDeviceProp props;
                checkCudaErrors(cudaGetDeviceProperties(&props, i));

                printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);

                if (useHostMem)
                {
#if CUDART_VERSION >= 2020

                    if (!props.canMapHostMemory)
                    {
                        fprintf(stderr, "Device %d cannot map host memory!\n", devID);
                        cudaDeviceReset();
                        exit(EXIT_SUCCESS);
                    }

                    if (numDevsRequested > 1)
                    {
                        checkCudaErrors(cudaSetDevice(i));
                    }

                    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
                    fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n");
                    cudaDeviceReset();
                    exit(EXIT_SUCCESS);
#endif
                }
            }

            // CC 1.2 and earlier do not support double precision
            if (props.major*10 + props.minor <= 12)
            {
                bSupportDouble = false;
            }
        }

        //if(numDevsRequested > 1)
        //    checkCudaErrors(cudaSetDevice(devID));

        if (fp64 && !bSupportDouble)
        {
            fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n");
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }
    }

    numIterations = 0;
    p = 0;
    q = 1;

    if (checkCmdLineFlag(argc, (const char **)argv, "i"))
    {
        numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "p"))
    {
        p = getCmdLineArgumentInt(argc, (const char **)argv, "p");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "q"))
    {
        q = getCmdLineArgumentInt(argc, (const char **)argv, "q");
    }

    if (p == 0)   // p not set on command line
    {
        p = 256;

        if (q * p > 256)
        {
            p = 256 / q;
            printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256);
        }
    }

    // default number of bodies is #SMs * 4 * CTA size
    if (useCpu)
#ifdef OPENMP
        numBodies = 8192;

#else
        numBodies = 4096;
#endif
    else if (numDevsRequested == 1)
Esempio n. 8
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    printf("%s Starting...\n\n", sSDKsample);

    numParticles = NUM_PARTICLES;
    maxNumParticles = MAX_NUM_PARTICLES;
    uint gridDim = GRID_SIZE;
    numIterations = 0;
    printf("Surely I can get this far\n");

    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **) argv, "n"))
        {
            numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n");
        }

        if (checkCmdLineFlag(argc, (const char **) argv, "grid"))
        {
            gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile);
            fpsLimit = frameCheckNumber;
            numIterations = 1;
        }
    }

    //*******************************************************
    // RMK Hard code for cylindrical coords (y=theta=1)
    // DomainSize
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/RVert.txt";
	char Zfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/ZVert.txt";
	char Rfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/RVert.txt";
	numVelNodes.x = filecount(Rfile); //-1;
	numVelNodes.z = filecount(Zfile); //-1;
	numVelNodes.y = 1;
	numCells.x = 80; //47; //24; //29;
	numCells.y = 1;
	numCells.z = 160; //188; //95; //88;
	numParticles = numCells.x*numCells.z*20; //avgnumparticles = 40
	srand( time( NULL ) );
	//numParticles = numCells.x*numCells.z*40;
	printf("vel grid: %d x %d x %d = %d cells\n", numVelNodes.x, numVelNodes.y, numVelNodes.z, numVelNodes.x*numVelNodes.y*numVelNodes.z);
	printf("    grid: %d x %d x %d = %d cells\n", numCells.x, numCells.y, numCells.z, numCells.x*numCells.y*numCells.z);
	//printf("vel grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z);

    bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0;

    if (checkCmdLineFlag(argc, (const char **) argv, "i"))
    {
        numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i");
    }
    if (g_refFile)
    {
        cudaInit(argc, argv);
    }
    else
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            printf("[%s]\n", argv[0]);
            printf("   Does not explicitly support -device=n in OpenGL mode\n");
            printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
            printf(" > %s -device=n -file=<*.bin>\n", argv[0]);
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }

        initGL(&argc, argv);
        cudaGLInit(argc, argv);
    }

    // Moved code snippet to CellSystem
    //initCellSystem(gridSize);
    // now moved to particlesystem

    printf("Begin initialization\n");

    //initParticleSystem(numParticles, gridSize, g_refFile==NULL);
    initParticleSystem(maxNumParticles, numParticles, numVelNodes, numCells, g_refFile==NULL);
    //printf("Finished with initParticleSystem, %d\n",g_refFile==NULL);
    //cin.ignore();
    initParams();

    printf("Finished with initialization\n");

    if (!g_refFile)
    {
        initMenus();
    }

    if (benchmark || g_refFile)
    {
        if (numIterations <= 0)
        {
            numIterations = 300;
        }

        runBenchmark(numIterations, argv[0]);
    }
    else
    {
        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        atexit(cleanup);

        glutMainLoop();
    }

    if (psystem)
    {
        delete psystem;
    }

    cudaDeviceReset();
    exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS);
}
Esempio n. 9
0
void parseCommandLineArguments(int argc, char *argv[])
{
    char video_file[256];

    printf("Command Line Arguments:\n");

    for (int n=0; n < argc; n++)
    {
        printf("argv[%d] = %s\n", n, argv[n]);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        displayHelp();
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "decodecuda"))
    {
        g_eVideoCreateFlags = cudaVideoCreate_PreferCUDA;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "decodedxva"))
    {
        g_eVideoCreateFlags = cudaVideoCreate_PreferDXVA;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "decodecuvid"))
    {
        g_eVideoCreateFlags = cudaVideoCreate_PreferCUVID;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "vsync"))
    {
        g_bUseVsync = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "novsync"))
    {
        g_bUseVsync = false;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "repeatframe"))
    {
        g_bFrameRepeat = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "framestep"))
    {
        g_bFrameStep     = true;
        g_bUseDisplay    = true;
        g_bUseInterop    = true;
        g_fpsLimit = 1;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "updateall"))
    {
        g_bUpdateAll     = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "displayvideo"))
    {
        g_bUseDisplay   = true;
        g_bUseInterop   = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "nointerop"))
    {
        g_bUseInterop   = false;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "readback"))
    {
        g_bReadback     = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        g_DeviceID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        g_bUseDisplay    = true;
        g_bUseInterop    = true;
    }

    if (g_bUseDisplay == false)
    {
        g_bQAReadback    = true;
        g_bUseInterop    = false;
    }

    if (g_bLoop == false)
    {
        g_bAutoQuit = true;
    }

    // Search all command file parameters for video files with extensions:
    // mp4, avc, mkv, 264, h264. vc1, wmv, mp2, mpeg2, mpg
    char *file_ext = NULL;

    for (int i=1; i < argc; i++)
    {
        if (getFileExtension(argv[i], &file_ext) > 0)
        {
            strcpy(video_file, argv[i]);
            break;
        }
    }

    // We load the default video file for the SDK sample
    if (file_ext == NULL)
    {
        strcpy(video_file, sdkFindFilePath(VIDEO_SOURCE_FILE, argv[0]));
    }

    // Now verify the video file is legit
    FILE *fp = fopen(video_file, "r");

    if (video_file == NULL && fp == NULL)
    {
        printf("[%s]: unable to find file: <%s>\nExiting...\n", sAppFilename, VIDEO_SOURCE_FILE);
        exit(EXIT_FAILURE);
    }

    if (fp)
    {
        fclose(fp);
    }

    // default video file loaded by this sample
    sFileName = video_file;

    // store the current path so we can reinit the CUDA context
    strcpy(exec_path, argv[0]);

    printf("[%s]: input file: <%s>\n", sAppFilename, video_file);
}
int main(int argc, char **argv)
{
    char *dump_file = NULL;

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv,
                                 "file", (char **) &dump_file);

        int kernel = 1;

        if (checkCmdLineFlag(argc, (const char **)argv, "kernel"))
        {
            kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel");
        }

        runAutoTest(argc, argv, dump_file, kernel);
    }
    else
    {
        printf("[%s]\n", sSDKsample);

        // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            printf("[%s]\n", argv[0]);
            printf("   Does not explicitly support -device=n in OpenGL mode\n");
            printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
            printf(" > %s -device=n -qatest\n", argv[0]);
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }

        // First load the image, so we know what the size of the image (imageW and imageH)
        printf("Allocating host and CUDA memory and loading image file...\n");
        const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]);

        if (image_path == NULL)
        {
            printf("imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n");
            exit(EXIT_FAILURE);
        }

        LoadBMPFile(&h_Src, &imageW, &imageH, image_path);
        printf("Data init done.\n");

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(&argc, argv);
        cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId());

        checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH));

        initOpenGLBuffers();
    }

    printf("Starting GLUT main loop...\n");
    printf("Press [1] to view noisy image\n");
    printf("Press [2] to view image restored with knn filter\n");
    printf("Press [3] to view image restored with nlm filter\n");
    printf("Press [4] to view image restored with modified nlm filter\n");
    printf("Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is active\n");
    printf("Press [f] to print frame rate\n");
    printf("Press [?] to print Noise and Lerp Ct's\n");
    printf("Press [q] to exit\n");

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    glutMainLoop();
}
bool
runTest(int argc, char **argv, ReduceType datatype)
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    if (checkCmdLineFlag(argc, (const char **) argv, "n"))
    {
        size = getCmdLineArgumentInt(argc, (const char **) argv, "n");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
    {
        maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "kernel"))
    {
        whichKernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
    {
        maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks");
    }

    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);

    cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");

    if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
    {
        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
    }

    bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo");

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);

        T *h_idata = (T *) malloc(bytes);

        for (int i=0; i<size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
            {
                h_idata[i] = (T)(rand() & 0xFF);
            }
            else
            {
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
            }
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

        if (numBlocks == 1)
        {
            cpuFinalThreshold = 1;
        }

        // allocate mem for the result on host side
        T *h_odata = (T *) malloc(numBlocks*sizeof(T));

        printf("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T *d_idata = NULL;
        T *d_odata = NULL;

        checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
        checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T)));

        // copy data directly to device memory
        checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice));

        // warm-up
        reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        int testIterations = 100;

        StopWatchInterface *timer = 0;
        sdkCreateTimer(&timer);

        T gpu_result = 0;

        gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction,
                                        cpuFinalThreshold, timer,
                                        h_odata, d_idata, d_odata);

        double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
        printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = reduceCPU<T>(h_idata, size);

        int precision = 0;
        double threshold = 0;
        double diff = 0;

        if (datatype == REDUCE_INT)
        {
            printf("\nGPU result = %d\n", (int)gpu_result);
            printf("CPU result = %d\n\n", (int)cpu_result);
        }
        else
        {
            if (datatype == REDUCE_FLOAT)
            {
                precision = 8;
                threshold = 1e-8 * size;
            }
            else
            {
                precision = 12;
                threshold = 1e-12 * size;
            }

            printf("\nGPU result = %.*f\n", precision, (double)gpu_result);
            printf("CPU result = %.*f\n\n", precision, (double)cpu_result);

            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        sdkDeleteTimer(&timer);
        free(h_idata);
        free(h_odata);

        checkCudaErrors(cudaFree(d_idata));
        checkCudaErrors(cudaFree(d_odata));

        if (datatype == REDUCE_INT)
        {
            return (gpu_result == cpu_result);
        }
        else
        {
            return (diff < threshold);
        }
    }

    return true;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    printf("%s Starting...\n\n", sSDKsample);

    printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n");

    numParticles = NUM_PARTICLES;
    uint gridDim = GRID_SIZE;
    numIterations = 0;

    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **) argv, "n"))
        {
            numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n");
        }

        if (checkCmdLineFlag(argc, (const char **) argv, "grid"))
        {
            gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile);
            fpsLimit = frameCheckNumber;
            numIterations = 1;
        }
    }

    gridSize.x = gridSize.y = gridSize.z = gridDim;
    printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z);
    printf("particles: %d\n", numParticles);

    bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0;

    if (checkCmdLineFlag(argc, (const char **) argv, "i"))
    {
        numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i");
    }

    if (g_refFile)
    {
        cudaInit(argc, argv);
    }
    else
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            printf("[%s]\n", argv[0]);
            printf("   Does not explicitly support -device=n in OpenGL mode\n");
            printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
            printf(" > %s -device=n -file=<*.bin>\n", argv[0]);
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }

        initGL(&argc, argv);
        cudaGLInit(argc, argv);
    }

    initParticleSystem(numParticles, gridSize, g_refFile==NULL);
    initParams();

    if (!g_refFile)
    {
        initMenus();
    }

    if (benchmark || g_refFile)
    {
        if (numIterations <= 0)
        {
            numIterations = 300;
        }

        runBenchmark(numIterations, argv[0]);
    }
    else
    {
        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        glutCloseFunc(cleanup);

        glutMainLoop();
    }

    if (psystem)
    {
        delete psystem;
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS);
}
void initData(int argc, char **argv)
{
    // parse arguments
    char *filename;

    if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename))
    {
        volumeFilename = filename;
    }

    int n;


    if (checkCmdLineFlag(argc, (const char **) argv, "size"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "size");
        volumeSize.width = volumeSize.height = volumeSize.depth = n;
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "xsize"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "xsize");
        volumeSize.width = n;
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "ysize"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "ysize");
        volumeSize.height = n;
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "zsize"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "zsize");
        volumeSize.depth = n;
    }

    char *path = sdkFindFilePath(volumeFilename, argv[0]);

    if (path == 0)
    {
        printf("Error finding file '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType);
    void *h_volume = loadRawFile(path, size);

    FilterKernel_init();
    Volume_init(&volumeOriginal,volumeSize, h_volume, 0);
    free(h_volume);
    Volume_init(&volumeFilter0, volumeSize, NULL, 1);
    Volume_init(&volumeFilter1, volumeSize, NULL, 1);
    VolumeRender_init();
    VolumeRender_setPreIntegrated(preIntegrated);
    VolumeRender_setVolume(&volumeOriginal);

    sdkCreateTimer(&timer);
    sdkCreateTimer(&animationTimer);
    sdkStartTimer(&animationTimer);

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));
}
Esempio n. 14
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // start logs
    int devID;
    char *ref_file = NULL;
    printf("%s Starting...\n\n", argv[0]);

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "radius"))
        {
            filter_radius = getCmdLineArgumentInt(argc, (const char **) argv, "radius");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "passes"))
        {
            iterations = getCmdLineArgumentInt(argc, (const char **)argv, "passes");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file);
        }
    }

    // load image to process
    loadImageData(argc, argv);

    if (checkCmdLineFlag(argc, (const char **)argv, "benchmark"))
    {
        // This is a separate mode of the sample, where we are benchmark the kernels for performance
        devID = findCudaDevice(argc, (const char **)argv);

        // Running CUDA kernels (bilateralfilter) in Benchmarking mode
        g_TotalErrors += runBenchmark(argc, argv);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
    else if (checkCmdLineFlag(argc, (const char **)argv, "radius") ||
             checkCmdLineFlag(argc, (const char **)argv, "passes"))
    {
        // This overrides the default mode.  Users can specify the radius used by the filter kernel
        devID = findCudaDevice(argc, (const char **)argv);
        g_TotalErrors += runSingleTest(ref_file, argv[0]);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
    else
    {
        // Default mode running with OpenGL visualization and in automatic mode
        // the output automatically changes animation
        printf("\n");

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(argc, (char **)argv);
        int dev = findCapableDevice(argc, argv);

        if (dev != -1)
        {
            dev = gpuGLDeviceInit(argc, (const char **)argv);

            if (dev == -1)
            {
                exit(EXIT_FAILURE);
            }
        }
        else
        {
            // cudaDeviceReset causes the driver to clean up all state. While
            // not mandatory in normal operation, it is good practice.  It is also
            // needed to ensure correct operation when the application is being
            // profiled. Calling cudaDeviceReset causes all profile data to be
            // flushed before the application exits
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }

        // Now we can create a CUDA context and bind it to the OpenGL context
        initCuda();
        initGLResources();

        // sets the callback function so it will call cleanup upon exit
#if defined (__APPLE__) || defined(MACOSX)
        atexit(cleanup);
#else
        glutCloseFunc(cleanup);
#endif

        printf("Running Standard Demonstration with GLUT loop...\n\n");
        printf("Press '+' and '-' to change filter width\n"
               "Press ']' and '[' to change number of iterations\n"
               "Press 'e' and 'E' to change Euclidean delta\n"
               "Press 'g' and 'G' to changle Gaussian delta\n"
               "Press 'a' or  'A' to change Animation mode ON/OFF\n\n");

        // Main OpenGL loop that will run visualization for every vsync
        glutMainLoop();
    }
}
Esempio n. 15
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    int devID = 0;
    char *ref_file = NULL;

    pArgc = &argc;
    pArgv = argv;

    // start logs
    printf("%s Starting...\n\n", argv[0]);

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printHelp();
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "threads"))
        {
            nthreads      = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "radius"))
        {
            filter_radius = getCmdLineArgumentInt(argc, (const char **) argv, "radius");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "passes"))
        {
            iterations = getCmdLineArgumentInt(argc, (const char **) argv, "passes");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file);
        }
    }

    // load image to process
    loadImageData(argc, argv);

    if (checkCmdLineFlag(argc, (const char **)argv, "benchmark"))
    {
        // This is a separate mode of the sample, where we are benchmark the kernels for performance
        devID = findCudaDevice(argc, (const char **)argv);

        // Running CUDA kernels (boxfilter) in Benchmarking mode
        g_TotalErrors += runBenchmark();
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
    else if (checkCmdLineFlag(argc, (const char **)argv, "radius") ||
             checkCmdLineFlag(argc, (const char **)argv, "passes"))
    {
        // This overrides the default mode.  Users can specify the radius used by the filter kernel
        devID = findCudaDevice(argc, (const char **)argv);
        g_TotalErrors += runSingleTest(ref_file, argv[0]);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
    else
    {
        // Default mode running with OpenGL visualization and in automatic mode
        // the output automatically changes animation
        printf("\n");

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(&argc, argv);
        int dev = findCapableDevice(argc, argv);

        if (dev != -1)
        {
            cudaGLSetGLDevice(dev);
        }
        else
        {
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }

        // Now we can create a CUDA context and bind it to the OpenGL context
        initCuda();
        initGLResources();

        // sets the callback function so it will call cleanup upon exit
        atexit(cleanup);

        printf("Running Standard Demonstration with GLUT loop...\n\n");
        printf("Press '+' and '-' to change filter width\n"
               "Press ']' and '[' to change number of iterations\n"
               "Press 'a' or  'A' to change animation ON/OFF\n\n");

        // Main OpenGL loop that will run visualization for every vsync
        glutMainLoop();

        cudaDeviceReset();
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
}
Esempio n. 16
0
HRESULT initCudaResources(int argc, char **argv, int bUseInterop, int bTCC)
{
    HRESULT hr = S_OK;

    CUdevice cuda_device;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        cuda_device = getCmdLineArgumentInt(argc, (const char **) argv, "device");

        // If interop is disabled, then we need to create a CUDA context w/o the GL context
        if (bUseInterop && !bTCC)
        {
            cuda_device = findCudaDeviceDRV(argc, (const char **)argv);
        }
        else
        {
            cuda_device = findCudaGLDeviceDRV(argc, (const char **)argv);
        }

        if (cuda_device < 0)
        {
            printf("No CUDA Capable devices found, exiting...\n");
            exit(EXIT_SUCCESS);
        }

        checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
    }
    else
    {
        // If we want to use Graphics Interop, then choose the GPU that is capable
        if (bUseInterop)
        {
            cuda_device = gpuGetMaxGflopsGLDeviceIdDRV();
            checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
        }
        else
        {
            cuda_device = gpuGetMaxGflopsDeviceIdDRV();
            checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
        }
    }

    // get compute capabilities and the devicename
    int major, minor;
    size_t totalGlobalMem;
    char deviceName[256];
    checkCudaErrors(cuDeviceComputeCapability(&major, &minor, g_oDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, 256, g_oDevice));
    printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor);

    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, g_oDevice));
    printf("  Total amount of global memory:     %4.4f MB\n", (float)totalGlobalMem/(1024*1024));

    // Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
    // (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
    if (bUseInterop)
    {
        checkCudaErrors(cuD3D9CtxCreate(&g_oContext, &g_oDevice, CU_CTX_BLOCKING_SYNC, g_pD3DDevice));
    }
    else
    {
        checkCudaErrors(cuCtxCreate(&g_oContext, CU_CTX_BLOCKING_SYNC, g_oDevice));
    }

    try
    {
        // Initialize CUDA releated Driver API (32-bit or 64-bit), depending the platform running
        if (sizeof(void *) == 4)
        {
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", exec_path, 2, 2, 2);
        }
        else
        {
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", exec_path, 2, 2, 2);
        }
    }
    catch (char const *p_file)
    {
        // If the CUmoduleManager constructor fails to load the PTX file, it will throw an exception
        printf("\n>> CUmoduleManager::Exception!  %s not found!\n", p_file);
        printf(">> Please rebuild NV12ToARGB_drvapi.cu or re-install this sample.\n");
        return E_FAIL;
    }

    g_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi",   &gfpNV12toARGB);
    g_pCudaModule->GetCudaFunction("Passthru_drvapi",     &gfpPassthru);

    /////////////////Change///////////////////////////
    // Now we create the CUDA resources and the CUDA decoder context
    initCudaVideo();

    if (bUseInterop)
    {
        initD3D9Surface(g_pVideoDecoder->targetWidth(),
                        g_pVideoDecoder->targetHeight());
    }
    else
    {
        checkCudaErrors(cuMemAlloc(&g_pInteropFrame[0], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2));
        checkCudaErrors(cuMemAlloc(&g_pInteropFrame[1], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2));
    }

    CUcontext cuCurrent = NULL;
    CUresult result = cuCtxPopCurrent(&cuCurrent);

    if (result != CUDA_SUCCESS)
    {
        printf("cuCtxPopCurrent: %d\n", result);
        assert(0);
    }

    /////////////////////////////////////////
    return ((g_pCudaModule && g_pVideoDecoder && (g_pImageDX || g_pInteropFrame[0])) ? S_OK : E_FAIL);
}
Esempio n. 17
0
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // Start logs
    shrQAStart(argc, argv);

    // initialize the GPU, either identified by --device
    // or by picking the device with highest flop rate.
    int devID = findCudaDevice(argc, (const char **)argv);

    // parsing the number of random numbers to generate
    int rand_n = DEFAULT_RAND_N;
    if( checkCmdLineFlag(argc, (const char**) argv, "count") )  
    {       
        rand_n = getCmdLineArgumentInt(argc, (const char**) argv, "count"); 
    }
    printf("Allocating data for %i samples...\n", rand_n);
     
    // parsing the seed
    int seed = DEFAULT_SEED;
    if( checkCmdLineFlag(argc, (const char**) argv, "seed") ) 
    {       
        seed = getCmdLineArgumentInt(argc, (const char**) argv, "seed"); 
    }
    printf("Seeding with %i ...\n", seed);
    

    float *d_Rand; 
    checkCudaErrors( cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)) );
    
    curandGenerator_t prngGPU;
    checkCurandErrors( curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32) ); 
    checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngGPU, seed) );

    curandGenerator_t prngCPU;
    checkCurandErrors( curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32) ); 
    checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngCPU, seed) );

    //
    // Example 1: Compare random numbers generated on GPU and CPU
    float *h_RandGPU  = (float *)malloc(rand_n * sizeof(float));

    printf("Generating random numbers on GPU...\n\n");
    checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) );

    printf("\nReading back the results...\n");
    checkCudaErrors( cudaMemcpy(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost) );

    
    float *h_RandCPU  = (float *)malloc(rand_n * sizeof(float));
     
    printf("Generating random numbers on CPU...\n\n");
    checkCurandErrors( curandGenerateUniform(prngCPU, (float*) h_RandCPU, rand_n) ); 
 
    printf("Comparing CPU/GPU random numbers...\n\n");
    float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); 
    
    //
    // Example 2: Timing of random number generation on GPU
    const int numIterations = 10;
    int i;
    StopWatchInterface *hTimer;

    checkCudaErrors( cudaDeviceSynchronize() );
    sdkCreateTimer(&hTimer);
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (i = 0; i < numIterations; i++)
    {
        checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) );
    }

    checkCudaErrors( cudaDeviceSynchronize() );
    sdkStopTimer(&hTimer);

    double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations;

    printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", 
               1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); 

    printf("Shutting down...\n");

    checkCurandErrors( curandDestroyGenerator(prngGPU) );
    checkCurandErrors( curandDestroyGenerator(prngCPU) );
    checkCudaErrors( cudaFree(d_Rand) );
    sdkDeleteTimer( &hTimer);
    free(h_RandGPU);
    free(h_RandCPU);

    cudaDeviceReset();	
    shrQAFinishExit(argc, (const char**)argv, (L1norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
    // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
    cudaError_t error;
    devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        error = cudaSetDevice(devID);

        if (error != cudaSuccess)
        {
            printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    }

    // get number of SMs on this GPU
    error = cudaGetDevice(&devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }


    if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
    {
        iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
    }

    iSizeMultiple = min(iSizeMultiple, 10);
    iSizeMultiple = max(iSizeMultiple, 1);

    cudaDeviceProp deviceProp;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    matrix_size.uiWA = 2 * block_size * iSizeMultiple;
    matrix_size.uiHA = 4 * block_size * iSizeMultiple;
    matrix_size.uiWB = 2 * block_size * iSizeMultiple;
    matrix_size.uiHB = 4 * block_size * iSizeMultiple;
    matrix_size.uiWC = 2 * block_size * iSizeMultiple;
    matrix_size.uiHC = 4 * block_size * iSizeMultiple;

    printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
           matrix_size.uiWA, matrix_size.uiHA,
           matrix_size.uiWB, matrix_size.uiHB,
           matrix_size.uiWC, matrix_size.uiHC);
}
void runAutoTest(int argc, char *argv[])
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);
    int devID = findCudaDevice(argc, (const char **)argv);

    loadDefaultImage(argv[0]);

    Pixel *d_result;
    checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel)));

    char *ref_file = NULL;
    char  dump_file[256];

    int mode = 0;
    mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode");
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);

    switch (mode)
    {
        case 0:
            g_SobelDisplayMode = SOBELDISPLAY_IMAGE;
            sprintf(dump_file, "lena_orig.pgm");
            break;

        case 1:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX;
            sprintf(dump_file, "lena_tex.pgm");
            break;

        case 2:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED;
            sprintf(dump_file, "lena_shared.pgm");
            break;

        default:
            printf("Invalid Filter Mode File\n");
            exit(EXIT_FAILURE);
            break;
    }

    printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);
    sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale);
    checkCudaErrors(cudaDeviceSynchronize());

    unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel));
    checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost));
    sdkSavePGM(dump_file, h_result, imWidth, imHeight);

    if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false))
    {
        g_TotalErrors++;
    }

    checkCudaErrors(cudaFree(d_result));
    free(h_result);

    if (g_TotalErrors != 0)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed!\n");
    exit(EXIT_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;
    char *ref_file = NULL;

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    printf("%s Starting...\n\n", sSDKsample);

    printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n");

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
            fpsLimit = frameCheckNumber;
        }
    }

    // Get the path of the filename
    char *filename;

    if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename))
    {
        image_filename = filename;
    }

    // load image
    char *image_path = sdkFindFilePath(image_filename, argv[0]);

    if (image_path == NULL)
    {
        fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height);

    if (!h_img)
    {
        printf("Error unable to load PPM file: '%s'\n", image_path);
        exit(EXIT_FAILURE);
    }

    printf("Loaded '%s', %d x %d pixels\n", image_path, width, height);

    if (checkCmdLineFlag(argc, (const char **)argv, "threads"))
    {
        nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "sigma"))
    {
        sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma");
    }

    runBenchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark");

    int device;
    struct cudaDeviceProp prop;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop, device);

    if (!strncmp("Tesla", prop.name, 5))
    {
        printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n");
        //        runBenchmark = true;
        runBenchmark = true;
    }

    // Benchmark or AutoTest mode detected, no OpenGL
    if (runBenchmark == true || ref_file != NULL)
    {
        findCudaDevice(argc, (const char **)argv);
    }
    else
    {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(&argc, argv);
        findCudaGLDevice(argc, (const char **)argv);
    }

    initCudaBuffers();

    if (ref_file)
    {
        printf("(Automated Testing)\n");
        bool testPassed = runSingleTest(ref_file, argv[0]);

        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();

        exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE);
    }

    if (runBenchmark)
    {
        printf("(Run Benchmark)\n");
        benchmark(100);

        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();

        exit(EXIT_SUCCESS);
    }

    initGLBuffers();
    glutMainLoop();

    exit(EXIT_SUCCESS);
}