Example #1
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    // parse arguments
    char *filename;

    printf("Starting bicubicTexture\n");

    if (checkCmdLineFlag(argc, (const char **) argv, "help"))
    {
        printHelp();
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "mode"))
    {
        g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode");

        if (g_FilterMode < MODE_NEAREST && g_FilterMode > MODE_CATMULL_ROM)
        {
            printf("Invalid Mode setting %d\n", g_FilterMode);
            exit(EXIT_FAILURE);
        }
    }

    if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename))
    {
        dumpFilename = filename;
        fpsLimit     = frameCheckNumber;

        // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode);
    }
    else
    {
        // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization
        initialize(argc, argv);
        glutMainLoop();
        sdkDeleteTimer(&timer);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    exit(EXIT_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    // parse arguments
    char *filename;

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    printf("Starting bicubicTexture\n");

    if (checkCmdLineFlag(argc, (const char **) argv, "help"))
    {
        printHelp();
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "mode"))
    {
        g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode");

        if (g_FilterMode < 0 || g_FilterMode >= NUM_MODES)
        {
            printf("Invalid Mode setting %d\n", g_FilterMode);
            exit(EXIT_FAILURE);
        }
    }

    if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename))
    {
        dumpFilename = filename;
        fpsLimit     = frameCheckNumber;

        // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode);
    }
    else
    {
        // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization
        initialize(argc, argv);
        glutMainLoop();
    }

    exit(EXIT_SUCCESS);
}
/////////////////////////////////////////////////////
// Main program
/////////////////////////////////////////////////////
int main(const int argc, const char **argv)
{
    unsigned long start = START_SIZE;
#ifdef NVS
    unsigned long end = END_NVS;
#else
    /* sizeof(unsigned long) = 8 bytes */
    unsigned long end = END_TITAN;
#endif

    int stateDim = 1;   /* default stateDim = 1 */
    if (checkCmdLineFlag(argc, argv, "dim"))
        stateDim = getCmdLineArgumentInt(argc, argv, "dim");

    char *typeInput = 0;
    getCmdLineArgumentString(argc, (const char**)argv, "type", &typeInput);
    if (0 != typeInput){
    if (!strcasecmp(typeInput, "float"))
        runTest<float>(start, end, stateDim);
    else if (!strcasecmp(typeInput, "int"))
        runTest<int>(start, end, stateDim);
    else if (!strcasecmp(typeInput, "double"))
        runTest<double>(start, end, stateDim);
    }
    else
        runTest<double>(start, end, stateDim);


    exit(EXIT_SUCCESS);
}
Example #4
0
 // Initialization code to find the best CUDA Device
 int findCudaDevice(int argc, const char **argv)
 {
     cudaDeviceProp deviceProp;
     int devID = 0;
     // If the command-line has a device number specified, use it
     if (checkCmdLineFlag(argc, argv, "device")) {
         devID = getCmdLineArgumentInt(argc, argv, "device=");
         if (devID < 0) {
             printf("Invalid command line parameters\n");
             exit(-1);
         } else {
             devID = gpuDeviceInit(devID);
             if (devID < 0) {
                printf("exiting...\n");
                shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
                exit(-1);
             }
         }
     } else {
         // Otherwise pick the device with highest Gflops/s
         devID = gpuGetMaxGflopsDeviceId();
         checkCudaErrors( cudaSetDevice( devID ) );
         checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );
         printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
     }
     return devID;
 }
Example #5
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    numParticles = NUM_PARTICLES;
    uint gridDim = GRID_SIZE;
    numIterations = 0;

    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **) argv, "n"))
        {
            numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n");
        }

        if (checkCmdLineFlag(argc, (const char **) argv, "grid"))
        {
            gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid");
        }
    }

    gridSize.x = gridSize.y = gridSize.z = gridDim;
    printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z);
    printf("particles: %d\n", numParticles);


    if (checkCmdLineFlag(argc, (const char **) argv, "i"))
    {
        numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i");
    }

    cudaInit(argc, argv);

    initParticleSystem(numParticles, gridSize);
    initParams();

    if (numIterations <= 0) numIterations = 300;
    runBenchmark(numIterations, argv[0]);

    if (psystem)
    {
        delete psystem;
    }

    exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    char device_name[256];
    char *ref_file = NULL;

    pArgc = &argc;
    pArgv = argv;

    printf("[%s] - Starting...\n", SDK_name);

    if (!findCUDADevice())               // Search for CUDA GPU
    {
        printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name);
        exit(EXIT_SUCCESS);
    }

    if (!dynlinkLoadD3D10API())              // Search for D3D API (locate drivers, does not mean device is found)
    {
        printf("> D3D10 API libraries NOT found.. Exiting.\n");
        dynlinkUnloadD3D10API();
        exit(EXIT_SUCCESS);
    }

    if (!findDXDevice(device_name))          // Search for D3D Hardware Device
    {
        printf("> D3D10 Graphics Device NOT found.. Exiting.\n");
        dynlinkUnloadD3D10API();
        exit(EXIT_SUCCESS);
    }

    // command line options
    if (argc > 1)
    {
        // automatied build testing harness
        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
            getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // run D3D10/CUDA test
    runTest(argc, argv, ref_file);

    //
    // and exit
    //
    printf("%s running on %s exiting...\n", SDK_name, device_name);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #7
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    char *ref_file = NULL;

    printf("%s Starting...\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        fpsLimit = frameCheckNumber;
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    if (ref_file)
    {
        chooseCudaDevice(argc, argv, false);

        loadVolumeData(argv[0]);

        runAutoTest(ref_file, argv[0]);
    }
    else
    {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(&argc, argv);

        // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        chooseCudaDevice(argc, argv, true);

        // OpenGL buffers
        initGLBuffers();

        loadVolumeData(argv[0]);
    }

    printf("Press space to toggle animation\n"
           "Press '+' and '-' to change displayed slice\n");

#if defined (__APPLE__) || defined(MACOSX)
    atexit(cleanup);
#else
    glutCloseFunc(cleanup);
#endif

    glutMainLoop();

    exit(EXIT_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    char device_name[256];
    char *ref_file = NULL;

    pArgc = &argc;
    pArgv = argv;

    printf("[%s] - Starting...\n", SDK_name);

    if (!findCUDADevice())               // Search for CUDA GPU
    {
        printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name);
        exit(EXIT_SUCCESS);
    }

    if (!dynlinkLoadD3D10API())              // Search for D3D API (locate drivers, does not mean device is found)
    {
        printf("> D3D10 API libraries NOT found.. Exiting.\n");
        dynlinkUnloadD3D10API();
        exit(EXIT_SUCCESS);
    }

    if (!findDXDevice(device_name))          // Search for D3D Hardware Device
    {
        printf("> D3D10 Graphics Device NOT found.. Exiting.\n");
        dynlinkUnloadD3D10API();
        exit(EXIT_SUCCESS);
    }

    // command line options
    if (argc > 1)
    {
        // automatied build testing harness
        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
            getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // run D3D10/CUDA test
    runTest(argc, argv, ref_file);

    //
    // and exit
    //
    printf("%s running on %s exiting...\n", SDK_name, device_name);

    cudaDeviceReset();
    exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #9
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    char device_name[256];
    char *ref_file = NULL;

    pArgc = &argc;
    pArgv = argv;

    printf("> %s starting...\n", sSDKSample);

    if (!findCUDADevice())                   // Search for CUDA GPU
    {
        printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name);
        exit(EXIT_SUCCESS);
    }

    if (!dynlinkLoadD3D10API())                  // Search for D3D API (locate drivers, does not mean device is found)
    {
        printf("> D3D10 API libraries NOT found on.. Exiting.\n");
        dynlinkUnloadD3D10API();
        exit(EXIT_SUCCESS);
    }

    if (!findDXDevice(device_name))              // Search for D3D Hardware Device
    {
        printf("> D3D10 Graphics Device NOT found.. Exiting.\n");
        dynlinkUnloadD3D10API();
        exit(EXIT_SUCCESS);
    }

    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file);
        }
    }

    runTest(argc, argv, ref_file);

    //
    // and exit
    //
    printf("%s running on %s exiting...\n", sSDKSample, device_name);
    printf("%s sample finished returned: %s\n", sSDKSample, (g_bPassed ? "OK" : "ERROR!"));
    exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #10
0
int main(int argc, char *argv[])
{
    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printf("\n USAGE:");
        printf("\n   -pc-file='file path' to file containing point cloud list of points");
        printf("\n                        file should have format: x y z r g b");
        printf("\n                        where point coordinates xyz are floats");
        printf("\n                        and point color channels rgb are 8 bit integers (0-255)");
        printf("\n\n");
        return 0;
    }

    Glib::RefPtr<Gtk::Application> app = Gtk::Application::create( argc, argv, "jacko.pc_render" );
    PC_Render pc_render;
    return app->run(pc_render);
}
Example #11
0
////////////////////////////////////////////////////////////////////////////////
//! Check if the result is correct or write data to file for external
//! regression testing
////////////////////////////////////////////////////////////////////////////////
bool SaveResult(int argc, char **argv)
{
    // Map vertex buffer
    float *data;

    if (FAILED(g_pVB->Map(D3D10_MAP_READ, 0, (void **)&data))) //Lock(0, 0, (void**)&data, 0)))
        return false;

    // Unmap
    g_pVB->Unmap();

    // Save result
    if (checkCmdLineFlag(argc, (const char **) argv, "regression"))
    {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", data, sizeof(CUSTOMVERTEX), 0.0f, false);
    }

    return true;
}
Example #12
0
// Initialization code to find the best CUDA Device
inline int findCudaDevice(int argc, const char **argv)
{
    cudaDeviceProp deviceProp;
    int devID = 0;

    // If the command-line has a device number specified, use it
    if (checkCmdLineFlag(argc, argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, argv, "device=");

        if (devID < 0)
        {
            printf("Invalid command line parameter\n ");
            exit(EXIT_FAILURE);
        }
        else
        {
            devID = gpuDeviceInit(devID);

            if (devID < 0)
            {
                printf("exiting...\n");
                exit(EXIT_FAILURE);
            }
        }
    }
    else
    {
        // Otherwise pick the device with highest Gflops/s
        devID = gpuGetMaxGflopsDeviceId();
        checkCudaErrors(cudaSetDevice(devID));
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    }

    return devID;
}
Example #13
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    printf("%s Starting...\n\n", sSDKsample);

    numParticles = NUM_PARTICLES;
    maxNumParticles = MAX_NUM_PARTICLES;
    uint gridDim = GRID_SIZE;
    numIterations = 0;
    printf("Surely I can get this far\n");

    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **) argv, "n"))
        {
            numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n");
        }

        if (checkCmdLineFlag(argc, (const char **) argv, "grid"))
        {
            gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile);
            fpsLimit = frameCheckNumber;
            numIterations = 1;
        }
    }

    //*******************************************************
    // RMK Hard code for cylindrical coords (y=theta=1)
    // DomainSize
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/RVert.txt";
	char Zfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/ZVert.txt";
	char Rfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/RVert.txt";
	//char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/ZVert.txt";
	//char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/RVert.txt";
	numVelNodes.x = filecount(Rfile); //-1;
	numVelNodes.z = filecount(Zfile); //-1;
	numVelNodes.y = 1;
	numCells.x = 80; //47; //24; //29;
	numCells.y = 1;
	numCells.z = 160; //188; //95; //88;
	numParticles = numCells.x*numCells.z*20; //avgnumparticles = 40
	srand( time( NULL ) );
	//numParticles = numCells.x*numCells.z*40;
	printf("vel grid: %d x %d x %d = %d cells\n", numVelNodes.x, numVelNodes.y, numVelNodes.z, numVelNodes.x*numVelNodes.y*numVelNodes.z);
	printf("    grid: %d x %d x %d = %d cells\n", numCells.x, numCells.y, numCells.z, numCells.x*numCells.y*numCells.z);
	//printf("vel grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z);

    bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0;

    if (checkCmdLineFlag(argc, (const char **) argv, "i"))
    {
        numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i");
    }
    if (g_refFile)
    {
        cudaInit(argc, argv);
    }
    else
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            printf("[%s]\n", argv[0]);
            printf("   Does not explicitly support -device=n in OpenGL mode\n");
            printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
            printf(" > %s -device=n -file=<*.bin>\n", argv[0]);
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }

        initGL(&argc, argv);
        cudaGLInit(argc, argv);
    }

    // Moved code snippet to CellSystem
    //initCellSystem(gridSize);
    // now moved to particlesystem

    printf("Begin initialization\n");

    //initParticleSystem(numParticles, gridSize, g_refFile==NULL);
    initParticleSystem(maxNumParticles, numParticles, numVelNodes, numCells, g_refFile==NULL);
    //printf("Finished with initParticleSystem, %d\n",g_refFile==NULL);
    //cin.ignore();
    initParams();

    printf("Finished with initialization\n");

    if (!g_refFile)
    {
        initMenus();
    }

    if (benchmark || g_refFile)
    {
        if (numIterations <= 0)
        {
            numIterations = 300;
        }

        runBenchmark(numIterations, argv[0]);
    }
    else
    {
        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutKeyboardFunc(key);
        glutSpecialFunc(special);
        glutIdleFunc(idle);

        atexit(cleanup);

        glutMainLoop();
    }

    if (psystem)
    {
        delete psystem;
    }

    cudaDeviceReset();
    exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS);
}
bool
runTest(int argc, char **argv, ReduceType datatype)
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    if (checkCmdLineFlag(argc, (const char **) argv, "n"))
    {
        size = getCmdLineArgumentInt(argc, (const char **) argv, "n");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
    {
        maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "kernel"))
    {
        whichKernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
    {
        maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks");
    }

    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);

    cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");

    if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
    {
        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
    }

    bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo");

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);

        T *h_idata = (T *) malloc(bytes);

        for (int i=0; i<size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
            {
                h_idata[i] = (T)(rand() & 0xFF);
            }
            else
            {
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
            }
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

        if (numBlocks == 1)
        {
            cpuFinalThreshold = 1;
        }

        // allocate mem for the result on host side
        T *h_odata = (T *) malloc(numBlocks*sizeof(T));

        printf("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T *d_idata = NULL;
        T *d_odata = NULL;

        checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
        checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T)));

        // copy data directly to device memory
        checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice));

        // warm-up
        reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        int testIterations = 100;

        StopWatchInterface *timer = 0;
        sdkCreateTimer(&timer);

        T gpu_result = 0;

        gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction,
                                        cpuFinalThreshold, timer,
                                        h_odata, d_idata, d_odata);

        double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
        printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = reduceCPU<T>(h_idata, size);

        int precision = 0;
        double threshold = 0;
        double diff = 0;

        if (datatype == REDUCE_INT)
        {
            printf("\nGPU result = %d\n", (int)gpu_result);
            printf("CPU result = %d\n\n", (int)cpu_result);
        }
        else
        {
            if (datatype == REDUCE_FLOAT)
            {
                precision = 8;
                threshold = 1e-8 * size;
            }
            else
            {
                precision = 12;
                threshold = 1e-12 * size;
            }

            printf("\nGPU result = %.*f\n", precision, (double)gpu_result);
            printf("CPU result = %.*f\n\n", precision, (double)cpu_result);

            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        sdkDeleteTimer(&timer);
        free(h_idata);
        free(h_odata);

        checkCudaErrors(cudaFree(d_idata));
        checkCudaErrors(cudaFree(d_odata));

        if (datatype == REDUCE_INT)
        {
            return (gpu_result == cpu_result);
        }
        else
        {
            return (diff < threshold);
        }
    }

    return true;
}
int main(int argc, char **argv)
{
    char *dump_file = NULL;

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv,
                                 "file", (char **) &dump_file);

        int kernel = 1;

        if (checkCmdLineFlag(argc, (const char **)argv, "kernel"))
        {
            kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel");
        }

        runAutoTest(argc, argv, dump_file, kernel);
    }
    else
    {
        printf("[%s]\n", sSDKsample);

        // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            printf("[%s]\n", argv[0]);
            printf("   Does not explicitly support -device=n in OpenGL mode\n");
            printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
            printf(" > %s -device=n -qatest\n", argv[0]);
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }

        // First load the image, so we know what the size of the image (imageW and imageH)
        printf("Allocating host and CUDA memory and loading image file...\n");
        const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]);

        if (image_path == NULL)
        {
            printf("imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n");
            exit(EXIT_FAILURE);
        }

        LoadBMPFile(&h_Src, &imageW, &imageH, image_path);
        printf("Data init done.\n");

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(&argc, argv);
        cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId());

        checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH));

        initOpenGLBuffers();
    }

    printf("Starting GLUT main loop...\n");
    printf("Press [1] to view noisy image\n");
    printf("Press [2] to view image restored with knn filter\n");
    printf("Press [3] to view image restored with nlm filter\n");
    printf("Press [4] to view image restored with modified nlm filter\n");
    printf("Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is active\n");
    printf("Press [f] to print frame rate\n");
    printf("Press [?] to print Noise and Lerp Ct's\n");
    printf("Press [q] to exit\n");

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    glutMainLoop();
}
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    char *ref_file = NULL;

    printf("%s Starting...\n\n", sSDKsample);

    //start logs

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printHelp();
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        fpsLimit = frameCheckNumber;
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    if (ref_file)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            int device = findCudaDevice(argc, (const char **)argv);

            if (device < 0)
            {
                printf("No CUDA Capable devices found, exiting...\n");
                exit(EXIT_SUCCESS);
            }

            checkDeviceMeetComputeSpec(argc, argv);
        }
        else
        {
            int dev = findCapableDevice(argc, argv);

            if (dev != -1)
            {
                cudaSetDevice(dev);
            }
            else
            {
                cudaDeviceReset();
                exit(EXIT_SUCCESS);
            }
        }
    }
    else
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            printf("   This SDK does not explicitly support -device=n when running with OpenGL.\n");
            printf("   When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n");
            printf("   See details below to run without OpenGL:\n\n");
            printf(" > %s -device=n -file=output.bin\n\n", argv[0]);
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(&argc, argv);

        int dev = findCapableDevice(argc, argv);

        if (dev != -1)
        {
            cudaGLSetGLDevice(dev);
        }
        else
        {
            exit(EXIT_SUCCESS);
        }
    }

    // load volume data
    initData(argc, argv);

    printf(
        "Press \n"
        "  'SPACE'     to toggle animation\n"
        "  'p'         to toggle pre-integrated transfer function\n"
        "  '+' and '-' to change density (0.01 increments)\n"
        "  ']' and '[' to change brightness\n"
        "  ';' and ''' to modify transfer function offset\n"
        "  '.' and ',' to modify transfer function scale\n\n");

    if (ref_file)
    {
        runSingleTest(ref_file, argv[0]);
    }
    else
    {
        // This is the normal rendering path for VolumeRender
        glutDisplayFunc(display);
        glutKeyboardFunc(keyboard);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutReshapeFunc(reshape);
        glutIdleFunc(idle);

        initPixelBuffer();

        atexit(cleanup);

        glutMainLoop();
    }

    cudaDeviceReset();
}
void initData(int argc, char **argv)
{
    // parse arguments
    char *filename;

    if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename))
    {
        volumeFilename = filename;
    }

    int n;


    if (checkCmdLineFlag(argc, (const char **) argv, "size"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "size");
        volumeSize.width = volumeSize.height = volumeSize.depth = n;
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "xsize"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "xsize");
        volumeSize.width = n;
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "ysize"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "ysize");
        volumeSize.height = n;
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "zsize"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "zsize");
        volumeSize.depth = n;
    }

    char *path = sdkFindFilePath(volumeFilename, argv[0]);

    if (path == 0)
    {
        printf("Error finding file '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType);
    void *h_volume = loadRawFile(path, size);

    FilterKernel_init();
    Volume_init(&volumeOriginal,volumeSize, h_volume, 0);
    free(h_volume);
    Volume_init(&volumeFilter0, volumeSize, NULL, 1);
    Volume_init(&volumeFilter1, volumeSize, NULL, 1);
    VolumeRender_init();
    VolumeRender_setPreIntegrated(preIntegrated);
    VolumeRender_setVolume(&volumeOriginal);

    sdkCreateTimer(&timer);
    sdkCreateTimer(&animationTimer);
    sdkStartTimer(&animationTimer);

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));
}
Example #18
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
    char device_name[NAME_LEN];
    char *ref_file = NULL;

    pArgc = &argc;
    pArgv = argv;

    printf("[%s] - Starting...\n", SDK_name);

    if (!findGraphicsGPU(device_name))
    {
        printf("> %s not supported on \"%s\" exiting...\n", SDK_name, device_name);
        exit(EXIT_SUCCESS);
    }

    // command line options
    if (argc > 1)
    {
        // automatied build testing harness
        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
            getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }


    //
    // create window
    //
    // Register the window class
#if 1
    WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L,
                      GetModuleHandle(NULL), NULL, NULL, NULL, NULL,
                      "CUDA/D3D9 Texture InterOP", NULL
                    };
    RegisterClassEx(&wc);

    int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME);
    int yMenu = ::GetSystemMetrics(SM_CYMENU);
    int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME);

    // Create the application's window (padding by window border for uniform BB sizes across OSs)
    HWND hWnd = CreateWindow(wc.lpszClassName, "CUDA/D3D9 Texture InterOP",
                             WS_OVERLAPPEDWINDOW, 0, 0, g_WindowWidth + 2*xBorder, g_WindowHeight+ 2*yBorder+yMenu,
                             NULL, NULL, wc.hInstance, NULL);


#else
    static WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L, GetModuleHandle(NULL), NULL, NULL, NULL, NULL, "CudaD3D9Tex", NULL };
    RegisterClassEx(&wc);
    HWND hWnd = CreateWindow(
                    "CudaD3D9Tex", "CUDA D3D9 Texture Interop",
                    WS_OVERLAPPEDWINDOW,
                    0, 0, 800, 320,
                    GetDesktopWindow(),
                    NULL,
                    wc.hInstance,
                    NULL);
#endif

    ShowWindow(hWnd, SW_SHOWDEFAULT);
    UpdateWindow(hWnd);

    // Initialize Direct3D
    if (SUCCEEDED(InitD3D9(hWnd)) &&
        SUCCEEDED(InitCUDA()) &&
        SUCCEEDED(InitTextures()))
    {
        if (!g_bDeviceLost)
        {
            RegisterD3D9ResourceWithCUDA();
        }
    }

    //
    // the main loop
    //
    while (false == g_bDone)
    {
        RunCUDA();
        DrawScene();

        //
        // handle I/O
        //
        MSG msg;
        ZeroMemory(&msg, sizeof(msg));

        while (msg.message!=WM_QUIT)
        {
            if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE))
            {
                TranslateMessage(&msg);
                DispatchMessage(&msg);
            }
            else
            {
                RunCUDA();
                DrawScene();

                if (ref_file)
                {
                    for (int count=0; count<g_iFrameToCompare; count++)
                    {
                        RunCUDA();
                        DrawScene();
                    }

                    const char *cur_image_path = "simpleD3D9Texture.ppm";

                    // Save a reference of our current test run image
                    CheckRenderD3D9::BackbufferToPPM(g_pD3DDevice, cur_image_path);

                    // compare to offical reference image, printing PASS or FAIL.
                    g_bPassed = CheckRenderD3D9::PPMvsPPM(cur_image_path, ref_file, argv[0], MAX_EPSILON, 0.15f);

                    g_bDone = true;

                    Cleanup();
                    PostQuitMessage(0);
                }
            }
        }
    };

    // Unregister windows class
    UnregisterClass(wc.lpszClassName, wc.hInstance);

    //
    // and exit
    //
    printf("> %s running on %s exiting...\n", SDK_name, device_name);

    exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #19
0
//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    bool bTestResults = true;

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printf("\n> Command line options\n");
        showHelp();
        return 0;
    }

    printf("Run \"nbody -benchmark [-numbodies=<numBodies>]\" to measure perfomance.\n");
    showHelp();

    bFullscreen  = (checkCmdLineFlag(argc, (const char **) argv, "fullscreen") != 0);

    if (bFullscreen)
    {
        bShowSliders = false;
    }

    benchmark    = (checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0);

    compareToCPU = ((checkCmdLineFlag(argc, (const char **) argv, "compare") != 0) ||
                    (checkCmdLineFlag(argc, (const char **) argv, "qatest")  != 0));

    QATest       = (checkCmdLineFlag(argc, (const char **) argv, "qatest")  != 0);
    useHostMem   = (checkCmdLineFlag(argc, (const char **) argv, "hostmem") != 0);
    fp64         = (checkCmdLineFlag(argc, (const char **) argv, "fp64") != 0);

    flopsPerInteraction = fp64 ? 30 : 20;

    useCpu       = (checkCmdLineFlag(argc, (const char **) argv, "cpu") != 0);

    if (checkCmdLineFlag(argc, (const char **)argv, "numdevices"))
    {
        numDevsRequested = getCmdLineArgumentInt(argc, (const char **) argv, "numdevices");

        if (numDevsRequested < 1)
        {
            printf("Error: \"number of CUDA devices\" specified %d is invalid.  Value should be >= 1\n", numDevsRequested);
            exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
        }
        else
        {
            printf("number of CUDA devices  = %d\n", numDevsRequested);
        }
    }

    // for multi-device we currently require using host memory -- the devices share
    // data via the host
    if (numDevsRequested > 1)
    {
        useHostMem = true;
    }

    int numDevsAvailable = 0;
    bool customGPU = false;
    cudaGetDeviceCount(&numDevsAvailable);

    if (numDevsAvailable < numDevsRequested)
    {
        printf("Error: only %d Devices available, %d requested.  Exiting.\n", numDevsAvailable, numDevsRequested);
        exit(EXIT_SUCCESS);
    }

    printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed");
    printf("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video");
    printf("> %s precision floating point simulation\n", fp64 ? "Double" : "Single");
    printf("> %d Devices used for simulation\n", numDevsRequested);

    int devID;
    cudaDeviceProp props;

    if (useCpu)
    {
        useHostMem = true;
        compareToCPU = false;
        bSupportDouble = true;

#ifdef OPENMP
        printf("> Simulation with CPU using OpenMP\n");
#else
        printf("> Simulation with CPU\n");
#endif
    }

    // Initialize GL and GLUT if necessary
    if (!benchmark && !compareToCPU)
    {
        initGL(&argc, argv);
        initParameters();
    }

    
    if(!useCpu)
    {
        // Now choose the CUDA Device
        // Either without GL interop:
        if (benchmark || compareToCPU || useHostMem)
        {
            // Note if we are using host memory for the body system, we
            // don't use CUDA-GL interop.

            if (checkCmdLineFlag(argc, (const char **)argv, "device"))
            {
                customGPU = true;
            }

            devID = findCudaDevice(argc, (const char **)argv);
        }
        else   // or with GL interop:
        {
            if (checkCmdLineFlag(argc, (const char **)argv, "device"))
            {
                customGPU = true;
            }

            devID = findCudaGLDevice(argc, (const char **)argv);
        }

        checkCudaErrors(cudaGetDevice(&devID));
        checkCudaErrors(cudaGetDeviceProperties(&props, devID));

        bSupportDouble = true;

#if CUDART_VERSION < 4000

        if (numDevsRequested > 1)
        {
            printf("MultiGPU n-body requires CUDA 4.0 or later\n");
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }

#endif

        // Initialize devices
        if (numDevsRequested > 1 && customGPU)
        {
            printf("You can't use --numdevices and --device at the same time.\n");
            exit(EXIT_SUCCESS);
        }

        if (customGPU)
        {
            cudaDeviceProp props;
            checkCudaErrors(cudaGetDeviceProperties(&props, devID));
            printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);
        }
        else
        {
            for (int i = 0; i < numDevsRequested; i++)
            {
                cudaDeviceProp props;
                checkCudaErrors(cudaGetDeviceProperties(&props, i));

                printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);

                if (useHostMem)
                {
#if CUDART_VERSION >= 2020

                    if (!props.canMapHostMemory)
                    {
                        fprintf(stderr, "Device %d cannot map host memory!\n", devID);
                        cudaDeviceReset();
                        exit(EXIT_SUCCESS);
                    }

                    if (numDevsRequested > 1)
                    {
                        checkCudaErrors(cudaSetDevice(i));
                    }

                    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
                    fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n");
                    cudaDeviceReset();
                    exit(EXIT_SUCCESS);
#endif
                }
            }

            // CC 1.2 and earlier do not support double precision
            if (props.major*10 + props.minor <= 12)
            {
                bSupportDouble = false;
            }
        }

        //if(numDevsRequested > 1)
        //    checkCudaErrors(cudaSetDevice(devID));

        if (fp64 && !bSupportDouble)
        {
            fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n");
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }
    }

    numIterations = 0;
    p = 0;
    q = 1;

    if (checkCmdLineFlag(argc, (const char **)argv, "i"))
    {
        numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "p"))
    {
        p = getCmdLineArgumentInt(argc, (const char **)argv, "p");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "q"))
    {
        q = getCmdLineArgumentInt(argc, (const char **)argv, "q");
    }

    if (p == 0)   // p not set on command line
    {
        p = 256;

        if (q * p > 256)
        {
            p = 256 / q;
            printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256);
        }
    }

    // default number of bodies is #SMs * 4 * CTA size
    if (useCpu)
#ifdef OPENMP
        numBodies = 8192;

#else
        numBodies = 4096;
#endif
    else if (numDevsRequested == 1)
int
main(int argc, char *argv[])
{
    printf("%s Starting...\n\n", argv[0]);

    try
    {
        std::string sFilename;
        char *filePath;

        // set your own FreeImage error handler
        FreeImage_SetOutputMessage(FreeImageErrorHandler);

        cudaDeviceInit(argc, (const char **)argv);

        // Min spec is SM 1.0 devices
        if (printfNPPinfo(argc, argv, 1, 0) == false) 
        {
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "input"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath);
        }
        else
        {
            filePath = sdkFindFilePath("Lena.pgm", argv[0]);
        }

        if (filePath)
        {
            sFilename = filePath;
        }
        else
        {
            sFilename = "Lena.pgm";
        }

        // if we specify the filename at the command line, then we only test sFilename
        // otherwise we will check both sFilename[0,1]
        int file_errors = 0;
        std::ifstream infile(sFilename.data(), std::ifstream::in);

        if (infile.good())
        {
            std::cout << "freeImageInteropNPP opened: <" << sFilename.data() << "> successfully!" << std::endl;
            file_errors = 0;
            infile.close();
        }
        else
        {
            std::cout << "freeImageInteropNPP unable to open: <" << sFilename.data() << ">" << std::endl;
            file_errors++;
            infile.close();
        }

        if (file_errors > 0)
        {
            exit(EXIT_FAILURE);
        }

        std::string sResultFilename = sFilename;

        std::string::size_type dot = sResultFilename.rfind('.');

        if (dot != std::string::npos)
        {
            sResultFilename = sResultFilename.substr(0, dot);
        }

        sResultFilename += "_boxFilterFII.pgm";

        if (checkCmdLineFlag(argc, (const char **)argv, "output"))
        {
            char *outputFilePath;
            getCmdLineArgumentString(argc, (const char **)argv, "output", &outputFilePath);
            sResultFilename = outputFilePath;
        }

        FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(sFilename.c_str());

        // no signature? try to guess the file format from the file extension
        if (eFormat == FIF_UNKNOWN)
        {
            eFormat = FreeImage_GetFIFFromFilename(sFilename.c_str());
        }

        NPP_ASSERT(eFormat != FIF_UNKNOWN);
        // check that the plugin has reading capabilities ...
        FIBITMAP *pBitmap;

        if (FreeImage_FIFSupportsReading(eFormat))
        {
            pBitmap = FreeImage_Load(eFormat, sFilename.c_str());
        }

        NPP_ASSERT(pBitmap != 0);
        // Dump the bitmap information to the console
        std::cout << (*pBitmap) << std::endl;
        // make sure this is an 8-bit single channel image
        NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK);
        NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8);

        unsigned int nImageWidth  = FreeImage_GetWidth(pBitmap);
        unsigned int nImageHeight = FreeImage_GetHeight(pBitmap);
        unsigned int nSrcPitch    = FreeImage_GetPitch(pBitmap);
        unsigned char *pSrcData  = FreeImage_GetBits(pBitmap);

        int nSrcPitchCUDA;
        Npp8u *pSrcImageCUDA = nppiMalloc_8u_C1(nImageWidth, nImageHeight, &nSrcPitchCUDA);
        NPP_ASSERT_NOT_NULL(pSrcImageCUDA);
        // copy image loaded via FreeImage to into CUDA device memory, i.e.
        // transfer the image-data up to the GPU's video-memory
        NPP_CHECK_CUDA(cudaMemcpy2D(pSrcImageCUDA, nSrcPitchCUDA, pSrcData, nSrcPitch,
                                    nImageWidth, nImageHeight, cudaMemcpyHostToDevice));

        // define size of the box filter
        const NppiSize  oMaskSize   = {7, 7};
        const NppiPoint oMaskAchnor = {0, 0};
        // compute maximal result image size
        const NppiSize  oSizeROI = {(int)nImageWidth  - (oMaskSize.width - 1),
                                    (int)nImageHeight - (oMaskSize.height - 1)
                                   };
        // allocate result image memory
        int nDstPitchCUDA;
        Npp8u *pDstImageCUDA = nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA);
        NPP_ASSERT_NOT_NULL(pDstImageCUDA);
        NPP_CHECK_NPP(nppiFilterBox_8u_C1R(pSrcImageCUDA, nSrcPitchCUDA, pDstImageCUDA, nDstPitchCUDA,
                                           oSizeROI, oMaskSize, oMaskAchnor));
        // create the result image storage using FreeImage so we can easily
        // save
        FIBITMAP *pResultBitmap = FreeImage_Allocate(oSizeROI.width, oSizeROI.height, 8 /* bits per pixel */);
        NPP_ASSERT_NOT_NULL(pResultBitmap);
        unsigned int nResultPitch   = FreeImage_GetPitch(pResultBitmap);
        unsigned char *pResultData = FreeImage_GetBits(pResultBitmap);

        NPP_CHECK_CUDA(cudaMemcpy2D(pResultData, nResultPitch, pDstImageCUDA, nDstPitchCUDA,
                                    oSizeROI.width, oSizeROI.height, cudaMemcpyDeviceToHost));
        // now save the result image
        bool bSuccess;
        bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, sResultFilename.c_str(), 0) == TRUE;
        NPP_ASSERT_MSG(bSuccess, "Failed to save result image.");

        //free nppiImage
        nppiFree(pSrcImageCUDA);
        nppiFree(pDstImageCUDA);

        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }
    catch (npp::Exception &rException)
    {
        std::cerr << "Program error! The following exception occurred: \n";
        std::cerr << rException << std::endl;
        std::cerr << "Aborting." << std::endl;
        exit(EXIT_FAILURE);
    }
    catch (...)
    {
        std::cerr << "Program error! An unknow type of exception occurred. \n";
        std::cerr << "Aborting." << std::endl;
        exit(EXIT_FAILURE);
    }

    exit(EXIT_SUCCESS);
}
Example #21
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // start logs
    int devID;
    char *ref_file = NULL;
    printf("%s Starting...\n\n", argv[0]);

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "radius"))
        {
            filter_radius = getCmdLineArgumentInt(argc, (const char **) argv, "radius");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "passes"))
        {
            iterations = getCmdLineArgumentInt(argc, (const char **)argv, "passes");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file);
        }
    }

    // load image to process
    loadImageData(argc, argv);

    if (checkCmdLineFlag(argc, (const char **)argv, "benchmark"))
    {
        // This is a separate mode of the sample, where we are benchmark the kernels for performance
        devID = findCudaDevice(argc, (const char **)argv);

        // Running CUDA kernels (bilateralfilter) in Benchmarking mode
        g_TotalErrors += runBenchmark(argc, argv);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
    else if (checkCmdLineFlag(argc, (const char **)argv, "radius") ||
             checkCmdLineFlag(argc, (const char **)argv, "passes"))
    {
        // This overrides the default mode.  Users can specify the radius used by the filter kernel
        devID = findCudaDevice(argc, (const char **)argv);
        g_TotalErrors += runSingleTest(ref_file, argv[0]);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
    else
    {
        // Default mode running with OpenGL visualization and in automatic mode
        // the output automatically changes animation
        printf("\n");

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(argc, (char **)argv);
        int dev = findCapableDevice(argc, argv);

        if (dev != -1)
        {
            dev = gpuGLDeviceInit(argc, (const char **)argv);

            if (dev == -1)
            {
                exit(EXIT_FAILURE);
            }
        }
        else
        {
            // cudaDeviceReset causes the driver to clean up all state. While
            // not mandatory in normal operation, it is good practice.  It is also
            // needed to ensure correct operation when the application is being
            // profiled. Calling cudaDeviceReset causes all profile data to be
            // flushed before the application exits
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }

        // Now we can create a CUDA context and bind it to the OpenGL context
        initCuda();
        initGLResources();

        // sets the callback function so it will call cleanup upon exit
#if defined (__APPLE__) || defined(MACOSX)
        atexit(cleanup);
#else
        glutCloseFunc(cleanup);
#endif

        printf("Running Standard Demonstration with GLUT loop...\n\n");
        printf("Press '+' and '-' to change filter width\n"
               "Press ']' and '[' to change number of iterations\n"
               "Press 'e' and 'E' to change Euclidean delta\n"
               "Press 'g' and 'G' to changle Gaussian delta\n"
               "Press 'a' or  'A' to change Animation mode ON/OFF\n\n");

        // Main OpenGL loop that will run visualization for every vsync
        glutMainLoop();
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;
    char *ref_file = NULL;

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    printf("%s Starting...\n\n", sSDKsample);

    printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n");

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
            fpsLimit = frameCheckNumber;
        }
    }

    // Get the path of the filename
    char *filename;

    if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename))
    {
        image_filename = filename;
    }

    // load image
    char *image_path = sdkFindFilePath(image_filename, argv[0]);

    if (image_path == NULL)
    {
        fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height);

    if (!h_img)
    {
        printf("Error unable to load PPM file: '%s'\n", image_path);
        exit(EXIT_FAILURE);
    }

    printf("Loaded '%s', %d x %d pixels\n", image_path, width, height);

    if (checkCmdLineFlag(argc, (const char **)argv, "threads"))
    {
        nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "sigma"))
    {
        sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma");
    }

    runBenchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark");

    int device;
    struct cudaDeviceProp prop;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop, device);

    if (!strncmp("Tesla", prop.name, 5))
    {
        printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n");
        //        runBenchmark = true;
        runBenchmark = true;
    }

    // Benchmark or AutoTest mode detected, no OpenGL
    if (runBenchmark == true || ref_file != NULL)
    {
        findCudaDevice(argc, (const char **)argv);
    }
    else
    {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(&argc, argv);
        findCudaGLDevice(argc, (const char **)argv);
    }

    initCudaBuffers();

    if (ref_file)
    {
        printf("(Automated Testing)\n");
        bool testPassed = runSingleTest(ref_file, argv[0]);

        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();

        exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE);
    }

    if (runBenchmark)
    {
        printf("(Run Benchmark)\n");
        benchmark(100);

        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();

        exit(EXIT_SUCCESS);
    }

    initGLBuffers();
    glutMainLoop();

    exit(EXIT_SUCCESS);
}
Example #23
0
/* Solve Ax=b using the conjugate gradient method a) without any preconditioning, b) using an Incomplete Cholesky preconditioner and c) using an ILU0 preconditioner. */
int main(int argc, char **argv)
{
    const int max_iter = 1000;
    int k, M = 0, N = 0, nz = 0, *I = NULL, *J = NULL;
    int *d_col, *d_row;
    int qatest = 0;
    const float tol = 1e-12f;
    float *x, *rhs;
    float r0, r1, alpha, beta;
    float *d_val, *d_x;
    float *d_zm1, *d_zm2, *d_rm2;
    float *d_r, *d_p, *d_omega, *d_y;
    float *val = NULL;
    float *d_valsILU0;
    float *valsILU0;
    float rsum, diff, err = 0.0;
    float qaerr1, qaerr2 = 0.0;
    float dot, numerator, denominator, nalpha;
    const float floatone = 1.0;
    const float floatzero = 0.0;

    int nErrors = 0;

    printf("conjugateGradientPrecond starting...\n");

    /* QA testing mode */
    if (checkCmdLineFlag(argc, (const char **)argv, "qatest"))
    {
        qatest = 1;
    }

    /* This will pick the best possible CUDA capable device */
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);
    printf("GPU selected Device ID = %d \n", devID);

    if (devID < 0)
    {
        printf("Invalid GPU device %d selected,  exiting...\n", devID);
        exit(EXIT_SUCCESS);
    }

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    /* Statistics about the GPU device */
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    int version = (deviceProp.major * 0x10 + deviceProp.minor);

    if (version < 0x11)
    {
        printf("%s: requires a minimum CUDA compute 1.1 capability\n", sSDKname);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    /* Generate a random tridiagonal symmetric matrix in CSR (Compressed Sparse Row) format */
    M = N = 16384;
    nz = 5*N-4*(int)sqrt((double)N);
    I = (int *)malloc(sizeof(int)*(N+1));                              // csr row pointers for matrix A
    J = (int *)malloc(sizeof(int)*nz);                                 // csr column indices for matrix A
    val = (float *)malloc(sizeof(float)*nz);                           // csr values for matrix A
    x = (float *)malloc(sizeof(float)*N);
    rhs = (float *)malloc(sizeof(float)*N);

    for (int i = 0; i < N; i++)
    {
        rhs[i] = 0.0;                                                  // Initialize RHS
        x[i] = 0.0;                                                    // Initial approximation of solution
    }

    genLaplace(I, J, val, M, N, nz, rhs);

    /* Create CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);

    checkCudaErrors(cublasStatus);

    /* Create CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);

    checkCudaErrors(cusparseStatus);

    /* Description of the A matrix*/
    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);

    checkCudaErrors(cusparseStatus);

    /* Define the properties of the matrix */
    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

    /* Allocate required memory */
    checkCudaErrors(cudaMalloc((void **)&d_col, nz*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_row, (N+1)*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, nz*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_x, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_y, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_r, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_p, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_omega, N*sizeof(float)));

    cudaMemcpy(d_col, J, nz*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_row, I, (N+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, val, nz*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, rhs, N*sizeof(float), cudaMemcpyHostToDevice);

    /* Conjugate gradient without preconditioning.
       ------------------------------------------
       Follows the description by Golub & Van Loan, "Matrix Computations 3rd ed.", Section 10.2.6  */

    printf("Convergence of conjugate gradient without preconditioning: \n");
    k = 0;
    r0 = 0;
    cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);

    while (r1 > tol*tol && k <= max_iter)
    {
        k++;

        if (k == 1)
        {
            cublasScopy(cublasHandle, N, d_r, 1, d_p, 1);
        }
        else
        {
            beta = r1/r0;
            cublasSscal(cublasHandle, N, &beta, d_p, 1);
            cublasSaxpy(cublasHandle, N, &floatone, d_r, 1, d_p, 1) ;
        }

        cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &floatone, descr, d_val, d_row, d_col, d_p, &floatzero, d_omega);
        cublasSdot(cublasHandle, N, d_p, 1, d_omega, 1, &dot);
        alpha = r1/dot;
        cublasSaxpy(cublasHandle, N, &alpha, d_p, 1, d_x, 1);
        nalpha = -alpha;
        cublasSaxpy(cublasHandle, N, &nalpha, d_omega, 1, d_r, 1);
        r0 = r1;
        cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);
    }

    printf("  iteration = %3d, residual = %e \n", k, sqrt(r1));

    cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);

    /* check result */
    err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++)
        {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    printf("  Convergence Test: %s \n", (k <= max_iter) ? "OK" : "FAIL");
    nErrors += (k > max_iter) ? 1 : 0;
    qaerr1 = err;

    if (0)
    {
        // output result in matlab-style array
        int n=(int)sqrt((double)N);
        printf("a = [  ");

        for (int iy=0; iy<n; iy++)
        {
            for (int ix=0; ix<n; ix++)
            {
                printf(" %f ", x[iy*n+ix]);
            }

            if (iy == n-1)
            {
                printf(" ]");
            }

            printf("\n");
        }
    }


    /* Preconditioned Conjugate Gradient using ILU.
       --------------------------------------------
       Follows the description by Golub & Van Loan, "Matrix Computations 3rd ed.", Algorithm 10.3.1  */

    printf("\nConvergence of conjugate gradient using incomplete LU preconditioning: \n");

    int nzILU0 = 2*N-1;
    valsILU0 = (float *) malloc(nz*sizeof(float));

    checkCudaErrors(cudaMalloc((void **)&d_valsILU0, nz*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_zm1, (N)*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_zm2, (N)*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_rm2, (N)*sizeof(float)));

    /* create the analysis info object for the A matrix */
    cusparseSolveAnalysisInfo_t infoA = 0;
    cusparseStatus = cusparseCreateSolveAnalysisInfo(&infoA);

    checkCudaErrors(cusparseStatus);

    /* Perform the analysis for the Non-Transpose case */
    cusparseStatus = cusparseScsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                             N, nz, descr, d_val, d_row, d_col, infoA);

    checkCudaErrors(cusparseStatus);

    /* Copy A data to ILU0 vals as input*/
    cudaMemcpy(d_valsILU0, d_val, nz*sizeof(float), cudaMemcpyDeviceToDevice);

    /* generate the Incomplete LU factor H for the matrix A using cudsparseScsrilu0 */
    cusparseStatus = cusparseScsrilu0(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descr, d_valsILU0, d_row, d_col, infoA);

    checkCudaErrors(cusparseStatus);

    /* Create info objects for the ILU0 preconditioner */
    cusparseSolveAnalysisInfo_t info_u;
    cusparseCreateSolveAnalysisInfo(&info_u);

    cusparseMatDescr_t descrL = 0;
    cusparseStatus = cusparseCreateMatDescr(&descrL);
    cusparseSetMatType(descrL,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrL,CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_UNIT);

    cusparseMatDescr_t descrU = 0;
    cusparseStatus = cusparseCreateMatDescr(&descrU);
    cusparseSetMatType(descrU,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrU,CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descrU, CUSPARSE_FILL_MODE_UPPER);
    cusparseSetMatDiagType(descrU, CUSPARSE_DIAG_TYPE_NON_UNIT);
    cusparseStatus = cusparseScsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nz, descrU, d_val, d_row, d_col, info_u);

    /* reset the initial guess of the solution to zero */
    for (int i = 0; i < N; i++)
    {
        x[i] = 0.0;
    }

    checkCudaErrors(cudaMemcpy(d_r, rhs, N*sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice));

    k = 0;
    cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);

    while (r1 > tol*tol && k <= max_iter)
    {
        // Forward Solve, we can re-use infoA since the sparsity pattern of A matches that of L
        cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, &floatone, descrL,
                                              d_valsILU0, d_row, d_col, infoA, d_r, d_y);
        checkCudaErrors(cusparseStatus);

        // Back Substitution
        cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, &floatone, descrU,
                                              d_valsILU0, d_row, d_col, info_u, d_y, d_zm1);
        checkCudaErrors(cusparseStatus);

        k++;

        if (k == 1)
        {
            cublasScopy(cublasHandle, N, d_zm1, 1, d_p, 1);
        }
        else
        {
            cublasSdot(cublasHandle, N, d_r, 1, d_zm1, 1, &numerator);
            cublasSdot(cublasHandle, N, d_rm2, 1, d_zm2, 1, &denominator);
            beta = numerator/denominator;
            cublasSscal(cublasHandle, N, &beta, d_p, 1);
            cublasSaxpy(cublasHandle, N, &floatone, d_zm1, 1, d_p, 1) ;
        }

        cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nzILU0, &floatone, descrU, d_val, d_row, d_col, d_p, &floatzero, d_omega);
        cublasSdot(cublasHandle, N, d_r, 1, d_zm1, 1, &numerator);
        cublasSdot(cublasHandle, N, d_p, 1, d_omega, 1, &denominator);
        alpha = numerator / denominator;
        cublasSaxpy(cublasHandle, N, &alpha, d_p, 1, d_x, 1);
        cublasScopy(cublasHandle, N, d_r, 1, d_rm2, 1);
        cublasScopy(cublasHandle, N, d_zm1, 1, d_zm2, 1);
        nalpha = -alpha;
        cublasSaxpy(cublasHandle, N, &nalpha, d_omega, 1, d_r, 1);
        cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);
    }

    printf("  iteration = %3d, residual = %e \n", k, sqrt(r1));

    cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);

    /* check result */
    err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++)
        {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    printf("  Convergence Test: %s \n", (k <= max_iter) ? "OK" : "FAIL");
    nErrors += (k > max_iter) ? 1 : 0;
    qaerr2 = err;

    /* Destroy parameters */
    cusparseDestroySolveAnalysisInfo(infoA);
    cusparseDestroySolveAnalysisInfo(info_u);

    /* Destroy contexts */
    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    /* Free device memory */
    free(I);
    free(J);
    free(val);
    free(x);
    free(rhs);
    free(valsILU0);
    cudaFree(d_col);
    cudaFree(d_row);
    cudaFree(d_val);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_r);
    cudaFree(d_p);
    cudaFree(d_omega);
    cudaFree(d_valsILU0);
    cudaFree(d_zm1);
    cudaFree(d_zm2);
    cudaFree(d_rm2);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    printf("  Test Summary:\n");
    printf("     Counted total of %d errors\n", nErrors);
    printf("     qaerr1 = %f qaerr2 = %f\n\n", fabs(qaerr1), fabs(qaerr2));
    exit((nErrors == 0 &&fabs(qaerr1)<1e-5 && fabs(qaerr2) < 1e-5 ? EXIT_SUCCESS : EXIT_FAILURE));
}
Example #24
0
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // Start logs
    shrQAStart(argc, argv);

    // initialize the GPU, either identified by --device
    // or by picking the device with highest flop rate.
    int devID = findCudaDevice(argc, (const char **)argv);

    // parsing the number of random numbers to generate
    int rand_n = DEFAULT_RAND_N;
    if( checkCmdLineFlag(argc, (const char**) argv, "count") )  
    {       
        rand_n = getCmdLineArgumentInt(argc, (const char**) argv, "count"); 
    }
    printf("Allocating data for %i samples...\n", rand_n);
     
    // parsing the seed
    int seed = DEFAULT_SEED;
    if( checkCmdLineFlag(argc, (const char**) argv, "seed") ) 
    {       
        seed = getCmdLineArgumentInt(argc, (const char**) argv, "seed"); 
    }
    printf("Seeding with %i ...\n", seed);
    

    float *d_Rand; 
    checkCudaErrors( cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)) );
    
    curandGenerator_t prngGPU;
    checkCurandErrors( curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32) ); 
    checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngGPU, seed) );

    curandGenerator_t prngCPU;
    checkCurandErrors( curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32) ); 
    checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngCPU, seed) );

    //
    // Example 1: Compare random numbers generated on GPU and CPU
    float *h_RandGPU  = (float *)malloc(rand_n * sizeof(float));

    printf("Generating random numbers on GPU...\n\n");
    checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) );

    printf("\nReading back the results...\n");
    checkCudaErrors( cudaMemcpy(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost) );

    
    float *h_RandCPU  = (float *)malloc(rand_n * sizeof(float));
     
    printf("Generating random numbers on CPU...\n\n");
    checkCurandErrors( curandGenerateUniform(prngCPU, (float*) h_RandCPU, rand_n) ); 
 
    printf("Comparing CPU/GPU random numbers...\n\n");
    float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); 
    
    //
    // Example 2: Timing of random number generation on GPU
    const int numIterations = 10;
    int i;
    StopWatchInterface *hTimer;

    checkCudaErrors( cudaDeviceSynchronize() );
    sdkCreateTimer(&hTimer);
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (i = 0; i < numIterations; i++)
    {
        checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) );
    }

    checkCudaErrors( cudaDeviceSynchronize() );
    sdkStopTimer(&hTimer);

    double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations;

    printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", 
               1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); 

    printf("Shutting down...\n");

    checkCurandErrors( curandDestroyGenerator(prngGPU) );
    checkCurandErrors( curandDestroyGenerator(prngCPU) );
    checkCudaErrors( cudaFree(d_Rand) );
    sdkDeleteTimer( &hTimer);
    free(h_RandGPU);
    free(h_RandCPU);

    cudaDeviceReset();	
    shrQAFinishExit(argc, (const char**)argv, (L1norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
void initialize(int argc, char **argv)
{
    printf("[%s] (OpenGL Mode)\n", sSDKsample);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    initGL(&argc, argv);

    int devID;
    cudaDeviceProp deviceProps;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = gpuGLDeviceInit(argc, (const char **)argv);

        if (devID < 0)
        {
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }
    }
    else
    {
        devID = gpuGetMaxGflopsDeviceId();
        cudaGLSetGLDevice(devID);
    }

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);

    // Create the timer (for fps measurement)
    sdkCreateTimer(&timer);

    // load image from disk
    loadImageData(argc, argv);

    printf("\n"
           "\tControls\n"
           "\t=/- : Zoom in/out\n"
           "\tb   : Run Benchmark g_FilterMode\n"
           "\tc   : Draw Bicubic Spline Curve\n"
           "\t[esc] - Quit\n\n"

           "\tPress number keys to change filtering g_FilterMode:\n\n"
           "\t1 : nearest filtering\n"
           "\t2 : bilinear filtering\n"
           "\t3 : bicubic filtering\n"
           "\t4 : fast bicubic filtering\n"
           "\t5 : Catmull-Rom filtering\n\n"
          );

    initGLBuffers();

#if USE_BUFFER_TEX
    fprog = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shaderCode);

    if (!fprog)
    {
        exit(EXIT_SUCCESS);
    }

#endif
}
Example #26
0
int main(int argc, char **argv)
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv))
    {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
#ifndef OPTIMUS
    devID = findCudaGLDevice(argc, (const char **)argv);
#else
    devID = gpuGetMaxGflopsDeviceId();
#endif

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array in host memory
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);

#ifdef BROADCAST
	int step = 1;

	// Broadcasted visualization stepping.
	if (argc > 3)
		step = atoi(argv[3]);

	// Create additional space to store particle packets
	// for broadcasting.
	wstep = step; hstep = step;
	int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize;
	if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize)
		npackets++;
	packets = (char*)malloc(npackets *
		(UdpBroadcastServer::PacketSize + sizeof(unsigned int)));
#endif

    initParticles(particles, DIM, DIM);

#if defined(OPTIMUS) || defined(BROADCAST)
    // Create particle array in device memory
    cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS);
    cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice);
#endif

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);

    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

#ifndef OPTIMUS
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");
#endif

    if (ref_file)
    {
        autoTest(argv);
        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);

    }
    else
    {
#ifdef BROADCAST
		const char *sv_addr = "127.0.0:9097";
		const char *bc_addr = "127.255.255.2:9097";

		// Server address
		if (argc > 2)
			sv_addr = argv[2];

		// Broadcast address
		if (argc > 1)
			bc_addr = argv[1];

		server.reset(new UdpBroadcastServer(sv_addr, bc_addr));

		// Listen to clients' feedbacks in a separate thread.
		{
			pthread_t tid;
			pthread_create(&tid, NULL, &feedback_listener, &step);
		}

		// Broadcast the particles state in a separate thread.
		{
			pthread_t tid;
			pthread_create(&tid, NULL, &broadcaster, &step);
		}
#endif
#if defined (__APPLE__) || defined(MACOSX)
        atexit(cleanup);
#else
        glutCloseFunc(cleanup);
#endif
        glutMainLoop();
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (!ref_file)
    {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
Example #27
0
int main(int argc, char **argv)
{
    char *multiMethodChoice = NULL;
    char *scalingChoice = NULL;
    bool use_threads = true;
    bool bqatest = false;
    bool strongScaling = false;

    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);

    if (checkCmdLineFlag(argc, (const char **)argv, "qatest"))
    {
        bqatest = true;
    }

    getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice);
    getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice);

    if (checkCmdLineFlag(argc, (const char **)argv, "h") ||
        checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        usage();
        exit(EXIT_SUCCESS);
    }

    if (multiMethodChoice == NULL)
    {
        use_threads = true;
    }
    else
    {
        if (!strcasecmp(multiMethodChoice, "threaded"))
        {
            use_threads = true;
        }
        else
        {
            use_threads = false;
        }
    }

    if (use_threads == false)
    {
        printf("Using single CPU thread for multiple GPUs\n");
    }

    if (scalingChoice == NULL)
    {
        strongScaling = false;
    }
    else
    {
        if (!strcasecmp(scalingChoice, "strong"))
        {
            strongScaling = true;
        }
        else
        {
            strongScaling = false;
        }
    }


    //GPU number present in the system
    int GPU_N;
    checkCudaErrors(cudaGetDeviceCount(&GPU_N));
    int nOptions = 256;

    nOptions = adjustProblemSize(GPU_N, nOptions);

    // select problem size
    int scale = (strongScaling) ? 1 : GPU_N;
    int OPT_N = nOptions * scale;
    int PATH_N = 262144;
    const unsigned long long SEED = 777;

    // initialize the timers
    hTimer = new StopWatchInterface*[GPU_N];

    for (int i=0; i<GPU_N; i++)
    {
        sdkCreateTimer(&hTimer[i]);
        sdkResetTimer(&hTimer[i]);
    }

    //Input data array
    TOptionData  *optionData   = new TOptionData[OPT_N];
    //Final GPU MC results
    TOptionValue *callValueGPU = new TOptionValue[OPT_N];
    //"Theoretical" call values by Black-Scholes formula
    float *callValueBS = new float[OPT_N];
    //Solver config
    TOptionPlan *optionSolver = new TOptionPlan[GPU_N];
    //OS thread ID
    CUTThread *threadID = new CUTThread[GPU_N];

    int gpuBase, gpuIndex;
    int i;

    float time;

    double delta, ref, sumDelta, sumRef, sumReserve;

    printf("MonteCarloMultiGPU\n");
    printf("==================\n");
    printf("Parallelization method  = %s\n", use_threads ? "threaded" : "streamed");
    printf("Problem scaling         = %s\n", strongScaling? "strong" : "weak");
    printf("Number of GPUs          = %d\n", GPU_N);
    printf("Total number of options = %d\n", OPT_N);
    printf("Number of paths         = %d\n", PATH_N);


    printf("main(): generating input data...\n");
    srand(123);

    for (i=0; i < OPT_N; i++)
    {
        optionData[i].S = randFloat(5.0f, 50.0f);
        optionData[i].X = randFloat(10.0f, 25.0f);
        optionData[i].T = randFloat(1.0f, 5.0f);
        optionData[i].R = 0.06f;
        optionData[i].V = 0.10f;
        callValueGPU[i].Expected   = -1.0f;
        callValueGPU[i].Confidence = -1.0f;
    }

    printf("main(): starting %i host threads...\n", GPU_N);


    //Get option count for each GPU
    for (i = 0; i < GPU_N; i++)
    {
        optionSolver[i].optionCount = OPT_N / GPU_N;
    }

    //Take into account cases with "odd" option counts
    for (i = 0; i < (OPT_N % GPU_N); i++)
    {
        optionSolver[i].optionCount++;
    }

    //Assign GPU option ranges
    gpuBase = 0;

    for (i = 0; i < GPU_N; i++)
    {
        optionSolver[i].device     = i;
        optionSolver[i].optionData = optionData   + gpuBase;
        optionSolver[i].callValue  = callValueGPU + gpuBase;
        // all devices use the same global seed, but start
        // the sequence at a different offset
        optionSolver[i].seed       = SEED;
        optionSolver[i].pathN      = PATH_N;
        gpuBase += optionSolver[i].optionCount;
    }


    if (use_threads || bqatest)
    {
        //Start CPU thread for each GPU
        for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
        {
            threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);
        }

        printf("main(): waiting for GPU results...\n");
        cutWaitForThreads(threadID, GPU_N);

        printf("main(): GPU statistics, threaded\n");

        for (i = 0; i < GPU_N; i++)
        {
            cudaDeviceProp deviceProp;
            checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
            printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
            printf("Options         : %i\n", optionSolver[i].optionCount);
            printf("Simulation paths: %i\n", optionSolver[i].pathN);
            time = sdkGetTimerValue(&hTimer[i]);
            printf("Total time (ms.): %f\n", time);
            printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
        }

        printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
        sumDelta   = 0;
        sumRef     = 0;
        sumReserve = 0;

        for (i = 0; i < OPT_N; i++)
        {
            BlackScholesCall(callValueBS[i], optionData[i]);
            delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
            ref       = callValueBS[i];
            sumDelta += delta;
            sumRef   += fabs(ref);

            if (delta > 1e-6)
            {
                sumReserve += callValueGPU[i].Confidence / delta;
            }

#ifdef PRINT_RESULTS
            printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif

        }

        sumReserve /= OPT_N;
    }

    if (!use_threads || bqatest)
    {
        multiSolver(optionSolver, GPU_N);

        printf("main(): GPU statistics, streamed\n");

        for (i = 0; i < GPU_N; i++)
        {
            cudaDeviceProp deviceProp;
            checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
            printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
            printf("Options         : %i\n", optionSolver[i].optionCount);
            printf("Simulation paths: %i\n", optionSolver[i].pathN);
        }

        time = sdkGetTimerValue(&hTimer[0]);
        printf("\nTotal time (ms.): %f\n", time);
        printf("\tNote: This is elapsed time for all to compute.\n");
        printf("Options per sec.: %f\n", OPT_N / (time * 0.001));

        printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
        sumDelta   = 0;
        sumRef     = 0;
        sumReserve = 0;

        for (i = 0; i < OPT_N; i++)
        {
            BlackScholesCall(callValueBS[i], optionData[i]);
            delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
            ref       = callValueBS[i];
            sumDelta += delta;
            sumRef   += fabs(ref);

            if (delta > 1e-6)
            {
                sumReserve += callValueGPU[i].Confidence / delta;
            }

#ifdef PRINT_RESULTS
            printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
        }

        sumReserve /= OPT_N;
    }

#ifdef DO_CPU
    printf("main(): running CPU MonteCarlo...\n");
    TOptionValue callValueCPU;
    sumDelta = 0;
    sumRef   = 0;

    for (i = 0; i < OPT_N; i++)
    {
        MonteCarloCPU(
            callValueCPU,
            optionData[i],
            NULL,
            PATH_N
        );
        delta     = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
        ref       = callValueCPU.Expected;
        sumDelta += delta;
        sumRef   += fabs(ref);
        printf("Exp : %f | %f\t", callValueCPU.Expected,   callValueGPU[i].Expected);
        printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
    }

    printf("L1 norm: %E\n", sumDelta / sumRef);
#endif

    printf("Shutting down...\n");

    for (int i=0; i<GPU_N; i++)
    {
        sdkStartTimer(&hTimer[i]);
        checkCudaErrors(cudaSetDevice(i));

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
    }

    delete[] optionSolver;
    delete[] callValueBS;
    delete[] callValueGPU;
    delete[] optionData;
    delete[] threadID;
    delete[] hTimer;

    printf("Test Summary...\n");
    printf("L1 norm        : %E\n", sumDelta / sumRef);
    printf("Average reserve: %f\n", sumReserve);
    printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n");
    exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE);
}
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
    // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
    cudaError_t error;
    devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        error = cudaSetDevice(devID);

        if (error != cudaSuccess)
        {
            printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    }

    // get number of SMs on this GPU
    error = cudaGetDevice(&devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }


    if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
    {
        iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
    }

    iSizeMultiple = min(iSizeMultiple, 10);
    iSizeMultiple = max(iSizeMultiple, 1);

    cudaDeviceProp deviceProp;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    matrix_size.uiWA = 2 * block_size * iSizeMultiple;
    matrix_size.uiHA = 4 * block_size * iSizeMultiple;
    matrix_size.uiWB = 2 * block_size * iSizeMultiple;
    matrix_size.uiHB = 4 * block_size * iSizeMultiple;
    matrix_size.uiWC = 2 * block_size * iSizeMultiple;
    matrix_size.uiHC = 4 * block_size * iSizeMultiple;

    printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
           matrix_size.uiWA, matrix_size.uiHA,
           matrix_size.uiWB, matrix_size.uiHB,
           matrix_size.uiWC, matrix_size.uiHC);
}
Example #29
0
int main(int argc, char **argv)
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv))
    {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    devID = findCudaGLDevice(argc, (const char **)argv);

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);

    initParticles(particles, DIM, DIM);

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);

    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");

    if (ref_file)
    {
        autoTest(argv);
        cleanup();
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);

    }
    else
    {
        atexit(cleanup);
        glutMainLoop();
    }

    cudaDeviceReset();

    if (!ref_file)
    {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");

    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

#if defined(__linux__)
    setenv ("DISPLAY", ":0", 0);
#endif

    printf("%s Starting...\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printf("\nUsage: SobelFilter <options>\n");
        printf("\t\t-mode=n (0=original, 1=texture, 2=smem + texture)\n");
        printf("\t\t-file=ref_orig.pgm (ref_tex.pgm, ref_shared.pgm)\n\n");
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        g_bQAReadback = true;
        runAutoTest(argc, argv);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        printf("   This SDK does not explicitly support -device=n when running with OpenGL.\n");
        printf("   When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n");
        printf("   See details below to run without OpenGL:\n\n");
        printf(" > %s -device=n\n\n", argv[0]);
        printf("exiting...\n");
        exit(EXIT_SUCCESS);
    }

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    initGL(&argc, argv);
    cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId());

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);
    glutReshapeFunc(reshape);

    loadDefaultImage(argv[0]);

    // If code is not printing the USage, then we execute this path.
    printf("I: display Image (no filtering)\n");
    printf("T: display Sobel Edge Detection (Using Texture)\n");
    printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n");
    printf("Use the '-' and '=' keys to change the brightness.\n");
    fflush(stdout);

#if defined (__APPLE__) || defined(MACOSX)
    atexit(cleanup);
#else
    glutCloseFunc(cleanup);
#endif

    glutTimerFunc(REFRESH_DELAY, timerEvent,0);
    glutMainLoop();
}