//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; // parse arguments char *filename; printf("Starting bicubicTexture\n"); if (checkCmdLineFlag(argc, (const char **) argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **) argv, "mode")) { g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode"); if (g_FilterMode < MODE_NEAREST && g_FilterMode > MODE_CATMULL_ROM) { printf("Invalid Mode setting %d\n", g_FilterMode); exit(EXIT_FAILURE); } } if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { dumpFilename = filename; fpsLimit = frameCheckNumber; // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification) runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode); } else { // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization initialize(argc, argv); glutMainLoop(); sdkDeleteTimer(&timer); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; // parse arguments char *filename; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("Starting bicubicTexture\n"); if (checkCmdLineFlag(argc, (const char **) argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **) argv, "mode")) { g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode"); if (g_FilterMode < 0 || g_FilterMode >= NUM_MODES) { printf("Invalid Mode setting %d\n", g_FilterMode); exit(EXIT_FAILURE); } } if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { dumpFilename = filename; fpsLimit = frameCheckNumber; // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification) runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode); } else { // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization initialize(argc, argv); glutMainLoop(); } exit(EXIT_SUCCESS); }
///////////////////////////////////////////////////// // Main program ///////////////////////////////////////////////////// int main(const int argc, const char **argv) { unsigned long start = START_SIZE; #ifdef NVS unsigned long end = END_NVS; #else /* sizeof(unsigned long) = 8 bytes */ unsigned long end = END_TITAN; #endif int stateDim = 1; /* default stateDim = 1 */ if (checkCmdLineFlag(argc, argv, "dim")) stateDim = getCmdLineArgumentInt(argc, argv, "dim"); char *typeInput = 0; getCmdLineArgumentString(argc, (const char**)argv, "type", &typeInput); if (0 != typeInput){ if (!strcasecmp(typeInput, "float")) runTest<float>(start, end, stateDim); else if (!strcasecmp(typeInput, "int")) runTest<int>(start, end, stateDim); else if (!strcasecmp(typeInput, "double")) runTest<double>(start, end, stateDim); } else runTest<double>(start, end, stateDim); exit(EXIT_SUCCESS); }
// Initialization code to find the best CUDA Device int findCudaDevice(int argc, const char **argv) { cudaDeviceProp deviceProp; int devID = 0; // If the command-line has a device number specified, use it if (checkCmdLineFlag(argc, argv, "device")) { devID = getCmdLineArgumentInt(argc, argv, "device="); if (devID < 0) { printf("Invalid command line parameters\n"); exit(-1); } else { devID = gpuDeviceInit(devID); if (devID < 0) { printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); exit(-1); } } } else { // Otherwise pick the device with highest Gflops/s devID = gpuGetMaxGflopsDeviceId(); checkCudaErrors( cudaSetDevice( devID ) ); checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) ); printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name); } return devID; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } } gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); printf("particles: %d\n", numParticles); if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } cudaInit(argc, argv); initParticleSystem(numParticles, gridSize); initParams(); if (numIterations <= 0) numIterations = 300; runBenchmark(numIterations, argv[0]); if (psystem) { delete psystem; } exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { char device_name[256]; char *ref_file = NULL; pArgc = &argc; pArgv = argv; printf("[%s] - Starting...\n", SDK_name); if (!findCUDADevice()) // Search for CUDA GPU { printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name); exit(EXIT_SUCCESS); } if (!dynlinkLoadD3D10API()) // Search for D3D API (locate drivers, does not mean device is found) { printf("> D3D10 API libraries NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } if (!findDXDevice(device_name)) // Search for D3D Hardware Device { printf("> D3D10 Graphics Device NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } // command line options if (argc > 1) { // automatied build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // run D3D10/CUDA test runTest(argc, argv, ref_file); // // and exit // printf("%s running on %s exiting...\n", SDK_name, device_name); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; char *ref_file = NULL; printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "file")) { fpsLimit = frameCheckNumber; getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } if (ref_file) { chooseCudaDevice(argc, argv, false); loadVolumeData(argv[0]); runAutoTest(ref_file, argv[0]); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); // use command-line specified CUDA device, otherwise use device with highest Gflops/s chooseCudaDevice(argc, argv, true); // OpenGL buffers initGLBuffers(); loadVolumeData(argv[0]); } printf("Press space to toggle animation\n" "Press '+' and '-' to change displayed slice\n"); #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif glutMainLoop(); exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { char device_name[256]; char *ref_file = NULL; pArgc = &argc; pArgv = argv; printf("[%s] - Starting...\n", SDK_name); if (!findCUDADevice()) // Search for CUDA GPU { printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name); exit(EXIT_SUCCESS); } if (!dynlinkLoadD3D10API()) // Search for D3D API (locate drivers, does not mean device is found) { printf("> D3D10 API libraries NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } if (!findDXDevice(device_name)) // Search for D3D Hardware Device { printf("> D3D10 Graphics Device NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } // command line options if (argc > 1) { // automatied build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // run D3D10/CUDA test runTest(argc, argv, ref_file); // // and exit // printf("%s running on %s exiting...\n", SDK_name, device_name); cudaDeviceReset(); exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { char device_name[256]; char *ref_file = NULL; pArgc = &argc; pArgv = argv; printf("> %s starting...\n", sSDKSample); if (!findCUDADevice()) // Search for CUDA GPU { printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name); exit(EXIT_SUCCESS); } if (!dynlinkLoadD3D10API()) // Search for D3D API (locate drivers, does not mean device is found) { printf("> D3D10 API libraries NOT found on.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } if (!findDXDevice(device_name)) // Search for D3D Hardware Device { printf("> D3D10 Graphics Device NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); } } runTest(argc, argv, ref_file); // // and exit // printf("%s running on %s exiting...\n", sSDKSample, device_name); printf("%s sample finished returned: %s\n", sSDKSample, (g_bPassed ? "OK" : "ERROR!")); exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char *argv[]) { if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printf("\n USAGE:"); printf("\n -pc-file='file path' to file containing point cloud list of points"); printf("\n file should have format: x y z r g b"); printf("\n where point coordinates xyz are floats"); printf("\n and point color channels rgb are 8 bit integers (0-255)"); printf("\n\n"); return 0; } Glib::RefPtr<Gtk::Application> app = Gtk::Application::create( argc, argv, "jacko.pc_render" ); PC_Render pc_render; return app->run(pc_render); }
//////////////////////////////////////////////////////////////////////////////// //! Check if the result is correct or write data to file for external //! regression testing //////////////////////////////////////////////////////////////////////////////// bool SaveResult(int argc, char **argv) { // Map vertex buffer float *data; if (FAILED(g_pVB->Map(D3D10_MAP_READ, 0, (void **)&data))) //Lock(0, 0, (void**)&data, 0))) return false; // Unmap g_pVB->Unmap(); // Save result if (checkCmdLineFlag(argc, (const char **) argv, "regression")) { // write file for regression test sdkWriteFile<float>("./data/regression.dat", data, sizeof(CUSTOMVERTEX), 0.0f, false); } return true; }
// Initialization code to find the best CUDA Device inline int findCudaDevice(int argc, const char **argv) { cudaDeviceProp deviceProp; int devID = 0; // If the command-line has a device number specified, use it if (checkCmdLineFlag(argc, argv, "device")) { devID = getCmdLineArgumentInt(argc, argv, "device="); if (devID < 0) { printf("Invalid command line parameter\n "); exit(EXIT_FAILURE); } else { devID = gpuDeviceInit(devID); if (devID < 0) { printf("exiting...\n"); exit(EXIT_FAILURE); } } } else { // Otherwise pick the device with highest Gflops/s devID = gpuGetMaxGflopsDeviceId(); checkCudaErrors(cudaSetDevice(devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); } return devID; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { printf("%s Starting...\n\n", sSDKsample); numParticles = NUM_PARTICLES; maxNumParticles = MAX_NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; printf("Surely I can get this far\n"); if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); fpsLimit = frameCheckNumber; numIterations = 1; } } //******************************************************* // RMK Hard code for cylindrical coords (y=theta=1) // DomainSize //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/RVert.txt"; char Zfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/ZVert.txt"; char Rfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/RVert.txt"; numVelNodes.x = filecount(Rfile); //-1; numVelNodes.z = filecount(Zfile); //-1; numVelNodes.y = 1; numCells.x = 80; //47; //24; //29; numCells.y = 1; numCells.z = 160; //188; //95; //88; numParticles = numCells.x*numCells.z*20; //avgnumparticles = 40 srand( time( NULL ) ); //numParticles = numCells.x*numCells.z*40; printf("vel grid: %d x %d x %d = %d cells\n", numVelNodes.x, numVelNodes.y, numVelNodes.z, numVelNodes.x*numVelNodes.y*numVelNodes.z); printf(" grid: %d x %d x %d = %d cells\n", numCells.x, numCells.y, numCells.z, numCells.x*numCells.y*numCells.z); //printf("vel grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0; if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } if (g_refFile) { cudaInit(argc, argv); } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -file=<*.bin>\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } initGL(&argc, argv); cudaGLInit(argc, argv); } // Moved code snippet to CellSystem //initCellSystem(gridSize); // now moved to particlesystem printf("Begin initialization\n"); //initParticleSystem(numParticles, gridSize, g_refFile==NULL); initParticleSystem(maxNumParticles, numParticles, numVelNodes, numCells, g_refFile==NULL); //printf("Finished with initParticleSystem, %d\n",g_refFile==NULL); //cin.ignore(); initParams(); printf("Finished with initialization\n"); if (!g_refFile) { initMenus(); } if (benchmark || g_refFile) { if (numIterations <= 0) { numIterations = 300; } runBenchmark(numIterations, argv[0]); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); atexit(cleanup); glutMainLoop(); } if (psystem) { delete psystem; } cudaDeviceReset(); exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
bool runTest(int argc, char **argv, ReduceType datatype) { int size = 1<<24; // number of elements to reduce int maxThreads = 256; // number of threads per block int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; if (checkCmdLineFlag(argc, (const char **) argv, "n")) { size = getCmdLineArgumentInt(argc, (const char **) argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "threads")) { maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **) argv, "kernel")) { whichKernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel"); } if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks")) { maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks"); } printf("%d elements\n", size); printf("%d threads (max)\n", maxThreads); cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal"); if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh")) { cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh"); } bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo"); if (runShmoo) { shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype); } else { // create random input data on CPU unsigned int bytes = size * sizeof(T); T *h_idata = (T *) malloc(bytes); for (int i=0; i<size; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) { h_idata[i] = (T)(rand() & 0xFF); } else { h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } } int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) { cpuFinalThreshold = 1; } // allocate mem for the result on host side T *h_odata = (T *) malloc(numBlocks*sizeof(T)); printf("%d blocks\n\n", numBlocks); // allocate device memory and data T *d_idata = NULL; T *d_odata = NULL; checkCudaErrors(cudaMalloc((void **) &d_idata, bytes)); checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T))); // copy data directly to device memory checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice)); // warm-up reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata); int testIterations = 100; StopWatchInterface *timer = 0; sdkCreateTimer(&timer); T gpu_result = 0; gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, testIterations, cpuFinalReduction, cpuFinalThreshold, timer, h_odata, d_idata, d_odata); double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3; printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); // compute reference solution T cpu_result = reduceCPU<T>(h_idata, size); int precision = 0; double threshold = 0; double diff = 0; if (datatype == REDUCE_INT) { printf("\nGPU result = %d\n", (int)gpu_result); printf("CPU result = %d\n\n", (int)cpu_result); } else { if (datatype == REDUCE_FLOAT) { precision = 8; threshold = 1e-8 * size; } else { precision = 12; threshold = 1e-12 * size; } printf("\nGPU result = %.*f\n", precision, (double)gpu_result); printf("CPU result = %.*f\n\n", precision, (double)cpu_result); diff = fabs((double)gpu_result - (double)cpu_result); } // cleanup sdkDeleteTimer(&timer); free(h_idata); free(h_odata); checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(d_odata)); if (datatype == REDUCE_INT) { return (gpu_result == cpu_result); } else { return (diff < threshold); } } return true; }
int main(int argc, char **argv) { char *dump_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **) &dump_file); int kernel = 1; if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); } runAutoTest(argc, argv, dump_file, kernel); } else { printf("[%s]\n", sSDKsample); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf("imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); exit(EXIT_FAILURE); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()); checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); initOpenGLBuffers(); } printf("Starting GLUT main loop...\n"); printf("Press [1] to view noisy image\n"); printf("Press [2] to view image restored with knn filter\n"); printf("Press [3] to view image restored with nlm filter\n"); printf("Press [4] to view image restored with modified nlm filter\n"); printf("Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is active\n"); printf("Press [f] to print frame rate\n"); printf("Press [?] to print Noise and Lerp Ct's\n"); printf("Press [q] to exit\n"); sdkCreateTimer(&timer); sdkStartTimer(&timer); glutMainLoop(); }
int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; char *ref_file = NULL; printf("%s Starting...\n\n", sSDKsample); //start logs if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { fpsLimit = frameCheckNumber; getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } if (ref_file) { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { int device = findCudaDevice(argc, (const char **)argv); if (device < 0) { printf("No CUDA Capable devices found, exiting...\n"); exit(EXIT_SUCCESS); } checkDeviceMeetComputeSpec(argc, argv); } else { int dev = findCapableDevice(argc, argv); if (dev != -1) { cudaSetDevice(dev); } else { cudaDeviceReset(); exit(EXIT_SUCCESS); } } } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n -file=output.bin\n\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); int dev = findCapableDevice(argc, argv); if (dev != -1) { cudaGLSetGLDevice(dev); } else { exit(EXIT_SUCCESS); } } // load volume data initData(argc, argv); printf( "Press \n" " 'SPACE' to toggle animation\n" " 'p' to toggle pre-integrated transfer function\n" " '+' and '-' to change density (0.01 increments)\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); if (ref_file) { runSingleTest(ref_file, argv[0]); } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); atexit(cleanup); glutMainLoop(); } cudaDeviceReset(); }
void initData(int argc, char **argv) { // parse arguments char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { volumeFilename = filename; } int n; if (checkCmdLineFlag(argc, (const char **) argv, "size")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "size"); volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (checkCmdLineFlag(argc, (const char **) argv, "xsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "xsize"); volumeSize.width = n; } if (checkCmdLineFlag(argc, (const char **) argv, "ysize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "ysize"); volumeSize.height = n; } if (checkCmdLineFlag(argc, (const char **) argv, "zsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "zsize"); volumeSize.depth = n; } char *path = sdkFindFilePath(volumeFilename, argv[0]); if (path == 0) { printf("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); FilterKernel_init(); Volume_init(&volumeOriginal,volumeSize, h_volume, 0); free(h_volume); Volume_init(&volumeFilter0, volumeSize, NULL, 1); Volume_init(&volumeFilter1, volumeSize, NULL, 1); VolumeRender_init(); VolumeRender_setPreIntegrated(preIntegrated); VolumeRender_setVolume(&volumeOriginal); sdkCreateTimer(&timer); sdkCreateTimer(&animationTimer); sdkStartTimer(&animationTimer); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { char device_name[NAME_LEN]; char *ref_file = NULL; pArgc = &argc; pArgv = argv; printf("[%s] - Starting...\n", SDK_name); if (!findGraphicsGPU(device_name)) { printf("> %s not supported on \"%s\" exiting...\n", SDK_name, device_name); exit(EXIT_SUCCESS); } // command line options if (argc > 1) { // automatied build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // // create window // // Register the window class #if 1 WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L, GetModuleHandle(NULL), NULL, NULL, NULL, NULL, "CUDA/D3D9 Texture InterOP", NULL }; RegisterClassEx(&wc); int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME); int yMenu = ::GetSystemMetrics(SM_CYMENU); int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME); // Create the application's window (padding by window border for uniform BB sizes across OSs) HWND hWnd = CreateWindow(wc.lpszClassName, "CUDA/D3D9 Texture InterOP", WS_OVERLAPPEDWINDOW, 0, 0, g_WindowWidth + 2*xBorder, g_WindowHeight+ 2*yBorder+yMenu, NULL, NULL, wc.hInstance, NULL); #else static WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L, GetModuleHandle(NULL), NULL, NULL, NULL, NULL, "CudaD3D9Tex", NULL }; RegisterClassEx(&wc); HWND hWnd = CreateWindow( "CudaD3D9Tex", "CUDA D3D9 Texture Interop", WS_OVERLAPPEDWINDOW, 0, 0, 800, 320, GetDesktopWindow(), NULL, wc.hInstance, NULL); #endif ShowWindow(hWnd, SW_SHOWDEFAULT); UpdateWindow(hWnd); // Initialize Direct3D if (SUCCEEDED(InitD3D9(hWnd)) && SUCCEEDED(InitCUDA()) && SUCCEEDED(InitTextures())) { if (!g_bDeviceLost) { RegisterD3D9ResourceWithCUDA(); } } // // the main loop // while (false == g_bDone) { RunCUDA(); DrawScene(); // // handle I/O // MSG msg; ZeroMemory(&msg, sizeof(msg)); while (msg.message!=WM_QUIT) { if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE)) { TranslateMessage(&msg); DispatchMessage(&msg); } else { RunCUDA(); DrawScene(); if (ref_file) { for (int count=0; count<g_iFrameToCompare; count++) { RunCUDA(); DrawScene(); } const char *cur_image_path = "simpleD3D9Texture.ppm"; // Save a reference of our current test run image CheckRenderD3D9::BackbufferToPPM(g_pD3DDevice, cur_image_path); // compare to offical reference image, printing PASS or FAIL. g_bPassed = CheckRenderD3D9::PPMvsPPM(cur_image_path, ref_file, argv[0], MAX_EPSILON, 0.15f); g_bDone = true; Cleanup(); PostQuitMessage(0); } } } }; // Unregister windows class UnregisterClass(wc.lpszClassName, wc.hInstance); // // and exit // printf("> %s running on %s exiting...\n", SDK_name, device_name); exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE); }
////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { bool bTestResults = true; if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printf("\n> Command line options\n"); showHelp(); return 0; } printf("Run \"nbody -benchmark [-numbodies=<numBodies>]\" to measure perfomance.\n"); showHelp(); bFullscreen = (checkCmdLineFlag(argc, (const char **) argv, "fullscreen") != 0); if (bFullscreen) { bShowSliders = false; } benchmark = (checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0); compareToCPU = ((checkCmdLineFlag(argc, (const char **) argv, "compare") != 0) || (checkCmdLineFlag(argc, (const char **) argv, "qatest") != 0)); QATest = (checkCmdLineFlag(argc, (const char **) argv, "qatest") != 0); useHostMem = (checkCmdLineFlag(argc, (const char **) argv, "hostmem") != 0); fp64 = (checkCmdLineFlag(argc, (const char **) argv, "fp64") != 0); flopsPerInteraction = fp64 ? 30 : 20; useCpu = (checkCmdLineFlag(argc, (const char **) argv, "cpu") != 0); if (checkCmdLineFlag(argc, (const char **)argv, "numdevices")) { numDevsRequested = getCmdLineArgumentInt(argc, (const char **) argv, "numdevices"); if (numDevsRequested < 1) { printf("Error: \"number of CUDA devices\" specified %d is invalid. Value should be >= 1\n", numDevsRequested); exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE); } else { printf("number of CUDA devices = %d\n", numDevsRequested); } } // for multi-device we currently require using host memory -- the devices share // data via the host if (numDevsRequested > 1) { useHostMem = true; } int numDevsAvailable = 0; bool customGPU = false; cudaGetDeviceCount(&numDevsAvailable); if (numDevsAvailable < numDevsRequested) { printf("Error: only %d Devices available, %d requested. Exiting.\n", numDevsAvailable, numDevsRequested); exit(EXIT_SUCCESS); } printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); printf("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video"); printf("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); printf("> %d Devices used for simulation\n", numDevsRequested); int devID; cudaDeviceProp props; if (useCpu) { useHostMem = true; compareToCPU = false; bSupportDouble = true; #ifdef OPENMP printf("> Simulation with CPU using OpenMP\n"); #else printf("> Simulation with CPU\n"); #endif } // Initialize GL and GLUT if necessary if (!benchmark && !compareToCPU) { initGL(&argc, argv); initParameters(); } if(!useCpu) { // Now choose the CUDA Device // Either without GL interop: if (benchmark || compareToCPU || useHostMem) { // Note if we are using host memory for the body system, we // don't use CUDA-GL interop. if (checkCmdLineFlag(argc, (const char **)argv, "device")) { customGPU = true; } devID = findCudaDevice(argc, (const char **)argv); } else // or with GL interop: { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { customGPU = true; } devID = findCudaGLDevice(argc, (const char **)argv); } checkCudaErrors(cudaGetDevice(&devID)); checkCudaErrors(cudaGetDeviceProperties(&props, devID)); bSupportDouble = true; #if CUDART_VERSION < 4000 if (numDevsRequested > 1) { printf("MultiGPU n-body requires CUDA 4.0 or later\n"); cudaDeviceReset(); exit(EXIT_SUCCESS); } #endif // Initialize devices if (numDevsRequested > 1 && customGPU) { printf("You can't use --numdevices and --device at the same time.\n"); exit(EXIT_SUCCESS); } if (customGPU) { cudaDeviceProp props; checkCudaErrors(cudaGetDeviceProperties(&props, devID)); printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); } else { for (int i = 0; i < numDevsRequested; i++) { cudaDeviceProp props; checkCudaErrors(cudaGetDeviceProperties(&props, i)); printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); if (useHostMem) { #if CUDART_VERSION >= 2020 if (!props.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n", devID); cudaDeviceReset(); exit(EXIT_SUCCESS); } if (numDevsRequested > 1) { checkCudaErrors(cudaSetDevice(i)); } checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); #else fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n"); cudaDeviceReset(); exit(EXIT_SUCCESS); #endif } } // CC 1.2 and earlier do not support double precision if (props.major*10 + props.minor <= 12) { bSupportDouble = false; } } //if(numDevsRequested > 1) // checkCudaErrors(cudaSetDevice(devID)); if (fp64 && !bSupportDouble) { fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n"); cudaDeviceReset(); exit(EXIT_SUCCESS); } } numIterations = 0; p = 0; q = 1; if (checkCmdLineFlag(argc, (const char **)argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i"); } if (checkCmdLineFlag(argc, (const char **) argv, "p")) { p = getCmdLineArgumentInt(argc, (const char **)argv, "p"); } if (checkCmdLineFlag(argc, (const char **) argv, "q")) { q = getCmdLineArgumentInt(argc, (const char **)argv, "q"); } if (p == 0) // p not set on command line { p = 256; if (q * p > 256) { p = 256 / q; printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256); } } // default number of bodies is #SMs * 4 * CTA size if (useCpu) #ifdef OPENMP numBodies = 8192; #else numBodies = 4096; #endif else if (numDevsRequested == 1)
int main(int argc, char *argv[]) { printf("%s Starting...\n\n", argv[0]); try { std::string sFilename; char *filePath; // set your own FreeImage error handler FreeImage_SetOutputMessage(FreeImageErrorHandler); cudaDeviceInit(argc, (const char **)argv); // Min spec is SM 1.0 devices if (printfNPPinfo(argc, argv, 1, 0) == false) { cudaDeviceReset(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "input")) { getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); } else { filePath = sdkFindFilePath("Lena.pgm", argv[0]); } if (filePath) { sFilename = filePath; } else { sFilename = "Lena.pgm"; } // if we specify the filename at the command line, then we only test sFilename // otherwise we will check both sFilename[0,1] int file_errors = 0; std::ifstream infile(sFilename.data(), std::ifstream::in); if (infile.good()) { std::cout << "freeImageInteropNPP opened: <" << sFilename.data() << "> successfully!" << std::endl; file_errors = 0; infile.close(); } else { std::cout << "freeImageInteropNPP unable to open: <" << sFilename.data() << ">" << std::endl; file_errors++; infile.close(); } if (file_errors > 0) { exit(EXIT_FAILURE); } std::string sResultFilename = sFilename; std::string::size_type dot = sResultFilename.rfind('.'); if (dot != std::string::npos) { sResultFilename = sResultFilename.substr(0, dot); } sResultFilename += "_boxFilterFII.pgm"; if (checkCmdLineFlag(argc, (const char **)argv, "output")) { char *outputFilePath; getCmdLineArgumentString(argc, (const char **)argv, "output", &outputFilePath); sResultFilename = outputFilePath; } FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(sFilename.c_str()); // no signature? try to guess the file format from the file extension if (eFormat == FIF_UNKNOWN) { eFormat = FreeImage_GetFIFFromFilename(sFilename.c_str()); } NPP_ASSERT(eFormat != FIF_UNKNOWN); // check that the plugin has reading capabilities ... FIBITMAP *pBitmap; if (FreeImage_FIFSupportsReading(eFormat)) { pBitmap = FreeImage_Load(eFormat, sFilename.c_str()); } NPP_ASSERT(pBitmap != 0); // Dump the bitmap information to the console std::cout << (*pBitmap) << std::endl; // make sure this is an 8-bit single channel image NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK); NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8); unsigned int nImageWidth = FreeImage_GetWidth(pBitmap); unsigned int nImageHeight = FreeImage_GetHeight(pBitmap); unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap); unsigned char *pSrcData = FreeImage_GetBits(pBitmap); int nSrcPitchCUDA; Npp8u *pSrcImageCUDA = nppiMalloc_8u_C1(nImageWidth, nImageHeight, &nSrcPitchCUDA); NPP_ASSERT_NOT_NULL(pSrcImageCUDA); // copy image loaded via FreeImage to into CUDA device memory, i.e. // transfer the image-data up to the GPU's video-memory NPP_CHECK_CUDA(cudaMemcpy2D(pSrcImageCUDA, nSrcPitchCUDA, pSrcData, nSrcPitch, nImageWidth, nImageHeight, cudaMemcpyHostToDevice)); // define size of the box filter const NppiSize oMaskSize = {7, 7}; const NppiPoint oMaskAchnor = {0, 0}; // compute maximal result image size const NppiSize oSizeROI = {(int)nImageWidth - (oMaskSize.width - 1), (int)nImageHeight - (oMaskSize.height - 1) }; // allocate result image memory int nDstPitchCUDA; Npp8u *pDstImageCUDA = nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA); NPP_ASSERT_NOT_NULL(pDstImageCUDA); NPP_CHECK_NPP(nppiFilterBox_8u_C1R(pSrcImageCUDA, nSrcPitchCUDA, pDstImageCUDA, nDstPitchCUDA, oSizeROI, oMaskSize, oMaskAchnor)); // create the result image storage using FreeImage so we can easily // save FIBITMAP *pResultBitmap = FreeImage_Allocate(oSizeROI.width, oSizeROI.height, 8 /* bits per pixel */); NPP_ASSERT_NOT_NULL(pResultBitmap); unsigned int nResultPitch = FreeImage_GetPitch(pResultBitmap); unsigned char *pResultData = FreeImage_GetBits(pResultBitmap); NPP_CHECK_CUDA(cudaMemcpy2D(pResultData, nResultPitch, pDstImageCUDA, nDstPitchCUDA, oSizeROI.width, oSizeROI.height, cudaMemcpyDeviceToHost)); // now save the result image bool bSuccess; bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, sResultFilename.c_str(), 0) == TRUE; NPP_ASSERT_MSG(bSuccess, "Failed to save result image."); //free nppiImage nppiFree(pSrcImageCUDA); nppiFree(pDstImageCUDA); cudaDeviceReset(); exit(EXIT_SUCCESS); } catch (npp::Exception &rException) { std::cerr << "Program error! The following exception occurred: \n"; std::cerr << rException << std::endl; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } catch (...) { std::cerr << "Program error! An unknow type of exception occurred. \n"; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // start logs int devID; char *ref_file = NULL; printf("%s Starting...\n\n", argv[0]); #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "radius")) { filter_radius = getCmdLineArgumentInt(argc, (const char **) argv, "radius"); } if (checkCmdLineFlag(argc, (const char **)argv, "passes")) { iterations = getCmdLineArgumentInt(argc, (const char **)argv, "passes"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); } } // load image to process loadImageData(argc, argv); if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { // This is a separate mode of the sample, where we are benchmark the kernels for performance devID = findCudaDevice(argc, (const char **)argv); // Running CUDA kernels (bilateralfilter) in Benchmarking mode g_TotalErrors += runBenchmark(argc, argv); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else if (checkCmdLineFlag(argc, (const char **)argv, "radius") || checkCmdLineFlag(argc, (const char **)argv, "passes")) { // This overrides the default mode. Users can specify the radius used by the filter kernel devID = findCudaDevice(argc, (const char **)argv); g_TotalErrors += runSingleTest(ref_file, argv[0]); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { // Default mode running with OpenGL visualization and in automatic mode // the output automatically changes animation printf("\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(argc, (char **)argv); int dev = findCapableDevice(argc, argv); if (dev != -1) { dev = gpuGLDeviceInit(argc, (const char **)argv); if (dev == -1) { exit(EXIT_FAILURE); } } else { // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } // Now we can create a CUDA context and bind it to the OpenGL context initCuda(); initGLResources(); // sets the callback function so it will call cleanup upon exit #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif printf("Running Standard Demonstration with GLUT loop...\n\n"); printf("Press '+' and '-' to change filter width\n" "Press ']' and '[' to change number of iterations\n" "Press 'e' and 'E' to change Euclidean delta\n" "Press 'g' and 'G' to changle Gaussian delta\n" "Press 'a' or 'A' to change Animation mode ON/OFF\n\n"); // Main OpenGL loop that will run visualization for every vsync glutMainLoop(); } }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; char *ref_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); fpsLimit = frameCheckNumber; } } // Get the path of the filename char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename)) { image_filename = filename; } // load image char *image_path = sdkFindFilePath(image_filename, argv[0]); if (image_path == NULL) { fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); exit(EXIT_FAILURE); } sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); if (!h_img) { printf("Error unable to load PPM file: '%s'\n", image_path); exit(EXIT_FAILURE); } printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) { sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma"); } runBenchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark"); int device; struct cudaDeviceProp prop; cudaGetDevice(&device); cudaGetDeviceProperties(&prop, device); if (!strncmp("Tesla", prop.name, 5)) { printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n"); // runBenchmark = true; runBenchmark = true; } // Benchmark or AutoTest mode detected, no OpenGL if (runBenchmark == true || ref_file != NULL) { findCudaDevice(argc, (const char **)argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); findCudaGLDevice(argc, (const char **)argv); } initCudaBuffers(); if (ref_file) { printf("(Automated Testing)\n"); bool testPassed = runSingleTest(ref_file, argv[0]); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE); } if (runBenchmark) { printf("(Run Benchmark)\n"); benchmark(100); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } initGLBuffers(); glutMainLoop(); exit(EXIT_SUCCESS); }
/* Solve Ax=b using the conjugate gradient method a) without any preconditioning, b) using an Incomplete Cholesky preconditioner and c) using an ILU0 preconditioner. */ int main(int argc, char **argv) { const int max_iter = 1000; int k, M = 0, N = 0, nz = 0, *I = NULL, *J = NULL; int *d_col, *d_row; int qatest = 0; const float tol = 1e-12f; float *x, *rhs; float r0, r1, alpha, beta; float *d_val, *d_x; float *d_zm1, *d_zm2, *d_rm2; float *d_r, *d_p, *d_omega, *d_y; float *val = NULL; float *d_valsILU0; float *valsILU0; float rsum, diff, err = 0.0; float qaerr1, qaerr2 = 0.0; float dot, numerator, denominator, nalpha; const float floatone = 1.0; const float floatzero = 0.0; int nErrors = 0; printf("conjugateGradientPrecond starting...\n"); /* QA testing mode */ if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { qatest = 1; } /* This will pick the best possible CUDA capable device */ cudaDeviceProp deviceProp; int devID = findCudaDevice(argc, (const char **)argv); printf("GPU selected Device ID = %d \n", devID); if (devID < 0) { printf("Invalid GPU device %d selected, exiting...\n", devID); exit(EXIT_SUCCESS); } checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); /* Statistics about the GPU device */ printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = (deviceProp.major * 0x10 + deviceProp.minor); if (version < 0x11) { printf("%s: requires a minimum CUDA compute 1.1 capability\n", sSDKname); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } /* Generate a random tridiagonal symmetric matrix in CSR (Compressed Sparse Row) format */ M = N = 16384; nz = 5*N-4*(int)sqrt((double)N); I = (int *)malloc(sizeof(int)*(N+1)); // csr row pointers for matrix A J = (int *)malloc(sizeof(int)*nz); // csr column indices for matrix A val = (float *)malloc(sizeof(float)*nz); // csr values for matrix A x = (float *)malloc(sizeof(float)*N); rhs = (float *)malloc(sizeof(float)*N); for (int i = 0; i < N; i++) { rhs[i] = 0.0; // Initialize RHS x[i] = 0.0; // Initial approximation of solution } genLaplace(I, J, val, M, N, nz, rhs); /* Create CUBLAS context */ cublasHandle_t cublasHandle = 0; cublasStatus_t cublasStatus; cublasStatus = cublasCreate(&cublasHandle); checkCudaErrors(cublasStatus); /* Create CUSPARSE context */ cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); checkCudaErrors(cusparseStatus); /* Description of the A matrix*/ cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); checkCudaErrors(cusparseStatus); /* Define the properties of the matrix */ cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); /* Allocate required memory */ checkCudaErrors(cudaMalloc((void **)&d_col, nz*sizeof(int))); checkCudaErrors(cudaMalloc((void **)&d_row, (N+1)*sizeof(int))); checkCudaErrors(cudaMalloc((void **)&d_val, nz*sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_x, N*sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_y, N*sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_r, N*sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_p, N*sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_omega, N*sizeof(float))); cudaMemcpy(d_col, J, nz*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_row, I, (N+1)*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_val, val, nz*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_r, rhs, N*sizeof(float), cudaMemcpyHostToDevice); /* Conjugate gradient without preconditioning. ------------------------------------------ Follows the description by Golub & Van Loan, "Matrix Computations 3rd ed.", Section 10.2.6 */ printf("Convergence of conjugate gradient without preconditioning: \n"); k = 0; r0 = 0; cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1); while (r1 > tol*tol && k <= max_iter) { k++; if (k == 1) { cublasScopy(cublasHandle, N, d_r, 1, d_p, 1); } else { beta = r1/r0; cublasSscal(cublasHandle, N, &beta, d_p, 1); cublasSaxpy(cublasHandle, N, &floatone, d_r, 1, d_p, 1) ; } cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &floatone, descr, d_val, d_row, d_col, d_p, &floatzero, d_omega); cublasSdot(cublasHandle, N, d_p, 1, d_omega, 1, &dot); alpha = r1/dot; cublasSaxpy(cublasHandle, N, &alpha, d_p, 1, d_x, 1); nalpha = -alpha; cublasSaxpy(cublasHandle, N, &nalpha, d_omega, 1, d_r, 1); r0 = r1; cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1); } printf(" iteration = %3d, residual = %e \n", k, sqrt(r1)); cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost); /* check result */ err = 0.0; for (int i = 0; i < N; i++) { rsum = 0.0; for (int j = I[i]; j < I[i+1]; j++) { rsum += val[j]*x[J[j]]; } diff = fabs(rsum - rhs[i]); if (diff > err) { err = diff; } } printf(" Convergence Test: %s \n", (k <= max_iter) ? "OK" : "FAIL"); nErrors += (k > max_iter) ? 1 : 0; qaerr1 = err; if (0) { // output result in matlab-style array int n=(int)sqrt((double)N); printf("a = [ "); for (int iy=0; iy<n; iy++) { for (int ix=0; ix<n; ix++) { printf(" %f ", x[iy*n+ix]); } if (iy == n-1) { printf(" ]"); } printf("\n"); } } /* Preconditioned Conjugate Gradient using ILU. -------------------------------------------- Follows the description by Golub & Van Loan, "Matrix Computations 3rd ed.", Algorithm 10.3.1 */ printf("\nConvergence of conjugate gradient using incomplete LU preconditioning: \n"); int nzILU0 = 2*N-1; valsILU0 = (float *) malloc(nz*sizeof(float)); checkCudaErrors(cudaMalloc((void **)&d_valsILU0, nz*sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_zm1, (N)*sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_zm2, (N)*sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_rm2, (N)*sizeof(float))); /* create the analysis info object for the A matrix */ cusparseSolveAnalysisInfo_t infoA = 0; cusparseStatus = cusparseCreateSolveAnalysisInfo(&infoA); checkCudaErrors(cusparseStatus); /* Perform the analysis for the Non-Transpose case */ cusparseStatus = cusparseScsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nz, descr, d_val, d_row, d_col, infoA); checkCudaErrors(cusparseStatus); /* Copy A data to ILU0 vals as input*/ cudaMemcpy(d_valsILU0, d_val, nz*sizeof(float), cudaMemcpyDeviceToDevice); /* generate the Incomplete LU factor H for the matrix A using cudsparseScsrilu0 */ cusparseStatus = cusparseScsrilu0(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descr, d_valsILU0, d_row, d_col, infoA); checkCudaErrors(cusparseStatus); /* Create info objects for the ILU0 preconditioner */ cusparseSolveAnalysisInfo_t info_u; cusparseCreateSolveAnalysisInfo(&info_u); cusparseMatDescr_t descrL = 0; cusparseStatus = cusparseCreateMatDescr(&descrL); cusparseSetMatType(descrL,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descrL,CUSPARSE_INDEX_BASE_ZERO); cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER); cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_UNIT); cusparseMatDescr_t descrU = 0; cusparseStatus = cusparseCreateMatDescr(&descrU); cusparseSetMatType(descrU,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descrU,CUSPARSE_INDEX_BASE_ZERO); cusparseSetMatFillMode(descrU, CUSPARSE_FILL_MODE_UPPER); cusparseSetMatDiagType(descrU, CUSPARSE_DIAG_TYPE_NON_UNIT); cusparseStatus = cusparseScsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nz, descrU, d_val, d_row, d_col, info_u); /* reset the initial guess of the solution to zero */ for (int i = 0; i < N; i++) { x[i] = 0.0; } checkCudaErrors(cudaMemcpy(d_r, rhs, N*sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice)); k = 0; cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1); while (r1 > tol*tol && k <= max_iter) { // Forward Solve, we can re-use infoA since the sparsity pattern of A matches that of L cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, &floatone, descrL, d_valsILU0, d_row, d_col, infoA, d_r, d_y); checkCudaErrors(cusparseStatus); // Back Substitution cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, &floatone, descrU, d_valsILU0, d_row, d_col, info_u, d_y, d_zm1); checkCudaErrors(cusparseStatus); k++; if (k == 1) { cublasScopy(cublasHandle, N, d_zm1, 1, d_p, 1); } else { cublasSdot(cublasHandle, N, d_r, 1, d_zm1, 1, &numerator); cublasSdot(cublasHandle, N, d_rm2, 1, d_zm2, 1, &denominator); beta = numerator/denominator; cublasSscal(cublasHandle, N, &beta, d_p, 1); cublasSaxpy(cublasHandle, N, &floatone, d_zm1, 1, d_p, 1) ; } cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nzILU0, &floatone, descrU, d_val, d_row, d_col, d_p, &floatzero, d_omega); cublasSdot(cublasHandle, N, d_r, 1, d_zm1, 1, &numerator); cublasSdot(cublasHandle, N, d_p, 1, d_omega, 1, &denominator); alpha = numerator / denominator; cublasSaxpy(cublasHandle, N, &alpha, d_p, 1, d_x, 1); cublasScopy(cublasHandle, N, d_r, 1, d_rm2, 1); cublasScopy(cublasHandle, N, d_zm1, 1, d_zm2, 1); nalpha = -alpha; cublasSaxpy(cublasHandle, N, &nalpha, d_omega, 1, d_r, 1); cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1); } printf(" iteration = %3d, residual = %e \n", k, sqrt(r1)); cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost); /* check result */ err = 0.0; for (int i = 0; i < N; i++) { rsum = 0.0; for (int j = I[i]; j < I[i+1]; j++) { rsum += val[j]*x[J[j]]; } diff = fabs(rsum - rhs[i]); if (diff > err) { err = diff; } } printf(" Convergence Test: %s \n", (k <= max_iter) ? "OK" : "FAIL"); nErrors += (k > max_iter) ? 1 : 0; qaerr2 = err; /* Destroy parameters */ cusparseDestroySolveAnalysisInfo(infoA); cusparseDestroySolveAnalysisInfo(info_u); /* Destroy contexts */ cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); /* Free device memory */ free(I); free(J); free(val); free(x); free(rhs); free(valsILU0); cudaFree(d_col); cudaFree(d_row); cudaFree(d_val); cudaFree(d_x); cudaFree(d_y); cudaFree(d_r); cudaFree(d_p); cudaFree(d_omega); cudaFree(d_valsILU0); cudaFree(d_zm1); cudaFree(d_zm2); cudaFree(d_rm2); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); printf(" Test Summary:\n"); printf(" Counted total of %d errors\n", nErrors); printf(" qaerr1 = %f qaerr2 = %f\n\n", fabs(qaerr1), fabs(qaerr2)); exit((nErrors == 0 &&fabs(qaerr1)<1e-5 && fabs(qaerr2) < 1e-5 ? EXIT_SUCCESS : EXIT_FAILURE)); }
/////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // Start logs shrQAStart(argc, argv); // initialize the GPU, either identified by --device // or by picking the device with highest flop rate. int devID = findCudaDevice(argc, (const char **)argv); // parsing the number of random numbers to generate int rand_n = DEFAULT_RAND_N; if( checkCmdLineFlag(argc, (const char**) argv, "count") ) { rand_n = getCmdLineArgumentInt(argc, (const char**) argv, "count"); } printf("Allocating data for %i samples...\n", rand_n); // parsing the seed int seed = DEFAULT_SEED; if( checkCmdLineFlag(argc, (const char**) argv, "seed") ) { seed = getCmdLineArgumentInt(argc, (const char**) argv, "seed"); } printf("Seeding with %i ...\n", seed); float *d_Rand; checkCudaErrors( cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)) ); curandGenerator_t prngGPU; checkCurandErrors( curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngGPU, seed) ); curandGenerator_t prngCPU; checkCurandErrors( curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngCPU, seed) ); // // Example 1: Compare random numbers generated on GPU and CPU float *h_RandGPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on GPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); printf("\nReading back the results...\n"); checkCudaErrors( cudaMemcpy(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost) ); float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on CPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngCPU, (float*) h_RandCPU, rand_n) ); printf("Comparing CPU/GPU random numbers...\n\n"); float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); // // Example 2: Timing of random number generation on GPU const int numIterations = 10; int i; StopWatchInterface *hTimer; checkCudaErrors( cudaDeviceSynchronize() ); sdkCreateTimer(&hTimer); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (i = 0; i < numIterations; i++) { checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); } checkCudaErrors( cudaDeviceSynchronize() ); sdkStopTimer(&hTimer); double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations; printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); printf("Shutting down...\n"); checkCurandErrors( curandDestroyGenerator(prngGPU) ); checkCurandErrors( curandDestroyGenerator(prngCPU) ); checkCudaErrors( cudaFree(d_Rand) ); sdkDeleteTimer( &hTimer); free(h_RandGPU); free(h_RandCPU); cudaDeviceReset(); shrQAFinishExit(argc, (const char**)argv, (L1norm < 1e-6) ? QA_PASSED : QA_FAILED); }
void initialize(int argc, char **argv) { printf("[%s] (OpenGL Mode)\n", sSDKsample); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); int devID; cudaDeviceProp deviceProps; if (checkCmdLineFlag(argc, (const char **)argv, "device")) { devID = gpuGLDeviceInit(argc, (const char **)argv); if (devID < 0) { printf("exiting...\n"); exit(EXIT_SUCCESS); } } else { devID = gpuGetMaxGflopsDeviceId(); cudaGLSetGLDevice(devID); } // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // Create the timer (for fps measurement) sdkCreateTimer(&timer); // load image from disk loadImageData(argc, argv); printf("\n" "\tControls\n" "\t=/- : Zoom in/out\n" "\tb : Run Benchmark g_FilterMode\n" "\tc : Draw Bicubic Spline Curve\n" "\t[esc] - Quit\n\n" "\tPress number keys to change filtering g_FilterMode:\n\n" "\t1 : nearest filtering\n" "\t2 : bilinear filtering\n" "\t3 : bicubic filtering\n" "\t4 : fast bicubic filtering\n" "\t5 : Catmull-Rom filtering\n\n" ); initGLBuffers(); #if USE_BUFFER_TEX fprog = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shaderCode); if (!fprog) { exit(EXIT_SUCCESS); } #endif }
int main(int argc, char **argv) { int devID; cudaDeviceProp deviceProps; printf("%s Starting...\n\n", sSDKname); printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (false == initGL(&argc, argv)) { exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s #ifndef OPTIMUS devID = findCudaGLDevice(argc, (const char **)argv); #else devID = gpuGetMaxGflopsDeviceId(); #endif // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // Allocate and initialize host data GLint bsize; sdkCreateTimer(&timer); sdkResetTimer(&timer); hvfield = (cData *)malloc(sizeof(cData) * DS); memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); setupTexture(DIM, DIM); bindTexture(); // Create particle array in host memory particles = (cData *)malloc(sizeof(cData) * DS); memset(particles, 0, sizeof(cData) * DS); #ifdef BROADCAST int step = 1; // Broadcasted visualization stepping. if (argc > 3) step = atoi(argv[3]); // Create additional space to store particle packets // for broadcasting. wstep = step; hstep = step; int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize; if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize) npackets++; packets = (char*)malloc(npackets * (UdpBroadcastServer::PacketSize + sizeof(unsigned int))); #endif initParticles(particles, DIM, DIM); #if defined(OPTIMUS) || defined(BROADCAST) // Create particle array in device memory cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS); cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice); #endif // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); glGenBuffersARB(1, &vbo); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize); if (bsize != (sizeof(cData) * DS)) goto EXTERR; glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); #ifndef OPTIMUS checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); #endif if (ref_file) { autoTest(argv); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { #ifdef BROADCAST const char *sv_addr = "127.0.0:9097"; const char *bc_addr = "127.255.255.2:9097"; // Server address if (argc > 2) sv_addr = argv[2]; // Broadcast address if (argc > 1) bc_addr = argv[1]; server.reset(new UdpBroadcastServer(sv_addr, bc_addr)); // Listen to clients' feedbacks in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &feedback_listener, &step); } // Broadcast the particles state in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &broadcaster, &step); } #endif #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif glutMainLoop(); } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (!ref_file) { exit(EXIT_SUCCESS); } return 0; EXTERR: printf("Failed to initialize GL extensions.\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_FAILURE); }
int main(int argc, char **argv) { char *multiMethodChoice = NULL; char *scalingChoice = NULL; bool use_threads = true; bool bqatest = false; bool strongScaling = false; pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", argv[0]); if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { bqatest = true; } getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice); getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice); if (checkCmdLineFlag(argc, (const char **)argv, "h") || checkCmdLineFlag(argc, (const char **)argv, "help")) { usage(); exit(EXIT_SUCCESS); } if (multiMethodChoice == NULL) { use_threads = true; } else { if (!strcasecmp(multiMethodChoice, "threaded")) { use_threads = true; } else { use_threads = false; } } if (use_threads == false) { printf("Using single CPU thread for multiple GPUs\n"); } if (scalingChoice == NULL) { strongScaling = false; } else { if (!strcasecmp(scalingChoice, "strong")) { strongScaling = true; } else { strongScaling = false; } } //GPU number present in the system int GPU_N; checkCudaErrors(cudaGetDeviceCount(&GPU_N)); int nOptions = 256; nOptions = adjustProblemSize(GPU_N, nOptions); // select problem size int scale = (strongScaling) ? 1 : GPU_N; int OPT_N = nOptions * scale; int PATH_N = 262144; const unsigned long long SEED = 777; // initialize the timers hTimer = new StopWatchInterface*[GPU_N]; for (int i=0; i<GPU_N; i++) { sdkCreateTimer(&hTimer[i]); sdkResetTimer(&hTimer[i]); } //Input data array TOptionData *optionData = new TOptionData[OPT_N]; //Final GPU MC results TOptionValue *callValueGPU = new TOptionValue[OPT_N]; //"Theoretical" call values by Black-Scholes formula float *callValueBS = new float[OPT_N]; //Solver config TOptionPlan *optionSolver = new TOptionPlan[GPU_N]; //OS thread ID CUTThread *threadID = new CUTThread[GPU_N]; int gpuBase, gpuIndex; int i; float time; double delta, ref, sumDelta, sumRef, sumReserve; printf("MonteCarloMultiGPU\n"); printf("==================\n"); printf("Parallelization method = %s\n", use_threads ? "threaded" : "streamed"); printf("Problem scaling = %s\n", strongScaling? "strong" : "weak"); printf("Number of GPUs = %d\n", GPU_N); printf("Total number of options = %d\n", OPT_N); printf("Number of paths = %d\n", PATH_N); printf("main(): generating input data...\n"); srand(123); for (i=0; i < OPT_N; i++) { optionData[i].S = randFloat(5.0f, 50.0f); optionData[i].X = randFloat(10.0f, 25.0f); optionData[i].T = randFloat(1.0f, 5.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; callValueGPU[i].Expected = -1.0f; callValueGPU[i].Confidence = -1.0f; } printf("main(): starting %i host threads...\n", GPU_N); //Get option count for each GPU for (i = 0; i < GPU_N; i++) { optionSolver[i].optionCount = OPT_N / GPU_N; } //Take into account cases with "odd" option counts for (i = 0; i < (OPT_N % GPU_N); i++) { optionSolver[i].optionCount++; } //Assign GPU option ranges gpuBase = 0; for (i = 0; i < GPU_N; i++) { optionSolver[i].device = i; optionSolver[i].optionData = optionData + gpuBase; optionSolver[i].callValue = callValueGPU + gpuBase; // all devices use the same global seed, but start // the sequence at a different offset optionSolver[i].seed = SEED; optionSolver[i].pathN = PATH_N; gpuBase += optionSolver[i].optionCount; } if (use_threads || bqatest) { //Start CPU thread for each GPU for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) { threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); } printf("main(): waiting for GPU results...\n"); cutWaitForThreads(threadID, GPU_N); printf("main(): GPU statistics, threaded\n"); for (i = 0; i < GPU_N; i++) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); time = sdkGetTimerValue(&hTimer[i]); printf("Total time (ms.): %f\n", time); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); } printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if (delta > 1e-6) { sumReserve += callValueGPU[i].Confidence / delta; } #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; } if (!use_threads || bqatest) { multiSolver(optionSolver, GPU_N); printf("main(): GPU statistics, streamed\n"); for (i = 0; i < GPU_N; i++) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); } time = sdkGetTimerValue(&hTimer[0]); printf("\nTotal time (ms.): %f\n", time); printf("\tNote: This is elapsed time for all to compute.\n"); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if (delta > 1e-6) { sumReserve += callValueGPU[i].Confidence / delta; } #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; } #ifdef DO_CPU printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; sumRef = 0; for (i = 0; i < OPT_N; i++) { MonteCarloCPU( callValueCPU, optionData[i], NULL, PATH_N ); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); ref = callValueCPU.Expected; sumDelta += delta; sumRef += fabs(ref); printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } printf("L1 norm: %E\n", sumDelta / sumRef); #endif printf("Shutting down...\n"); for (int i=0; i<GPU_N; i++) { sdkStartTimer(&hTimer[i]); checkCudaErrors(cudaSetDevice(i)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); } delete[] optionSolver; delete[] callValueBS; delete[] callValueGPU; delete[] optionData; delete[] threadID; delete[] hTimer; printf("Test Summary...\n"); printf("L1 norm : %E\n", sumDelta / sumRef); printf("Average reserve: %f\n", sumReserve); printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n"); exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE); }
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size) { // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line cudaError_t error; devID = 0; if (checkCmdLineFlag(argc, (const char **)argv, "device")) { devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); error = cudaSetDevice(devID); if (error != cudaSuccess) { printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } } // get number of SMs on this GPU error = cudaGetDevice(&devID); if (error != cudaSuccess) { printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) { iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult"); } iSizeMultiple = min(iSizeMultiple, 10); iSizeMultiple = max(iSizeMultiple, 1); cudaDeviceProp deviceProp; error = cudaGetDeviceProperties(&deviceProp, devID); if (error != cudaSuccess) { printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32; matrix_size.uiWA = 2 * block_size * iSizeMultiple; matrix_size.uiHA = 4 * block_size * iSizeMultiple; matrix_size.uiWB = 2 * block_size * iSizeMultiple; matrix_size.uiHB = 4 * block_size * iSizeMultiple; matrix_size.uiWC = 2 * block_size * iSizeMultiple; matrix_size.uiHC = 4 * block_size * iSizeMultiple; printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n", matrix_size.uiWA, matrix_size.uiHA, matrix_size.uiWB, matrix_size.uiHB, matrix_size.uiWC, matrix_size.uiHC); }
int main(int argc, char **argv) { int devID; cudaDeviceProp deviceProps; printf("%s Starting...\n\n", sSDKname); printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (false == initGL(&argc, argv)) { exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s devID = findCudaGLDevice(argc, (const char **)argv); // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // Allocate and initialize host data GLint bsize; sdkCreateTimer(&timer); sdkResetTimer(&timer); hvfield = (cData *)malloc(sizeof(cData) * DS); memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); setupTexture(DIM, DIM); bindTexture(); // Create particle array particles = (cData *)malloc(sizeof(cData) * DS); memset(particles, 0, sizeof(cData) * DS); initParticles(particles, DIM, DIM); // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); glGenBuffersARB(1, &vbo); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize); if (bsize != (sizeof(cData) * DS)) goto EXTERR; glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); if (ref_file) { autoTest(argv); cleanup(); cudaDeviceReset(); printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { atexit(cleanup); glutMainLoop(); } cudaDeviceReset(); if (!ref_file) { exit(EXIT_SUCCESS); } return 0; EXTERR: printf("Failed to initialize GL extensions.\n"); cudaDeviceReset(); exit(EXIT_FAILURE); }
int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printf("\nUsage: SobelFilter <options>\n"); printf("\t\t-mode=n (0=original, 1=texture, 2=smem + texture)\n"); printf("\t\t-file=ref_orig.pgm (ref_tex.pgm, ref_shared.pgm)\n\n"); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { g_bQAReadback = true; runAutoTest(argc, argv); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n\n\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()); sdkCreateTimer(&timer); sdkResetTimer(&timer); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); loadDefaultImage(argv[0]); // If code is not printing the USage, then we execute this path. printf("I: display Image (no filtering)\n"); printf("T: display Sobel Edge Detection (Using Texture)\n"); printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); fflush(stdout); #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); }