///////////////////////////////////////////////////// // Main program ///////////////////////////////////////////////////// int main(const int argc, const char **argv) { unsigned long start = START_SIZE; #ifdef NVS unsigned long end = END_NVS; #else /* sizeof(unsigned long) = 8 bytes */ unsigned long end = END_TITAN; #endif int stateDim = 1; /* default stateDim = 1 */ if (checkCmdLineFlag(argc, argv, "dim")) stateDim = getCmdLineArgumentInt(argc, argv, "dim"); char *typeInput = 0; getCmdLineArgumentString(argc, (const char**)argv, "type", &typeInput); if (0 != typeInput){ if (!strcasecmp(typeInput, "float")) runTest<float>(start, end, stateDim); else if (!strcasecmp(typeInput, "int")) runTest<int>(start, end, stateDim); else if (!strcasecmp(typeInput, "double")) runTest<double>(start, end, stateDim); } else runTest<double>(start, end, stateDim); exit(EXIT_SUCCESS); }
// Initialization code to find the best CUDA Device int findCudaDevice(int argc, const char **argv) { cudaDeviceProp deviceProp; int devID = 0; // If the command-line has a device number specified, use it if (checkCmdLineFlag(argc, argv, "device")) { devID = getCmdLineArgumentInt(argc, argv, "device="); if (devID < 0) { printf("Invalid command line parameters\n"); exit(-1); } else { devID = gpuDeviceInit(devID); if (devID < 0) { printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); exit(-1); } } } else { // Otherwise pick the device with highest Gflops/s devID = gpuGetMaxGflopsDeviceId(); checkCudaErrors( cudaSetDevice( devID ) ); checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) ); printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name); } return devID; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } } gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); printf("particles: %d\n", numParticles); if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } cudaInit(argc, argv); initParticleSystem(numParticles, gridSize); initParams(); if (numIterations <= 0) numIterations = 300; runBenchmark(numIterations, argv[0]); if (psystem) { delete psystem; } exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; // parse arguments char *filename; printf("Starting bicubicTexture\n"); if (checkCmdLineFlag(argc, (const char **) argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **) argv, "mode")) { g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode"); if (g_FilterMode < MODE_NEAREST && g_FilterMode > MODE_CATMULL_ROM) { printf("Invalid Mode setting %d\n", g_FilterMode); exit(EXIT_FAILURE); } } if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { dumpFilename = filename; fpsLimit = frameCheckNumber; // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification) runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode); } else { // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization initialize(argc, argv); glutMainLoop(); sdkDeleteTimer(&timer); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; // parse arguments char *filename; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("Starting bicubicTexture\n"); if (checkCmdLineFlag(argc, (const char **) argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **) argv, "mode")) { g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode"); if (g_FilterMode < 0 || g_FilterMode >= NUM_MODES) { printf("Invalid Mode setting %d\n", g_FilterMode); exit(EXIT_FAILURE); } } if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { dumpFilename = filename; fpsLimit = frameCheckNumber; // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification) runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode); } else { // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization initialize(argc, argv); glutMainLoop(); } exit(EXIT_SUCCESS); }
// Initialization code to find the best CUDA Device inline int findCudaDevice(int argc, const char **argv) { cudaDeviceProp deviceProp; int devID = 0; // If the command-line has a device number specified, use it if (checkCmdLineFlag(argc, argv, "device")) { devID = getCmdLineArgumentInt(argc, argv, "device="); if (devID < 0) { printf("Invalid command line parameter\n "); exit(EXIT_FAILURE); } else { devID = gpuDeviceInit(devID); if (devID < 0) { printf("exiting...\n"); exit(EXIT_FAILURE); } } } else { // Otherwise pick the device with highest Gflops/s devID = gpuGetMaxGflopsDeviceId(); checkCudaErrors(cudaSetDevice(devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); } return devID; }
////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { bool bTestResults = true; if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printf("\n> Command line options\n"); showHelp(); return 0; } printf("Run \"nbody -benchmark [-numbodies=<numBodies>]\" to measure perfomance.\n"); showHelp(); bFullscreen = (checkCmdLineFlag(argc, (const char **) argv, "fullscreen") != 0); if (bFullscreen) { bShowSliders = false; } benchmark = (checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0); compareToCPU = ((checkCmdLineFlag(argc, (const char **) argv, "compare") != 0) || (checkCmdLineFlag(argc, (const char **) argv, "qatest") != 0)); QATest = (checkCmdLineFlag(argc, (const char **) argv, "qatest") != 0); useHostMem = (checkCmdLineFlag(argc, (const char **) argv, "hostmem") != 0); fp64 = (checkCmdLineFlag(argc, (const char **) argv, "fp64") != 0); flopsPerInteraction = fp64 ? 30 : 20; useCpu = (checkCmdLineFlag(argc, (const char **) argv, "cpu") != 0); if (checkCmdLineFlag(argc, (const char **)argv, "numdevices")) { numDevsRequested = getCmdLineArgumentInt(argc, (const char **) argv, "numdevices"); if (numDevsRequested < 1) { printf("Error: \"number of CUDA devices\" specified %d is invalid. Value should be >= 1\n", numDevsRequested); exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE); } else { printf("number of CUDA devices = %d\n", numDevsRequested); } } // for multi-device we currently require using host memory -- the devices share // data via the host if (numDevsRequested > 1) { useHostMem = true; } int numDevsAvailable = 0; bool customGPU = false; cudaGetDeviceCount(&numDevsAvailable); if (numDevsAvailable < numDevsRequested) { printf("Error: only %d Devices available, %d requested. Exiting.\n", numDevsAvailable, numDevsRequested); exit(EXIT_SUCCESS); } printf("> %s mode\n", bFullscreen ? 
"Fullscreen" : "Windowed"); printf("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video"); printf("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); printf("> %d Devices used for simulation\n", numDevsRequested); int devID; cudaDeviceProp props; if (useCpu) { useHostMem = true; compareToCPU = false; bSupportDouble = true; #ifdef OPENMP printf("> Simulation with CPU using OpenMP\n"); #else printf("> Simulation with CPU\n"); #endif } // Initialize GL and GLUT if necessary if (!benchmark && !compareToCPU) { initGL(&argc, argv); initParameters(); } if(!useCpu) { // Now choose the CUDA Device // Either without GL interop: if (benchmark || compareToCPU || useHostMem) { // Note if we are using host memory for the body system, we // don't use CUDA-GL interop. if (checkCmdLineFlag(argc, (const char **)argv, "device")) { customGPU = true; } devID = findCudaDevice(argc, (const char **)argv); } else // or with GL interop: { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { customGPU = true; } devID = findCudaGLDevice(argc, (const char **)argv); } checkCudaErrors(cudaGetDevice(&devID)); checkCudaErrors(cudaGetDeviceProperties(&props, devID)); bSupportDouble = true; #if CUDART_VERSION < 4000 if (numDevsRequested > 1) { printf("MultiGPU n-body requires CUDA 4.0 or later\n"); cudaDeviceReset(); exit(EXIT_SUCCESS); } #endif // Initialize devices if (numDevsRequested > 1 && customGPU) { printf("You can't use --numdevices and --device at the same time.\n"); exit(EXIT_SUCCESS); } if (customGPU) { cudaDeviceProp props; checkCudaErrors(cudaGetDeviceProperties(&props, devID)); printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); } else { for (int i = 0; i < numDevsRequested; i++) { cudaDeviceProp props; checkCudaErrors(cudaGetDeviceProperties(&props, i)); printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); if (useHostMem) { #if CUDART_VERSION >= 2020 if 
(!props.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n", devID); cudaDeviceReset(); exit(EXIT_SUCCESS); } if (numDevsRequested > 1) { checkCudaErrors(cudaSetDevice(i)); } checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); #else fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n"); cudaDeviceReset(); exit(EXIT_SUCCESS); #endif } } // CC 1.2 and earlier do not support double precision if (props.major*10 + props.minor <= 12) { bSupportDouble = false; } } //if(numDevsRequested > 1) // checkCudaErrors(cudaSetDevice(devID)); if (fp64 && !bSupportDouble) { fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n"); cudaDeviceReset(); exit(EXIT_SUCCESS); } } numIterations = 0; p = 0; q = 1; if (checkCmdLineFlag(argc, (const char **)argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i"); } if (checkCmdLineFlag(argc, (const char **) argv, "p")) { p = getCmdLineArgumentInt(argc, (const char **)argv, "p"); } if (checkCmdLineFlag(argc, (const char **) argv, "q")) { q = getCmdLineArgumentInt(argc, (const char **)argv, "q"); } if (p == 0) // p not set on command line { p = 256; if (q * p > 256) { p = 256 / q; printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256); } } // default number of bodies is #SMs * 4 * CTA size if (useCpu) #ifdef OPENMP numBodies = 8192; #else numBodies = 4096; #endif else if (numDevsRequested == 1)
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { printf("%s Starting...\n\n", sSDKsample); numParticles = NUM_PARTICLES; maxNumParticles = MAX_NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; printf("Surely I can get this far\n"); if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); fpsLimit = frameCheckNumber; numIterations = 1; } } //******************************************************* // RMK Hard code for cylindrical coords (y=theta=1) // DomainSize //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/RVert.txt"; char Zfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/ZVert.txt"; char Rfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/ZVert.txt"; //char Rfile[] = 
"/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/RVert.txt"; numVelNodes.x = filecount(Rfile); //-1; numVelNodes.z = filecount(Zfile); //-1; numVelNodes.y = 1; numCells.x = 80; //47; //24; //29; numCells.y = 1; numCells.z = 160; //188; //95; //88; numParticles = numCells.x*numCells.z*20; //avgnumparticles = 40 srand( time( NULL ) ); //numParticles = numCells.x*numCells.z*40; printf("vel grid: %d x %d x %d = %d cells\n", numVelNodes.x, numVelNodes.y, numVelNodes.z, numVelNodes.x*numVelNodes.y*numVelNodes.z); printf(" grid: %d x %d x %d = %d cells\n", numCells.x, numCells.y, numCells.z, numCells.x*numCells.y*numCells.z); //printf("vel grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0; if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } if (g_refFile) { cudaInit(argc, argv); } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -file=<*.bin>\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } initGL(&argc, argv); cudaGLInit(argc, argv); } // Moved code snippet to CellSystem //initCellSystem(gridSize); // now moved to particlesystem printf("Begin initialization\n"); //initParticleSystem(numParticles, gridSize, g_refFile==NULL); initParticleSystem(maxNumParticles, numParticles, numVelNodes, numCells, 
g_refFile==NULL); //printf("Finished with initParticleSystem, %d\n",g_refFile==NULL); //cin.ignore(); initParams(); printf("Finished with initialization\n"); if (!g_refFile) { initMenus(); } if (benchmark || g_refFile) { if (numIterations <= 0) { numIterations = 300; } runBenchmark(numIterations, argv[0]); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); atexit(cleanup); glutMainLoop(); } if (psystem) { delete psystem; } cudaDeviceReset(); exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
void parseCommandLineArguments(int argc, char *argv[]) { char video_file[256]; printf("Command Line Arguments:\n"); for (int n=0; n < argc; n++) { printf("argv[%d] = %s\n", n, argv[n]); } if (checkCmdLineFlag(argc, (const char **)argv, "help")) { displayHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "decodecuda")) { g_eVideoCreateFlags = cudaVideoCreate_PreferCUDA; } if (checkCmdLineFlag(argc, (const char **)argv, "decodedxva")) { g_eVideoCreateFlags = cudaVideoCreate_PreferDXVA; } if (checkCmdLineFlag(argc, (const char **)argv, "decodecuvid")) { g_eVideoCreateFlags = cudaVideoCreate_PreferCUVID; } if (checkCmdLineFlag(argc, (const char **)argv, "vsync")) { g_bUseVsync = true; } if (checkCmdLineFlag(argc, (const char **)argv, "novsync")) { g_bUseVsync = false; } if (checkCmdLineFlag(argc, (const char **)argv, "repeatframe")) { g_bFrameRepeat = true; } if (checkCmdLineFlag(argc, (const char **)argv, "framestep")) { g_bFrameStep = true; g_bUseDisplay = true; g_bUseInterop = true; g_fpsLimit = 1; } if (checkCmdLineFlag(argc, (const char **)argv, "updateall")) { g_bUpdateAll = true; } if (checkCmdLineFlag(argc, (const char **)argv, "displayvideo")) { g_bUseDisplay = true; g_bUseInterop = true; } if (checkCmdLineFlag(argc, (const char **)argv, "nointerop")) { g_bUseInterop = false; } if (checkCmdLineFlag(argc, (const char **)argv, "readback")) { g_bReadback = true; } if (checkCmdLineFlag(argc, (const char **)argv, "device")) { g_DeviceID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); g_bUseDisplay = true; g_bUseInterop = true; } if (g_bUseDisplay == false) { g_bQAReadback = true; g_bUseInterop = false; } if (g_bLoop == false) { g_bAutoQuit = true; } // Search all command file parameters for video files with extensions: // mp4, avc, mkv, 264, h264. 
vc1, wmv, mp2, mpeg2, mpg char *file_ext = NULL; for (int i=1; i < argc; i++) { if (getFileExtension(argv[i], &file_ext) > 0) { strcpy(video_file, argv[i]); break; } } // We load the default video file for the SDK sample if (file_ext == NULL) { strcpy(video_file, sdkFindFilePath(VIDEO_SOURCE_FILE, argv[0])); } // Now verify the video file is legit FILE *fp = fopen(video_file, "r"); if (video_file == NULL && fp == NULL) { printf("[%s]: unable to find file: <%s>\nExiting...\n", sAppFilename, VIDEO_SOURCE_FILE); exit(EXIT_FAILURE); } if (fp) { fclose(fp); } // default video file loaded by this sample sFileName = video_file; // store the current path so we can reinit the CUDA context strcpy(exec_path, argv[0]); printf("[%s]: input file: <%s>\n", sAppFilename, video_file); }
int main(int argc, char **argv) { char *dump_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **) &dump_file); int kernel = 1; if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); } runAutoTest(argc, argv, dump_file, kernel); } else { printf("[%s]\n", sSDKsample); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf("imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); exit(EXIT_FAILURE); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. 
initGL(&argc, argv); cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()); checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); initOpenGLBuffers(); } printf("Starting GLUT main loop...\n"); printf("Press [1] to view noisy image\n"); printf("Press [2] to view image restored with knn filter\n"); printf("Press [3] to view image restored with nlm filter\n"); printf("Press [4] to view image restored with modified nlm filter\n"); printf("Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is active\n"); printf("Press [f] to print frame rate\n"); printf("Press [?] to print Noise and Lerp Ct's\n"); printf("Press [q] to exit\n"); sdkCreateTimer(&timer); sdkStartTimer(&timer); glutMainLoop(); }
bool runTest(int argc, char **argv, ReduceType datatype) { int size = 1<<24; // number of elements to reduce int maxThreads = 256; // number of threads per block int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; if (checkCmdLineFlag(argc, (const char **) argv, "n")) { size = getCmdLineArgumentInt(argc, (const char **) argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "threads")) { maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **) argv, "kernel")) { whichKernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel"); } if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks")) { maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks"); } printf("%d elements\n", size); printf("%d threads (max)\n", maxThreads); cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal"); if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh")) { cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh"); } bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo"); if (runShmoo) { shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype); } else { // create random input data on CPU unsigned int bytes = size * sizeof(T); T *h_idata = (T *) malloc(bytes); for (int i=0; i<size; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) { h_idata[i] = (T)(rand() & 0xFF); } else { h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } } int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) { cpuFinalThreshold = 1; } // allocate mem for the result on host side T *h_odata = (T *) malloc(numBlocks*sizeof(T)); printf("%d blocks\n\n", numBlocks); // allocate device memory and data T *d_idata = NULL; T *d_odata = NULL; checkCudaErrors(cudaMalloc((void **) 
&d_idata, bytes)); checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T))); // copy data directly to device memory checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice)); // warm-up reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata); int testIterations = 100; StopWatchInterface *timer = 0; sdkCreateTimer(&timer); T gpu_result = 0; gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, testIterations, cpuFinalReduction, cpuFinalThreshold, timer, h_odata, d_idata, d_odata); double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3; printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); // compute reference solution T cpu_result = reduceCPU<T>(h_idata, size); int precision = 0; double threshold = 0; double diff = 0; if (datatype == REDUCE_INT) { printf("\nGPU result = %d\n", (int)gpu_result); printf("CPU result = %d\n\n", (int)cpu_result); } else { if (datatype == REDUCE_FLOAT) { precision = 8; threshold = 1e-8 * size; } else { precision = 12; threshold = 1e-12 * size; } printf("\nGPU result = %.*f\n", precision, (double)gpu_result); printf("CPU result = %.*f\n\n", precision, (double)cpu_result); diff = fabs((double)gpu_result - (double)cpu_result); } // cleanup sdkDeleteTimer(&timer); free(h_idata); free(h_odata); checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(d_odata)); if (datatype == REDUCE_INT) { return (gpu_result == cpu_result); } else { return (diff < threshold); } } return true; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); fpsLimit = frameCheckNumber; numIterations = 1; } } gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); printf("particles: %d\n", numParticles); bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0; if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } if (g_refFile) { cudaInit(argc, argv); } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -file=<*.bin>\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } initGL(&argc, argv); cudaGLInit(argc, argv); } initParticleSystem(numParticles, gridSize, g_refFile==NULL); initParams(); if (!g_refFile) { initMenus(); } if (benchmark || g_refFile) { if (numIterations <= 0) { numIterations = 300; } 
runBenchmark(numIterations, argv[0]); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); glutCloseFunc(cleanup); glutMainLoop(); } if (psystem) { delete psystem; } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
void initData(int argc, char **argv) { // parse arguments char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { volumeFilename = filename; } int n; if (checkCmdLineFlag(argc, (const char **) argv, "size")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "size"); volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (checkCmdLineFlag(argc, (const char **) argv, "xsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "xsize"); volumeSize.width = n; } if (checkCmdLineFlag(argc, (const char **) argv, "ysize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "ysize"); volumeSize.height = n; } if (checkCmdLineFlag(argc, (const char **) argv, "zsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "zsize"); volumeSize.depth = n; } char *path = sdkFindFilePath(volumeFilename, argv[0]); if (path == 0) { printf("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); FilterKernel_init(); Volume_init(&volumeOriginal,volumeSize, h_volume, 0); free(h_volume); Volume_init(&volumeFilter0, volumeSize, NULL, 1); Volume_init(&volumeFilter1, volumeSize, NULL, 1); VolumeRender_init(); VolumeRender_setPreIntegrated(preIntegrated); VolumeRender_setVolume(&volumeOriginal); sdkCreateTimer(&timer); sdkCreateTimer(&animationTimer); sdkStartTimer(&animationTimer); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // start logs int devID; char *ref_file = NULL; printf("%s Starting...\n\n", argv[0]); #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "radius")) { filter_radius = getCmdLineArgumentInt(argc, (const char **) argv, "radius"); } if (checkCmdLineFlag(argc, (const char **)argv, "passes")) { iterations = getCmdLineArgumentInt(argc, (const char **)argv, "passes"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); } } // load image to process loadImageData(argc, argv); if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { // This is a separate mode of the sample, where we are benchmark the kernels for performance devID = findCudaDevice(argc, (const char **)argv); // Running CUDA kernels (bilateralfilter) in Benchmarking mode g_TotalErrors += runBenchmark(argc, argv); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else if (checkCmdLineFlag(argc, (const char **)argv, "radius") || checkCmdLineFlag(argc, (const char **)argv, "passes")) { // This overrides the default mode. 
Users can specify the radius used by the filter kernel devID = findCudaDevice(argc, (const char **)argv); g_TotalErrors += runSingleTest(ref_file, argv[0]); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { // Default mode running with OpenGL visualization and in automatic mode // the output automatically changes animation printf("\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(argc, (char **)argv); int dev = findCapableDevice(argc, argv); if (dev != -1) { dev = gpuGLDeviceInit(argc, (const char **)argv); if (dev == -1) { exit(EXIT_FAILURE); } } else { // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. 
Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } // Now we can create a CUDA context and bind it to the OpenGL context initCuda(); initGLResources(); // sets the callback function so it will call cleanup upon exit #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif printf("Running Standard Demonstration with GLUT loop...\n\n"); printf("Press '+' and '-' to change filter width\n" "Press ']' and '[' to change number of iterations\n" "Press 'e' and 'E' to change Euclidean delta\n" "Press 'g' and 'G' to changle Gaussian delta\n" "Press 'a' or 'A' to change Animation mode ON/OFF\n\n"); // Main OpenGL loop that will run visualization for every vsync glutMainLoop(); } }
////////////////////////////////////////////////////////////////////////////////
// Program main
//
// Parses the command line, loads the input image and then runs in one of
// three modes:
//   * "benchmark"            - times the kernels and exits,
//   * "radius" or "passes"   - runs a single test against the optional
//                              reference file given with "file" and exits,
//   * otherwise              - interactive OpenGL visualization (GLUT loop).
//
// The process exit code is EXIT_SUCCESS when g_TotalErrors is zero and
// EXIT_FAILURE otherwise (the "help" path and the "no capable device" path
// always exit with EXIT_SUCCESS).
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    char *ref_file = NULL;

    pArgc = &argc;
    pArgv = argv;

    // start logs
    printf("%s Starting...\n\n", argv[0]);

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printHelp();
        exit(EXIT_SUCCESS);
    }

    // Optional overrides for the filter configuration
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "threads"))
        {
            nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "radius"))
        {
            filter_radius = getCmdLineArgumentInt(argc, (const char **) argv, "radius");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "passes"))
        {
            iterations = getCmdLineArgumentInt(argc, (const char **) argv, "passes");
        }

        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file);
        }
    }

    // load image to process
    loadImageData(argc, argv);

    if (checkCmdLineFlag(argc, (const char **)argv, "benchmark"))
    {
        // This is a separate mode of the sample, where we benchmark the kernels for performance.
        // The returned device ID is not needed here; the call selects the device.
        findCudaDevice(argc, (const char **)argv);

        // Running CUDA kernels (boxfilter) in Benchmarking mode
        g_TotalErrors += runBenchmark();

        // Reset the device before exiting so that all state is cleaned up and
        // profiler data is flushed (the interactive path below already does this).
        cudaDeviceReset();
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
    else if (checkCmdLineFlag(argc, (const char **)argv, "radius") ||
             checkCmdLineFlag(argc, (const char **)argv, "passes"))
    {
        // This overrides the default mode. Users can specify the radius used by the filter kernel
        findCudaDevice(argc, (const char **)argv);
        g_TotalErrors += runSingleTest(ref_file, argv[0]);

        // Same reasoning as in benchmark mode: flush and clean up before exit.
        cudaDeviceReset();
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
    else
    {
        // Default mode running with OpenGL visualization and in automatic mode
        // the output automatically changes animation
        printf("\n");

        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(&argc, argv);
        int dev = findCapableDevice(argc, argv);

        if (dev != -1)
        {
            // Do not continue with an unusable interop device: every later
            // CUDA/GL call would fail with a far less helpful message.
            cudaError_t glStatus = cudaGLSetGLDevice(dev);

            if (glStatus != cudaSuccess)
            {
                printf("cudaGLSetGLDevice(%d) failed: %s\n", dev, cudaGetErrorString(glStatus));
                exit(EXIT_FAILURE);
            }
        }
        else
        {
            cudaDeviceReset();
            exit(EXIT_SUCCESS);
        }

        // Now we can create a CUDA context and bind it to the OpenGL context
        initCuda();
        initGLResources();

        // sets the callback function so it will call cleanup upon exit
        atexit(cleanup);

        printf("Running Standard Demonstration with GLUT loop...\n\n");
        printf("Press '+' and '-' to change filter width\n"
               "Press ']' and '[' to change number of iterations\n"
               "Press 'a' or 'A' to change animation ON/OFF\n\n");

        // Main OpenGL loop that will run visualization for every vsync
        glutMainLoop();

        cudaDeviceReset();
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }
}
/**
 * Selects a CUDA device, creates the driver-API context, loads the
 * NV12-to-ARGB PTX module and creates the video decoding resources.
 *
 * @param argc        command-line argument count
 * @param argv        command-line arguments (the "device" flag is honoured)
 * @param bUseInterop non-zero to create the context with D3D9 interop and a
 *                    D3D9 surface; zero to allocate two plain device frames
 * @param bTCC        non-zero when the device runs without a display driver
 *                    (only consulted when "device" is given on the command line)
 * @return S_OK when the module, the decoder and an output surface/frame all
 *         exist; E_FAIL when the PTX module cannot be loaded, the decoder was
 *         not created, or the context could not be popped.
 *
 * The process exits (EXIT_SUCCESS) when no CUDA capable device is found.
 */
HRESULT initCudaResources(int argc, char **argv, int bUseInterop, int bTCC)
{
    CUdevice cuda_device;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        // The device is chosen by one of the two finder helpers below; both
        // receive the full command line, so the flag value is not read here.
        // NOTE(review): the non-graphics finder is used when interop is enabled
        // on a non-TCC device and the graphics finder otherwise. This looks
        // inverted relative to the parameter names - confirm before changing.
        if (bUseInterop && !bTCC)
        {
            cuda_device = findCudaDeviceDRV(argc, (const char **)argv);
        }
        else
        {
            cuda_device = findCudaGLDeviceDRV(argc, (const char **)argv);
        }

        if (cuda_device < 0)
        {
            printf("No CUDA Capable devices found, exiting...\n");
            exit(EXIT_SUCCESS);
        }

        checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
    }
    else
    {
        // If we want to use Graphics Interop, then choose the GPU that is capable
        if (bUseInterop)
        {
            cuda_device = gpuGetMaxGflopsGLDeviceIdDRV();
        }
        else
        {
            cuda_device = gpuGetMaxGflopsDeviceIdDRV();
        }

        checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
    }

    // get compute capabilities and the devicename
    int major, minor;
    size_t totalGlobalMem;
    char deviceName[256];
    checkCudaErrors(cuDeviceComputeCapability(&major, &minor, g_oDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, 256, g_oDevice));
    printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor);

    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, g_oDevice));
    printf(" Total amount of global memory: %4.4f MB\n", (float)totalGlobalMem/(1024*1024));

    // Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
    // (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
    if (bUseInterop)
    {
        checkCudaErrors(cuD3D9CtxCreate(&g_oContext, &g_oDevice, CU_CTX_BLOCKING_SYNC, g_pD3DDevice));
    }
    else
    {
        checkCudaErrors(cuCtxCreate(&g_oContext, CU_CTX_BLOCKING_SYNC, g_oDevice));
    }

    try
    {
        // Load the PTX matching the pointer width of this build (32-bit or 64-bit)
        if (sizeof(void *) == 4)
        {
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", exec_path, 2, 2, 2);
        }
        else
        {
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", exec_path, 2, 2, 2);
        }
    }
    catch (char const *p_file)
    {
        // If the CUmoduleManager constructor fails to load the PTX file, it will throw an exception
        printf("\n>> CUmoduleManager::Exception! %s not found!\n", p_file);
        printf(">> Please rebuild NV12ToARGB_drvapi.cu or re-install this sample.\n");
        return E_FAIL;
    }

    g_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi", &gfpNV12toARGB);
    g_pCudaModule->GetCudaFunction("Passthru_drvapi", &gfpPassthru);

    // Now we create the CUDA resources and the CUDA decoder context
    initCudaVideo();

    // The decoder is dereferenced right below, so validate it first instead of
    // only testing it in the final return expression.
    if (!g_pVideoDecoder)
    {
        printf("initCudaResources: video decoder was not created\n");
        return E_FAIL;
    }

    if (bUseInterop)
    {
        initD3D9Surface(g_pVideoDecoder->targetWidth(),
                        g_pVideoDecoder->targetHeight());
    }
    else
    {
        // Two device frames of width * height * 2 bytes each
        for (int frame = 0; frame < 2; ++frame)
        {
            checkCudaErrors(cuMemAlloc(&g_pInteropFrame[frame],
                                       g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2));
        }
    }

    // Detach the context from this thread
    CUcontext cuCurrent = NULL;
    CUresult result = cuCtxPopCurrent(&cuCurrent);

    if (result != CUDA_SUCCESS)
    {
        printf("cuCtxPopCurrent: %d\n", result);
        assert(0);
        // assert() compiles away under NDEBUG; still report the failure.
        return E_FAIL;
    }

    return ((g_pCudaModule && g_pVideoDecoder && (g_pImageDX || g_pInteropFrame[0])) ? S_OK : E_FAIL);
}
///////////////////////////////////////////////////////////////////////////////
// Main program
//
// Generates "count" uniform random numbers with the MTGP32 generator on the
// GPU and on the host using the same seed, compares the two sequences, then
// times repeated generation on the GPU.
//
// Command-line flags: "device", "count" (must be positive), "seed".
// The QA result is QA_PASSED when the value returned by compareResults is
// below 1e-6, QA_FAILED otherwise (also on invalid input or host allocation
// failure).
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // Start logs
    shrQAStart(argc, argv);

    // initialize the GPU, either identified by --device
    // or by picking the device with highest flop rate.
    findCudaDevice(argc, (const char **)argv);

    // parsing the number of random numbers to generate
    int rand_n = DEFAULT_RAND_N;

    if (checkCmdLineFlag(argc, (const char **) argv, "count"))
    {
        rand_n = getCmdLineArgumentInt(argc, (const char **) argv, "count");
    }

    // A zero or negative count would turn into a zero or huge allocation size
    // once multiplied by sizeof(float), so reject it up front.
    if (rand_n <= 0)
    {
        printf("Invalid sample count %i: count must be a positive integer\n", rand_n);
        shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
        exit(EXIT_FAILURE);
    }

    printf("Allocating data for %i samples...\n", rand_n);

    // parsing the seed
    int seed = DEFAULT_SEED;

    if (checkCmdLineFlag(argc, (const char **) argv, "seed"))
    {
        seed = getCmdLineArgumentInt(argc, (const char **) argv, "seed");
    }

    printf("Seeding with %i ...\n", seed);

    const size_t randBytes = rand_n * sizeof(float);

    // Host buffers for both sequences; verified before any work is queued.
    float *h_RandGPU = (float *)malloc(randBytes);
    float *h_RandCPU = (float *)malloc(randBytes);

    if (h_RandGPU == NULL || h_RandCPU == NULL)
    {
        printf("Host memory allocation of %i samples failed\n", rand_n);
        free(h_RandGPU);
        free(h_RandCPU);
        cudaDeviceReset();
        shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
        exit(EXIT_FAILURE);
    }

    float *d_Rand;
    checkCudaErrors(cudaMalloc((void **)&d_Rand, randBytes));

    curandGenerator_t prngGPU;
    checkCurandErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32));
    checkCurandErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed));

    curandGenerator_t prngCPU;
    checkCurandErrors(curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32));
    checkCurandErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed));

    //
    // Example 1: Compare random numbers generated on GPU and CPU
    printf("Generating random numbers on GPU...\n\n");
    checkCurandErrors(curandGenerateUniform(prngGPU, (float *) d_Rand, rand_n));

    printf("\nReading back the results...\n");
    checkCudaErrors(cudaMemcpy(h_RandGPU, d_Rand, randBytes, cudaMemcpyDeviceToHost));

    printf("Generating random numbers on CPU...\n\n");
    checkCurandErrors(curandGenerateUniform(prngCPU, (float *) h_RandCPU, rand_n));

    printf("Comparing CPU/GPU random numbers...\n\n");
    float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU);

    //
    // Example 2: Timing of random number generation on GPU
    const int numIterations = 10;
    int i;
    StopWatchInterface *hTimer;

    // Make sure no earlier work is still running before the timer starts
    checkCudaErrors(cudaDeviceSynchronize());
    sdkCreateTimer(&hTimer);
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (i = 0; i < numIterations; i++)
    {
        checkCurandErrors(curandGenerateUniform(prngGPU, (float *) d_Rand, rand_n));
    }

    // Wait for the queued generation calls to finish before stopping the timer
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);

    // Timer value is scaled by 1e-3 and averaged over the iterations
    double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations;

    // rand_n is known to be positive here; the cast matches the %u conversion.
    printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n",
           1.0e-9 * rand_n / gpuTime, gpuTime, (unsigned int)rand_n);

    printf("Shutting down...\n");

    checkCurandErrors(curandDestroyGenerator(prngGPU));
    checkCurandErrors(curandDestroyGenerator(prngCPU));
    checkCudaErrors(cudaFree(d_Rand));
    sdkDeleteTimer(&hTimer);
    free(h_RandGPU);
    free(h_RandCPU);

    cudaDeviceReset();
    shrQAFinishExit(argc, (const char **)argv, (L1norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
/**
 * Picks the CUDA device and derives the matrix dimensions for the run.
 *
 * @param argc          command-line argument count
 * @param argv          command-line arguments ("device" and "sizemult" are read)
 * @param devID         [out] ID of the device in use, as reported by cudaGetDevice
 * @param iSizeMultiple [in,out] size multiplier; replaced by the "sizemult"
 *                      value when that flag is present, then clamped to [1, 10]
 * @param matrix_size   [out] widths and heights of matrices A, B and C
 *
 * Any failing CUDA runtime call prints its error code and terminates the
 * process with EXIT_FAILURE.
 */
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
    const char **args = (const char **)argv;
    cudaError_t status;

    // Device 0 unless the command line names another one
    devID = 0;

    if (checkCmdLineFlag(argc, args, "device"))
    {
        devID = getCmdLineArgumentInt(argc, args, "device");
        status = cudaSetDevice(devID);

        if (status != cudaSuccess)
        {
            printf("cudaSetDevice returned error code %d, line(%d)\n", status, __LINE__);
            exit(EXIT_FAILURE);
        }
    }

    // Ask the runtime which device is actually current
    status = cudaGetDevice(&devID);

    if (status != cudaSuccess)
    {
        printf("cudaGetDevice returned error code %d, line(%d)\n", status, __LINE__);
        exit(EXIT_FAILURE);
    }

    if (checkCmdLineFlag(argc, args, "sizemult"))
    {
        iSizeMultiple = getCmdLineArgumentInt(argc, args, "sizemult");
    }

    // Clamp the multiplier to the range [1, 10]
    iSizeMultiple = max(min(iSizeMultiple, 10), 1);

    cudaDeviceProp props;
    status = cudaGetDeviceProperties(&props, devID);

    if (status != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", status, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, props.name, props.major, props.minor);

    // 16 for compute capability major < 2, 32 for everything newer
    int tile = 32;

    if (props.major < 2)
    {
        tile = 16;
    }

    // Every matrix gets width = 2 * unit and height = 4 * unit.
    // NOTE(review): with identical shapes, A's width differs from B's height -
    // confirm this matches how the multiply routine interprets these fields.
    const int unit = tile * iSizeMultiple;

    matrix_size.uiWA = 2 * unit;
    matrix_size.uiWB = 2 * unit;
    matrix_size.uiWC = 2 * unit;

    matrix_size.uiHA = 4 * unit;
    matrix_size.uiHB = 4 * unit;
    matrix_size.uiHC = 4 * unit;

    printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
           matrix_size.uiWA, matrix_size.uiHA,
           matrix_size.uiWB, matrix_size.uiHB,
           matrix_size.uiWC, matrix_size.uiHC);
}
/**
 * Automated test: runs the Sobel filter once in the mode selected on the
 * command line, reads the result back, writes it to a PGM file and compares
 * that file with a reference image.
 *
 * Command-line flags:
 *   "mode" - 0 (original image), 1 (texture filter), 2 (shared-memory filter);
 *            any other value terminates with EXIT_FAILURE
 *   "file" - name of the reference PGM to compare against
 *
 * A missing reference file counts as a test failure. This function never
 * returns: it exits with EXIT_SUCCESS when the comparison passed and
 * EXIT_FAILURE otherwise.
 */
void runAutoTest(int argc, char *argv[])
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);

    // The returned device ID is not needed; the call selects the device.
    findCudaDevice(argc, (const char **)argv);

    loadDefaultImage(argv[0]);

    const size_t imageBytes = imWidth*imHeight*sizeof(Pixel);

    Pixel *d_result;
    checkCudaErrors(cudaMalloc((void **)&d_result, imageBytes));

    char *ref_file = NULL;
    char dump_file[256];

    int mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode");
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);

    switch (mode)
    {
        case 0:
            g_SobelDisplayMode = SOBELDISPLAY_IMAGE;
            sprintf(dump_file, "lena_orig.pgm");
            break;

        case 1:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX;
            sprintf(dump_file, "lena_tex.pgm");
            break;

        case 2:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED;
            sprintf(dump_file, "lena_shared.pgm");
            break;

        default:
            printf("Invalid Filter Mode File\n");
            exit(EXIT_FAILURE);
            break;
    }

    printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);
    sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale);

    // Launch-configuration errors are only visible through cudaGetLastError();
    // execution errors surface at the synchronization that follows.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    unsigned char *h_result = (unsigned char *)malloc(imageBytes);

    if (h_result == NULL)
    {
        printf("Unable to allocate host memory for the filter result\n");
        checkCudaErrors(cudaFree(d_result));
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaMemcpy(h_result, d_result, imageBytes, cudaMemcpyDeviceToHost));
    sdkSavePGM(dump_file, h_result, imWidth, imHeight);

    // Without "file" on the command line ref_file stays NULL, and the lookup
    // result is checked too; neither may reach the comparison as NULL.
    char *ref_path = (ref_file != NULL) ? sdkFindFilePath(ref_file, argv[0]) : NULL;

    if (ref_path == NULL)
    {
        printf("Reference file <%s> could not be found\n", ref_file ? ref_file : "(none given)");
        g_TotalErrors++;
    }
    else if (!sdkComparePGM(dump_file, ref_path, MAX_EPSILON_ERROR, 0.15f, false))
    {
        g_TotalErrors++;
    }

    checkCudaErrors(cudaFree(d_result));
    free(h_result);

    if (g_TotalErrors != 0)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed!\n");
    exit(EXIT_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
//
// Loads the input image, reads the optional "threads" and "sigma" settings
// and then runs in one of three modes:
//   * "file" given            - automated test against that reference file;
//                               exit code reflects the test result,
//   * "benchmark" given, or a device whose name starts with "Tesla"
//                             - benchmark without OpenGL,
//   * otherwise               - interactive OpenGL display (GLUT loop).
// The image name can be overridden with the "image" flag.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;
    char *ref_file = NULL;

#if defined(__linux__)
    // Third argument 0: an existing DISPLAY value is left untouched
    setenv ("DISPLAY", ":0", 0);
#endif

    printf("%s Starting...\n\n", sSDKsample);

    printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n");

    // A reference file switches the sample to automated testing
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
        {
            getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
            fpsLimit = frameCheckNumber;
        }
    }

    // Get the path of the filename
    char *filename = NULL;

    if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename))
    {
        image_filename = filename;
    }

    // load image
    char *image_path = sdkFindFilePath(image_filename, argv[0]);

    if (image_path == NULL)
    {
        fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height);

    if (!h_img)
    {
        printf("Error unable to load PPM file: '%s'\n", image_path);
        exit(EXIT_FAILURE);
    }

    printf("Loaded '%s', %d x %d pixels\n", image_path, width, height);

    if (checkCmdLineFlag(argc, (const char **)argv, "threads"))
    {
        nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "sigma"))
    {
        sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma");
    }

    runBenchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark");

    // Only look at the device name when both queries succeeded; otherwise
    // prop would be read uninitialized.
    int device = 0;
    struct cudaDeviceProp prop;

    if (cudaGetDevice(&device) == cudaSuccess &&
        cudaGetDeviceProperties(&prop, device) == cudaSuccess)
    {
        if (!strncmp("Tesla", prop.name, 5))
        {
            printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n");
            runBenchmark = true;
        }
    }
    else
    {
        printf("Warning: unable to query the current CUDA device, skipping Tesla detection\n");
    }

    // Benchmark or AutoTest mode detected, no OpenGL
    if (runBenchmark || ref_file != NULL)
    {
        findCudaDevice(argc, (const char **)argv);
    }
    else
    {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL(&argc, argv);
        findCudaGLDevice(argc, (const char **)argv);
    }

    initCudaBuffers();

    if (ref_file)
    {
        printf("(Automated Testing)\n");
        bool testPassed = runSingleTest(ref_file, argv[0]);

        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE);
    }

    if (runBenchmark)
    {
        printf("(Run Benchmark)\n");
        benchmark(100);

        cleanup();

        // Same as above: reset so that state is cleaned up and profile data is flushed
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    initGLBuffers();
    glutMainLoop();

    exit(EXIT_SUCCESS);
}