//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { numParticles = 1024; uint gridDim = 64; numIterations = 1; cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int *) &numParticles); cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", (int *) &gridDim); gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); bool benchmark = !cutCheckCmdLineFlag(argc, (const char**) argv, "noqatest") != 0; cutGetCmdLineArgumenti( argc, (const char**) argv, "i", &numIterations); cudaInit(argc, argv); glutInit(&argc, argv); glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); glutInitWindowSize(640, 480); glutCreateWindow("CUDA particles"); initGL(); init(numParticles, gridSize); initParams(); initMenus(); if (benchmark) { if (numIterations <= 0) numIterations = 300; runBenchmark(numIterations); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); glutMainLoop(); } if (psystem) delete psystem; cudaThreadExit(); return 0; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //start logs shrSetLogFileName ("volumeRender.txt"); shrLog("%s Starting...\n\n", argv[0]); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; fpsLimit = frameCheckNumber; } if (g_bQAReadback) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } /* int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { shrLog("This sample needs a card capable of OpenGL and display.\n"); shrLog("Please choose a different device with the -device=x argument.\n"); cutilExit(argc, argv); } */ } // parse arguments char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) { volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) { volumeSize.width = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) { volumeSize.height = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) { volumeSize.depth = n; } // load volume data char* path = shrFindFilePath(volumeFilename, argv[0]); if (path == 0) { shrLog("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); free(h_volume); cutilCheckError( cutCreateTimer( &timer)); shrLog("Press '=' and '-' to change density\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); uint *d_output; cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint))); cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // Start timer 0 and process n loops on the GPU int nIter = 10; for (int i = -1; i < nIter; i++) { if( i == 0 ) { cudaThreadSynchronize(); cutStartTimer(timer); } render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaThreadSynchronize(); cutStopTimer(timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0); shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); cutilCheckMsg("Error: render_kernel() execution FAILED"); cutilSafeCall( cudaThreadSynchronize() ); cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { shrLog("\nFAILED\n\n"); } else { shrLog("\nPASSED\n\n"); } cudaFree(d_output); freeCudaBuffers(); if (g_CheckRender) { delete g_CheckRender; g_CheckRender = NULL; } } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } atexit(cleanup); glutMainLoop(); } cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // start logs shrSetLogFileName ("boxFilter.txt"); shrLog("%s Starting...\n\n", argv[0]); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads ); cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius); if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; } } // load image to process loadImageData(argc, argv); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { // Running CUDA kernel (boxFilter) without visualization (QA Testing/Verification) runAutoTest(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark")) { // Running CUDA kernels (boxfilter) in Benchmarking mode runBenchmark(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else { // Running CUDA kernels (boxFilter) with OpenGL visualization if (g_bFBODisplay) shrLog("[FBO Display] "); if (g_bOpenGLQA) shrLog("[OpenGL Readback Comparisons] "); shrLog("\n"); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n -qatest\n\n", argv[0]); printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); int dev = findCapableDevice(argc, argv); if( dev != -1 ) { cudaGLSetGLDevice( dev ); } else { cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } // Now we can create a CUDA context and bind it to the OpenGL context initCuda(); initGLResources(); if (g_bOpenGLQA) { if (g_bFBODisplay) { g_CheckRender = new CheckFBO(width, height, 4, g_FrameBufferObject); } else { g_CheckRender = new CheckBackBuffer(width, height, 4); } g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } } // sets the callback function so it will call cleanup upon exit atexit(cleanup); shrLog("Running Standard Demonstration with GLUT loop...\n\n"); shrLog("Press '+' and '-' to change filter width\n" "Press ']' and '[' to change number of iterations\n\n"); // Main OpenGL loop that will run visualization for every vsync glutMainLoop(); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { int retVal = 0; retVal = xnInit( argc, argv ); printf("[ %s ]\n", sSDKsample); if (argc > 1) { cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int *) &numParticles); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt") ) { g_bQAReadback = true; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; } } if (g_bQAReadback) { // For Automated testing, we do not use OpenGL/CUDA interop if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { cutilDeviceInit (argc, argv); } else { cudaSetDevice (cutGetMaxGflopsDeviceId() ); } g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); // This code path is used for Automated Testing initParticles(numParticles, false, false); initParams(); if (emitterOn) { runEmitter(); } SimParams ¶ms = psystem->getParams(); params.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z); psystem->step(timestep); float4 *pos = NULL, *vel = NULL; int g_TotalErrors = 0; psystem->dumpBin(&pos, &vel); g_CheckRender->dumpBin(pos, numParticles*sizeof(float4), "smokeParticles_pos.bin"); g_CheckRender->dumpBin(vel, numParticles*sizeof(float4), "smokeParticles_vel.bin"); if (!g_CheckRender->compareBin2BinFloat("smokeParticles_pos.bin", sRefBin[0], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD)) g_TotalErrors++; if (!g_CheckRender->compareBin2BinFloat("smokeParticles_vel.bin", sRefBin[1], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD)) g_TotalErrors++; delete psystem; delete g_CheckRender; printf("%s\n", (g_TotalErrors > 0) ? "FAILED" : "PASSED"); cudaThreadExit(); } else { // Normal smokeParticles rendering path // 1st initialize OpenGL context, so we can properly set the GL for CUDA. // This is needed to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { cutilGLDeviceInit (argc, argv); } else { cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); } if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } // This is the normal code path for SmokeParticles initParticles(numParticles, true, true); initParams(); initMenus(); glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutKeyboardUpFunc(keyUp); glutSpecialFunc(special); glutIdleFunc(idle); glutMainLoop(); } cutilExit(argc, argv); return retVal; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { shrQAStart(argc, argv); // start logs shrSetLogFileName ("bilateralFilter.txt"); shrLog("%s Starting...\n\n", argv[0]); // use command-line specified CUDA device, otherwise use device with highest Gflops/s cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads ); cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius); // load image to process loadImageData(argc, argv); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { // Running CUDA kernel (bilateralFilter) without visualization (QA Testing/Verification) runAutoTest(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark")) { // Running CUDA kernel (bilateralFilter) in Benchmarking Mode runBenchmark(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else { // Running CUDA kernel (bilateralFilter) in CUDA + OpenGL Visualization Mode if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( argc, argv ); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); } initCuda(); initOpenGL(); } atexit(cleanup); printf("Running Standard Demonstration with GLUT loop...\n\n"); printf("Press '+' and '-' to change number of iterations\n" "Press LEFT and RIGHT change euclidean delta\n" "Press UP and DOWN to change gaussian delta\n" "Press '1' to show original image\n" "Press '2' to show result\n\n"); glutMainLoop(); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); }
////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { bool bTestResults = true; shrQAStart(argc, argv); if( cutCheckCmdLineFlag(argc, (const char**)argv, "help") ) { showHelp(); return 0; } shrLog("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n"); shrLog("\t-fullscreen (run n-body simulation in fullscreen mode)\n"); shrLog("\t-fp64 (use double precision floating point values for simulation)\n"); shrLog("\t-numdevices=N (use first N CUDA devices for simulation)\n"); // shrLog("\t-hostmem (stores simulation data in host memory)\n"); // shrLog("\t-cpu (performs simulation on the host)\n"); shrLog("\n"); bFullscreen = (cutCheckCmdLineFlag(argc, (const char**) argv, "fullscreen") != 0); if (bFullscreen) bShowSliders = false; benchmark = (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0); compareToCPU = ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) || (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest") != 0)); QATest = (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest") != 0); useHostMem = (cutCheckCmdLineFlag(argc, (const char**) argv, "hostmem") != 0); fp64 = (cutCheckCmdLineFlag(argc, (const char**) argv, "fp64") != 0); flopsPerInteraction = fp64 ? 30 : 20; useCpu = (cutCheckCmdLineFlag(argc, (const char**) argv, "cpu") != 0); cutGetCmdLineArgumenti(argc, (const char**) argv, "numdevices", &numDevsRequested); // for multi-device we currently require using host memory -- the devices share // data via the host if (numDevsRequested > 1) useHostMem = true; int numDevsAvailable = 0; bool customGPU = false; cudaGetDeviceCount(&numDevsAvailable); if (numDevsAvailable < numDevsRequested) { shrLog("Error: only %d Devices available, %d requested. Exiting.\n", numDevsAvailable, numDevsRequested); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } shrLog("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); shrLog("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video" ); shrLog("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); shrLog("> %d Devices used for simulation\n", numDevsRequested); int devID; cudaDeviceProp props; // Initialize GL and GLUT if necessary if (!benchmark && !compareToCPU) { initGL(&argc, argv); initParameters(); } if (useCpu) { useHostMem = true; compareToCPU = false; bSupportDouble = true; #ifdef OPENMP shrLog("> Simulation with CPU using OpenMP\n"); #else shrLog("> Simulation with CPU\n"); #endif } else { // Now choose the CUDA Device // Either without GL interop: if (benchmark || compareToCPU || useHostMem) { // Note if we are using host memory for the body system, we // don't use CUDA-GL interop. if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { devID = cutilDeviceInit(argc, argv); if (devID < 0) { printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } customGPU = true; } else { devID = cutGetMaxGflopsDeviceId(); cudaSetDevice( devID ); } } else // or with GL interop: { if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); customGPU = true; } else { devID = cutGetMaxGflopsDeviceId(); cudaGLSetGLDevice( devID ); } } cutilSafeCall(cudaGetDevice(&devID)); cutilSafeCall(cudaGetDeviceProperties(&props, devID)); bSupportDouble = true; #if CUDART_VERSION < 4000 if (numDevsRequested > 1) { shrLog("MultiGPU n-body requires CUDA 4.0 or later\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char**)argv, QA_PASSED); } #endif // Initialize devices if(numDevsRequested > 1 && customGPU) { printf("You can't use --numdevices and --device at the same time.\n"); shrQAFinishExit(argc, (const char**)argv, QA_PASSED); } if(customGPU) { cudaDeviceProp props; cutilSafeCall(cudaGetDeviceProperties(&props, devID)); shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); } else { for (int i = 0; i < numDevsRequested; i++) { cudaDeviceProp props; cutilSafeCall(cudaGetDeviceProperties(&props, i)); shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); if (useHostMem) { #if CUDART_VERSION >= 2020 if(!props.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n", devID); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } if (numDevsRequested > 1) cutilSafeCall(cudaSetDevice(i)); cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost)); #else fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); #endif } } // CC 1.2 and earlier do not support double precision if (props.major*10 + props.minor <= 12) bSupportDouble = false; } //if(numDevsRequested > 1) // cutilSafeCall(cudaSetDevice(devID)); if (fp64 && !bSupportDouble) { fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } } numIterations = 0; p = 0; q = 1; cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations); cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p); cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q); if (p == 0) // p not set on command line { p = 256; if (q * p > 256) { p = 256 / q; shrLog("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256); } } // default number of bodies is #SMs * 4 * CTA size if (useCpu) #ifdef OPENMP numBodies = 8192; #else numBodies = 4096; #endif else if (numDevsRequested == 1)
////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { printf("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n\n"); bool benchmark = (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0); bool compareToCPU = ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) || !(cutCheckCmdLineFlag(argc, (const char**) argv, "noqatest") != 0)); bool regression = (cutCheckCmdLineFlag(argc, (const char**) argv, "regression") != 0); int devID; cudaDeviceProp props; // nBody has a mode that allows it to be run without using GL interop if (benchmark || compareToCPU || regression) { /* if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { devID = cutGetMaxGflopsDeviceId(); cudaSetDevice( devID ); } */ } else { // This mode shows the OpenGL results rendered // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. glutInit(&argc, argv); glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); glutInitWindowSize(720, 480); glutCreateWindow("CUDA n-body system"); GLenum err = glewInit(); if (GLEW_OK != err) { printf("GLEW Error: %s\n", glewGetErrorString(err)); } else { #if defined(WIN32) wglSwapIntervalEXT(0); #elif defined(LINUX) glxSwapIntervalSGI(0); #endif } initGL(); initParameters(); if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { devID = cutGetMaxGflopsDeviceId(); cudaGLSetGLDevice( devID ); } } // get number of SMs on this GPU cutilSafeCall(cudaGetDevice(&devID)); cutilSafeCall(cudaGetDeviceProperties(&props, devID)); numIterations = 0; int p = 256; int q = 1; cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations); cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p); cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q); // default number of bodies is #SMs * 4 * CTA size numBodies = compareToCPU ? 4096 : p*q*4*props.multiProcessorCount; cutGetCmdLineArgumenti(argc, (const char**) argv, "n", &numBodies); switch (numBodies) { case 1024: activeParams.m_clusterScale = 1.52f; activeParams.m_velocityScale = 2.f; break; case 2048: activeParams.m_clusterScale = 1.56f; activeParams.m_velocityScale = 2.64f; break; case 4096: activeParams.m_clusterScale = 1.68f; activeParams.m_velocityScale = 2.98f; break; case 8192: activeParams.m_clusterScale = 1.98f; activeParams.m_velocityScale = 2.9f; break; default: case 16384: activeParams.m_clusterScale = 1.54f; activeParams.m_velocityScale = 8.f; break; case 32768: activeParams.m_clusterScale = 1.44f; activeParams.m_velocityScale = 11.f; break; } if (q * p > 256) { p = 256 / q; printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256); } if (q == 1 && numBodies < p) { p = numBodies; } init(numBodies, p, q, !(benchmark || compareToCPU)); reset(nbody, numBodies, NBODY_CONFIG_SHELL, !(benchmark || compareToCPU)); if (benchmark) { if (numIterations <= 0) numIterations = 100; runBenchmark(numIterations); } else if (compareToCPU || regression) { compareResults(regression, numBodies); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); cutilSafeCall(cudaEventRecord(startEvent, 0)); glutMainLoop(); } if (nbodyCPU) delete nbodyCPU; if (nbodyCUDA) delete nbodyCUDA; if (hPos) delete [] hPos; if (hVel) delete [] hVel; if (hColor) delete [] hColor; cutilSafeCall(cudaEventDestroy(startEvent)); cutilSafeCall(cudaEventDestroy(stopEvent)); cutilCheckError(cutDeleteTimer(demoTimer)); return 0; }
bool runTestMax( int argc, char** argv, ReduceType datatype) { int size = 1<<24; // number of elements to reduce int maxThreads = 256; // number of threads per block int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; cutGetCmdLineArgumenti( argc, (const char**) argv, "n", &size); cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads); cutGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel); cutGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks); shrLog("METHOD: MAX\n"); shrLog("%d elements\n", size); shrLog("%d threads (max)\n", maxThreads); cpuFinalReduction = (cutCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == CUTTrue); cutGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold); bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue); if (runShmoo) { shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype); } else { // create random input data on CPU unsigned int bytes = size * sizeof(T); T *h_idata = (T *) malloc(bytes); for(int i=0; i<size; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) cpuFinalThreshold = 1; // allocate mem for the result on host side T* h_odata = (T*) malloc(numBlocks*sizeof(T)); shrLog("%d blocks\n\n", numBlocks); // allocate device memory and data T* d_idata = NULL; T* d_odata = NULL; cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) ); cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) ); // copy data directly to device memory cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) ); cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) ); // warm-up maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata); int testIterations = 100; unsigned int timer = 0; cutilCheckError( cutCreateTimer( &timer)); T gpu_result = 0; gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, testIterations, cpuFinalReduction, cpuFinalThreshold, timer, h_odata, d_idata, d_odata); double reduceTime = cutGetAverageTimerValue(timer) * 1e-3; shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); // compute reference solution T cpu_result = maxreduceCPU<T>(h_idata, size); double threshold = 1e-12; double diff = 0; if (datatype == REDUCE_INT) { shrLog("\nGPU result = %d\n", gpu_result); shrLog("CPU result = %d\n\n", cpu_result); } else { shrLog("\nGPU result = %f\n", gpu_result); shrLog("CPU result = %f\n\n", cpu_result); if (datatype == REDUCE_FLOAT) threshold = 1e-8 * size; diff = fabs((double)gpu_result - (double)cpu_result); } // cleanup cutilCheckError( cutDeleteTimer(timer) ); free(h_idata); free(h_odata); cutilSafeCallNoSync(cudaFree(d_idata)); cutilSafeCallNoSync(cudaFree(d_odata)); if (datatype == REDUCE_INT) { return (gpu_result == cpu_result); } else { return (diff < threshold); } } return true; }
//////////////////////////////////////////////////////////////////////////////// // initialize marching cubes //////////////////////////////////////////////////////////////////////////////// void initMC(int argc, char** argv) { // parse command line arguments int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", &n)) { gridSizeLog2.x = gridSizeLog2.y = gridSizeLog2.z = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridx", &n)) { gridSizeLog2.x = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridy", &n)) { gridSizeLog2.y = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridz", &n)) { gridSizeLog2.z = n; } char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } gridSize = make_uint3(1<<gridSizeLog2.x, 1<<gridSizeLog2.y, 1<<gridSizeLog2.z); gridSizeMask = make_uint3(gridSize.x-1, gridSize.y-1, gridSize.z-1); gridSizeShift = make_uint3(0, gridSizeLog2.x, gridSizeLog2.x+gridSizeLog2.y); numVoxels = gridSize.x*gridSize.y*gridSize.z; voxelSize = make_float3(2.0f / gridSize.x, 2.0f / gridSize.y, 2.0f / gridSize.z); maxVerts = gridSize.x*gridSize.y*100; printf("grid: %d x %d x %d = %d voxels\n", gridSize.x, gridSize.y, gridSize.z, numVoxels); printf("max verts = %d\n", maxVerts); #if SAMPLE_VOLUME // load volume data char* path = cutFindFilePath(volumeFilename, argv[0]); if (path == 0) { fprintf(stderr, "Error finding file '%s'\n", volumeFilename); cudaThreadExit(); exit(EXIT_FAILURE); } int size = gridSize.x*gridSize.y*gridSize.z*sizeof(uchar); uchar *volume = loadRawFile(path, size); cutilSafeCall(cudaMalloc((void**) &d_volume, size)); cutilSafeCall(cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice) ); free(volume); bindVolumeTexture(d_volume); #endif if (g_bQAReadback) { cudaMalloc((void **)&(d_pos), maxVerts*sizeof(float)*4); cudaMalloc((void **)&(d_normal), maxVerts*sizeof(float)*4); } else { // create VBOs createVBO(&posVbo, maxVerts*sizeof(float)*4); // DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(posVbo) ); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_posvbo_resource, posVbo, cudaGraphicsMapFlagsWriteDiscard)); createVBO(&normalVbo, maxVerts*sizeof(float)*4); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(normalVbo)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_normalvbo_resource, normalVbo, cudaGraphicsMapFlagsWriteDiscard)); } // allocate textures allocateTextures( &d_edgeTable, &d_triTable, &d_numVertsTable ); // allocate device memory unsigned int memSize = sizeof(uint) * numVoxels; cutilSafeCall(cudaMalloc((void**) &d_voxelVerts, memSize)); cutilSafeCall(cudaMalloc((void**) &d_voxelVertsScan, memSize)); cutilSafeCall(cudaMalloc((void**) &d_voxelOccupied, memSize)); cutilSafeCall(cudaMalloc((void**) &d_voxelOccupiedScan, memSize)); cutilSafeCall(cudaMalloc((void**) &d_compVoxelArray, memSize)); // initialize CUDPP scan CUDPPConfiguration config; config.algorithm = CUDPP_SCAN; config.datatype = CUDPP_UINT; config.op = CUDPP_ADD; config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE; cudppPlan(&scanplan, config, numVoxels, 1, 0); }
int main( int argc,char** argv) { printf("hello world\n"); if (!InitCUDA()) { return 0; } int iter = 1000; int trainnum = 20; bool isProfiler = false; int intProfiler = 0; int testnum = -1; float maxtime = 0.0f; cutGetCmdLineArgumenti(argc, (const char**) argv, "train", &trainnum); cutGetCmdLineArgumenti(argc, (const char**) argv, "iter", &iter); cutGetCmdLineArgumenti(argc, (const char**) argv, "profiler", &intProfiler); cutGetCmdLineArgumenti(argc, (const char**) argv, "test", &testnum); cutGetCmdLineArgumentf(argc, (const char**) argv, "maxtime", &maxtime); printf("%d\n", intProfiler); if(intProfiler) { isProfiler = true; } if(testnum == -1) testnum = trainnum /2; printf("Iter = %d\n", iter); printf("TrainNum = %d\n", trainnum); printf("TestNum = %d\n", testnum); CUT_DEVICE_INIT(argc, argv); cublasStatus status; status = cublasInit(); if(status != CUBLAS_STATUS_SUCCESS) { printf("Can't init cublas\n"); printf("%s\n", cudaGetErrorString(cudaGetLastError())); return -1; } Image* imageList = new Image[trainnum+testnum]; read64("my_optdigits.tra", imageList, trainnum + testnum); const int warmUpTime = 3; if(!isProfiler) { freopen("verbose.txt", "w", stdout); for(int i=0;i< warmUpTime;i++) { runImage(argc, argv, imageList, trainnum < warmUpTime ? trainnum : warmUpTime, 0, 10, false, 0.0f); } freopen("CON", "w", stdout); printf("Warm-up complete.\n\n\n"); } #ifdef _DEBUG freopen("out.txt", "w", stdout); #endif // _DEBUG runImage(argc, argv, imageList, trainnum, testnum, iter, true, maxtime); freopen("CON", "w", stdout); delete[] imageList; //TestReduce(); cublasShutdown(); if(!isProfiler) { CUT_EXIT(argc, argv); } //getchar(); return 0; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noqatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } } printf("[%s] ", sSDKsample); if (g_bQAReadback) printf("(Automated Testing)\n"); if (g_bOpenGLQA) printf("(OpenGL Readback)\n"); // Get the path of the filename char *filename; if (cutGetCmdLineArgumentstr(argc, (const char**) argv, "image", &filename)) { image_filename = filename; } // load image char* image_path = cutFindFilePath(image_filename, argv[0]); if (image_path == 0) { fprintf(stderr, "Error finding image file '%s'\n", image_filename); cudaThreadExit(); exit(EXIT_FAILURE); } cutilCheckError( cutLoadPPM4ub(image_path, (unsigned char **) &h_img, &width, &height)); if (!h_img) { printf("Error opening file '%s'\n", image_path); cudaThreadExit(); exit(-1); } printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); cutGetCmdLineArgumenti(argc, (const char**) argv, "threads", &nthreads); cutGetCmdLineArgumentf(argc, (const char**) argv, "sigma", &sigma); runBenchmark = cutCheckCmdLineFlag(argc, (const char**) argv, "bench"); int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n"); // runBenchmark = CUTTrue; g_bQAReadback = true; } // Benchmark or AutoTest mode detected, no OpenGL if (runBenchmark == CUTTrue || g_bQAReadback) { if( cutCheckCmdLineFlag( argc, (const char **)argv, "device" ) ) cutilDeviceInit( argc, argv ); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(argc, argv); if( cutCheckCmdLineFlag( argc, (const char **)argv, "device" ) ) cutilGLDeviceInit( argc, argv ); else cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } initCudaBuffers(); if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } if (g_bQAReadback) { // This is the automated testing path g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); runAutoTest(argc, argv); cleanup(); cudaThreadExit(); cutilExit(argc, argv); } if (runBenchmark) { benchmark(100); cleanup(); cudaThreadExit(); exit(0); } initGLBuffers(); atexit(cleanup); glutMainLoop(); cudaThreadExit(); cutilExit(argc, argv); }