void initGlWindow(int argc, char ** argv){ #ifdef USE_OPENGL cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); if( !glfwInit() ) { cerr << "Failed to initalize GLFW" << endl; exit( 1 ); } if ( !glfwOpenWindow( width, height, 8, 8, 8, 8, 8, 8, GLFW_WINDOW ) ) { cerr << "Failed to open window" << endl; exit( 1 ); } glewInit(); if (! glewIsSupported("GL_VERSION_2_0 ")) { cerr << "ERROR: Support for necessary OpenGL extensions missing." << endl; exit( 1 ); } //glEnable(GL_POINT_SMOOTH); glViewport(0, 0, height, width); glLoadIdentity(); glOrtho(-1.0, 1.0, -1.0, 1.0, 0.0, 1.0); #endif };
void GLManager::initGlWindow(){ cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); if( !glfwInit() ) { std::cerr << "Failed to initalize GLFW" << std::endl; exit( 1 ); } glfwOpenWindowHint( GLFW_WINDOW_NO_RESIZE, GL_TRUE ); if ( !glfwOpenWindow( width, height, 8, 8, 8, 8, 8, 8, GLFW_WINDOW ) ) { std::cerr << "Failed to open window" << std::endl; exit( 1 ); } glewInit(); if (! glewIsSupported("GL_VERSION_2_0 ")) { std::cerr << "ERROR: Support for necessary OpenGL extensions missing." << std::endl; exit( 1 ); } glViewport(0, 0, height, width); glLoadIdentity(); glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); };
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// CUTBoolean runTest(int argc, char** argv) { if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noqatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (CUTFalse == initGL(argc, argv)) { return CUTFalse; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilGLDeviceInit(argc, argv); else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } // Create the CUTIL timer cutilCheckError( cutCreateTimer( &timer)); // register callbacks glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(window_width, window_height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } // create VBO createVBO(&vbo); // run the cuda part runCuda(vbo); // check result of Cuda step checkResultCuda(argc, argv, vbo); atexit(cleanup); // start rendering mainloop glutMainLoop(); cudaThreadExit(); return CUTTrue; }
void runAutoTest(int argc, char **argv) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } loadDefaultImage( argv[0] ); if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename); } } else { loadDefaultImage( argv[0]); } g_CheckRender = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false); g_CheckRender->setExecPath(argv[0]); Pixel *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) ); while (g_SobelDisplayMode <= 2) { printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale ); cutilSafeCall( cudaThreadSynchronize() ); cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost); g_CheckRender->savePGM(sOriginal[g_Index], false, NULL); if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Index++; g_SobelDisplayMode = (SobelDisplayMode)g_Index; } cutilSafeCall( cudaFree( d_result ) ); delete g_CheckRender; if (!g_TotalErrors) printf("TEST PASSED!\n"); else printf("TEST FAILED!\n"); }
/** * Selects a GPU for the CUDA execution. * @param id id of the selected GPU. */ void selectGPU(int id) { int deviceId; if (id == -1) { deviceId = cutGetMaxGflopsDeviceId(); } else { deviceId = id; } cutilSafeCall(cudaSetDevice( deviceId )); cutilCheckMsg("cudaSetDevice failed"); }
void Application::_init() { // Pick the best CUDA device const int deviceIdx = cutGetMaxGflopsDeviceId(); CudaSafeCall( cudaSetDevice( deviceIdx ) ); // CUDA configuration CudaSafeCall( cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ) ); return; }
void initCuda(){ // Use device with highest Gflops/s cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); initPBO(&pbo); // Clean up on program exit atexit(cleanupCuda); runCuda(); }
bool initCUDA( int argc, char **argv) { return true; if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice (cutGetMaxGflopsDeviceId()); } return true; }
void runGraphicsTest(int argc, char** argv) { printf("MarchingCubes "); if (g_bFBODisplay) printf("[w/ FBO] "); if (g_bOpenGLQA) printf("[Readback Comparisons] "); printf("\n"); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if(CUTFalse == initGL(&argc, argv)) { return; } cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); // register callbacks glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutIdleFunc(idle); glutReshapeFunc(reshape); initMenus(); // Initialize CUDA buffers for Marching Cubes initMC(argc, argv); cutilCheckError( cutCreateTimer( &timer)); if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(window_width, window_height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } // start rendering mainloop glutMainLoop(); cudaThreadExit(); }
bool initCUDA(void) { #if __DEVICE_EMULATION__ return true; #else int count = 0; int i = 0; cudaGetDeviceCount(&count); if(count == 0) { fprintf(stderr, "Nu exista nici un device.\n"); return false; } printf("Exista %d device-uri.\n",count); for(i = 0; i < count; i++) { cudaDeviceProp prop; if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) { if(prop.major >= 1) { break; } } if(!prop.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n",0); exit(EXIT_FAILURE); } } if(i == count) { fprintf(stderr, "Nu exista nici un device care suporta CUDA.\n"); return false; } cudaSetDevice(cutGetMaxGflopsDeviceId()); cudaSetDeviceFlags(cudaDeviceMapHost); checkCUDAError("cudaSetDeviceFlags"); printf("CUDA initializat cu succes\n"); // Create the CUTIL timer cutilCheckError( cutCreateTimer( &timer)); return true; #endif }
void initCuda(){ // Use device with highest Gflops/s #if CUDA_VERSION >= 5000 cudaGLSetGLDevice( gpuGetMaxGflopsDeviceId() ); #else cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); #endif initPBO(&pbo); // Clean up on program exit atexit(cleanupCuda); runCuda(); }
void initCuda(){ // Use device with highest Gflops/s cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); initPBO(&pbo); dptr=NULL; cudaGLMapBufferObject((void**)&dptr, pbo); clearPBOpos(dptr,width,height); cudaGLUnmapBufferObject(pbo); // Clean up on program exit atexit(cleanupCuda); SetScissorWindow(glm::vec4(300,300,500,500)); texture.mapptr = stbi_load("cow.jpeg",&texture.width, &texture.height,&texture.depth,0); runCuda(); }
void initCuda(){ // Use device with highest Gflops/s cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); initPBO(&pbo); paraMap = new float[(int)renderCam->resolution.x *(int)renderCam->resolution.y]; effectiveRayMap =new int[(int)renderCam->resolution.x *(int)renderCam->resolution.y]; initialRayMap = new ray[(int)renderCam->resolution.x * (int)renderCam->resolution.y]; generateRayMap(renderCam, targetFrame); // Clean up on program exit atexit(cleanupCuda); runCuda(); }
RemoteCUDARunner::RemoteCUDARunner():GPURunner<unsigned long,int>(TYPE_CUDA),m_metahashsize(0) { m_in=0; m_devin=0; m_out=0; m_devout=0; m_metahash=0; m_devmetahash=0; cutilSafeCall(cudaGetDeviceCount(&m_devicecount)); if(m_devicecount>0) { if(m_deviceindex<0 || m_deviceindex>=m_devicecount) { m_deviceindex=cutGetMaxGflopsDeviceId(); std::cout << "Setting CUDA device to Max GFlops device at index " << m_deviceindex << std::endl; } else { std::cout << "Setting CUDA device to device at index " << m_deviceindex << std::endl; } cudaDeviceProp props; cudaGetDeviceProperties(&props,m_deviceindex); std::cout << "Device info for " << props.name << " :" << std::endl; std::cout << "Compute Capability : " << props.major << "." << props.minor << std::endl; std::cout << "Clock Rate (hz) : " << props.clockRate << std::endl; if(props.major>999) { std::cout << "CUDA seems to be running in CPU emulation mode" << std::endl; } cutilSafeCall(cudaSetDevice(m_deviceindex)); } else { m_deviceindex=-1; std::cout << "No CUDA capable device detected" << std::endl; } }
CUDARunner::CUDARunner():GPURunner<unsigned long,int>(TYPE_CUDA) { m_in=0; m_devin=0; m_out=0; m_devout=0; cutilSafeCall(cudaGetDeviceCount(&m_devicecount)); if(m_devicecount>0) { if(m_deviceindex<0 || m_deviceindex>=m_devicecount) { m_deviceindex=cutGetMaxGflopsDeviceId(); printf("Setting CUDA device to Max GFlops device at index %u\n",m_deviceindex); } else { printf("Setting CUDA device to device at index %u\n",m_deviceindex); } cudaDeviceProp props; cudaGetDeviceProperties(&props,m_deviceindex); printf("Device info for %s :\nCompute Capability : %d.%d\nClock Rate (hz) : %d\n",props.name,props.major,props.minor,props.clockRate); if(props.major>999) { printf("CUDA seems to be running in CPU emulation mode\n"); } cutilSafeCall(cudaSetDevice(m_deviceindex)); } else { m_deviceindex=-1; printf("No CUDA capable device detected\n"); } }
int DrawScene::InitGL( ) { GLenum err = glewInit(); if (GLEW_OK != err) { /* Problem: glewInit failed, something is seriously wrong. */ ::MessageBox(NULL,"glewInit failed, something is seriously wrong.", "glew error occured.",MB_OK ); return false; } #if CUDA_ENABLE // cuda初始化 cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); #endif for (int i=0;i<COUNT_MODEL;i++) { m_model[i].loadModelData(FILENAME_MS3D); m_model[i].reloadTextures(); // Loads Model Textures } glEnable(GL_TEXTURE_2D); // Enable Texture Mapping ( NEW ) glShadeModel(GL_SMOOTH); // Enable Smooth Shading glClearColor(0.0f, 0.0f, 0.0f, 0.5f); // Black Background glClearDepth(1.0f); // Depth Buffer Setup glEnable(GL_DEPTH_TEST); // Enables Depth Testing glDepthFunc(GL_LEQUAL); // The Type Of Depth Testing To Do glHint(GL_PERSPECTIVE_CORRECTION_HINT, GL_NICEST); // Really Nice Perspective Calculations #if ENABLE_CONSOLE_WINDOW AllocConsole(); freopen( "CONOUT$","w",stdout); #endif m_particleMngr.addParticleNode("", PARTICLE_TYPE_FOUNTAIN ); return TRUE; }
//------------------------------------------------------------------------------ static void #if GLFW_VERSION_MAJOR>=3 keyboard(GLFWwindow *, int key, int scancode, int event, int mods) { #else #define GLFW_KEY_ESCAPE GLFW_KEY_ESC keyboard(int key, int event) { #endif if (event == GLFW_RELEASE) return; if (g_hud.KeyDown(tolower(key))) return; switch (key) { case 'Q': g_running = 0; break; case 'F': fitFrame(); break; case '+': case '=': g_tessLevel++; break; case '-': g_tessLevel = std::max(g_tessLevelMin, g_tessLevel-1); break; case '.': g_moveModels = std::max(g_moveModels*2, 1); break; case ',': g_moveModels = std::max(g_moveModels/2, 0); break; case 'I': g_modelCount = std::max(g_modelCount/2, 1); rebuild(); break; case 'O': g_modelCount = std::min(g_modelCount*2, MAX_MODELS); rebuild(); break; case GLFW_KEY_ESCAPE: g_hud.SetVisible(!g_hud.IsVisible()); break; } } //------------------------------------------------------------------------------ static void callbackDisplayStyle(int b) { if (g_displayStyle == kVaryingColor or b == kVaryingColor or g_displayStyle == kFaceVaryingColor or b == kFaceVaryingColor) { // need to rebuild for varying reconstruct g_displayStyle = b; rebuild(); return; } g_displayStyle = b; } static void callbackKernel(int k) { g_kernel = k; #ifdef OPENSUBDIV_HAS_OPENCL if (g_kernel == kCL and g_clContext == NULL) { if (initCL(&g_clContext, &g_clQueue) == false) { printf("Error in initializing OpenCL\n"); exit(1); } } #endif #ifdef OPENSUBDIV_HAS_CUDA if (g_kernel == kCUDA and g_cudaInitialized == false) { g_cudaInitialized = true; cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } #endif rebuild(); }
void WaterPlaneCUDA::configure(Vector upperLeft, Vector lowerRight, float dampFactor, float resolution) { cudaSetDevice(cutGetMaxGflopsDeviceId()); cudaGLSetGLDevice(cutGetMaxGflopsDeviceId()); timeSinceLast = timePassed = 0; unsigned int free, total; int gpuCount, i; CUresult res; CUdevice dev; CUcontext ctx; cuInit(0); cuDeviceGetCount(&gpuCount); printf("Detected %d GPU\n",gpuCount); for (i=0; i<gpuCount; i++) { cuDeviceGet(&dev,i); cuCtxCreate(&ctx, 0, dev); res = cuMemGetInfo(&free, &total); if(res != CUDA_SUCCESS) printf("!!!! cuMemGetInfo failed! (status = %x)", res); printf("^^^^ Device: %d\n",i); printf("^^^^ Free : %lu bytes (%lu KB) (%lu MB)\n", free, inKB(free), inMB(free)); printf("^^^^ Total: %lu bytes (%lu KB) (%lu MB)\n", total, inKB(total), inMB(total)); printf("^^^^ %f%% free, %f%% used\n", 100.0*free/(double)total, 100.0*(total - free)/(double)total); cuCtxDetach(ctx); } this->stepSize = 1.0f/resolution; this->resolutionFactor = resolution; //reale Z - Achse ist x - Achse der WaterPlaneCUDA this->sizeX = (unsigned int) abs(upperLeft.z - lowerRight.z); //reale X -Achse ist y- Achse der WaterPlaneCUDA this->sizeY = (unsigned int) abs(upperLeft.x - lowerRight.x); //Anzahl der Netzpunkte in X -Richtung this->pointsX = (unsigned int)(sizeX * resolution); //Anzahl der Netzpunkte in Y -Richtung pointsY = (unsigned int)(sizeY * resolution); uLeft = upperLeft; lRight = lowerRight; //Der "Meeresspiegel" baseHeight = lRight.y; //Das Höhenfeld der WaterPlaneCUDA waveMap = NULL; initBuffer(); gpu_newVertices = new float3[pointsX*pointsY]; gpu_oldVertices = new float3[pointsX*pointsY]; gpu_normals = new float3[pointsX*pointsY]; for (int i=0;i<pointsX*pointsY;i++) { gpu_newVertices[i]=make_float3(0,0,0); gpu_oldVertices[i]=make_float3(0,0,0); gpu_normals[i]=make_float3(0,1.0,0); } cutilSafeCall(cudaMalloc((void**)&gpu_newVertices,pointsX*pointsY*sizeof(float3))); cutilSafeCall(cudaMalloc((void**)&gpu_oldVertices,pointsX*pointsY*sizeof(float3))); cutilSafeCall(cudaMalloc((void**)&gpu_normals,pointsX*pointsY*sizeof(float3))); drawMesh(); }
//////////////////////////////////////////////////////////////////////////////// //! Run test //////////////////////////////////////////////////////////////////////////////// void runGraphicsTest(int argc, char** argv) { printf("[%s] ", sSDKsample); if (g_bOpenGLQA) printf("[OpenGL Readback Comparisons] "); printf("\n"); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device") ) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if(CUTFalse == initGL( &argc, argv )) { cudaThreadExit(); return; } cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); // create FFT plan CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) ); // allocate memory fftInputW = (meshW / 2)+1; fftInputH = meshH; fftInputSize = (fftInputW*fftInputH)*sizeof(float2); cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) ); cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) ); h_h0 = (float2 *) malloc(fftInputSize); generate_h0(); cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) ); cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) ); cutCreateTimer(&timer); cutStartTimer(timer); prevTime = cutGetTimerValue(timer); // create vertex buffers and register with CUDA createVBO(&heightVertexBuffer, meshW*meshH*sizeof(float)); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(heightVertexBuffer)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_heightVB_resource, heightVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); createVBO(&slopeVertexBuffer, meshW*meshH*sizeof(float2)); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(slopeVertexBuffer)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_slopeVB_resource, slopeVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); // create vertex and index buffer for mesh createMeshPositionVBO(&posVertexBuffer, meshW, meshH); createMeshIndexBuffer(&indexBuffer, meshW, meshH); // Creating the Auto-Validation Code if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(windowH, windowH, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } runCuda(); // register callbacks glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); // start rendering mainloop glutMainLoop(); cudaThreadExit(); }
////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { printf("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n\n"); bool benchmark = (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0); bool compareToCPU = ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) || !(cutCheckCmdLineFlag(argc, (const char**) argv, "noqatest") != 0)); bool regression = (cutCheckCmdLineFlag(argc, (const char**) argv, "regression") != 0); int devID; cudaDeviceProp props; // nBody has a mode that allows it to be run without using GL interop if (benchmark || compareToCPU || regression) { /* if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { devID = cutGetMaxGflopsDeviceId(); cudaSetDevice( devID ); } */ } else { // This mode shows the OpenGL results rendered // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. glutInit(&argc, argv); glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); glutInitWindowSize(720, 480); glutCreateWindow("CUDA n-body system"); GLenum err = glewInit(); if (GLEW_OK != err) { printf("GLEW Error: %s\n", glewGetErrorString(err)); } else { #if defined(WIN32) wglSwapIntervalEXT(0); #elif defined(LINUX) glxSwapIntervalSGI(0); #endif } initGL(); initParameters(); if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { devID = cutGetMaxGflopsDeviceId(); cudaGLSetGLDevice( devID ); } } // get number of SMs on this GPU cutilSafeCall(cudaGetDevice(&devID)); cutilSafeCall(cudaGetDeviceProperties(&props, devID)); numIterations = 0; int p = 256; int q = 1; cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations); cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p); cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q); // default number of bodies is #SMs * 4 * CTA size numBodies = compareToCPU ? 4096 : p*q*4*props.multiProcessorCount; cutGetCmdLineArgumenti(argc, (const char**) argv, "n", &numBodies); switch (numBodies) { case 1024: activeParams.m_clusterScale = 1.52f; activeParams.m_velocityScale = 2.f; break; case 2048: activeParams.m_clusterScale = 1.56f; activeParams.m_velocityScale = 2.64f; break; case 4096: activeParams.m_clusterScale = 1.68f; activeParams.m_velocityScale = 2.98f; break; case 8192: activeParams.m_clusterScale = 1.98f; activeParams.m_velocityScale = 2.9f; break; default: case 16384: activeParams.m_clusterScale = 1.54f; activeParams.m_velocityScale = 8.f; break; case 32768: activeParams.m_clusterScale = 1.44f; activeParams.m_velocityScale = 11.f; break; } if (q * p > 256) { p = 256 / q; printf("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256); } if (q == 1 && numBodies < p) { p = numBodies; } init(numBodies, p, q, !(benchmark || compareToCPU)); reset(nbody, numBodies, NBODY_CONFIG_SHELL, !(benchmark || compareToCPU)); if (benchmark) { if (numIterations <= 0) numIterations = 100; runBenchmark(numIterations); } else if (compareToCPU || regression) { compareResults(regression, numBodies); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); cutilSafeCall(cudaEventRecord(startEvent, 0)); glutMainLoop(); } if (nbodyCPU) delete nbodyCPU; if (nbodyCUDA) delete nbodyCUDA; if (hPos) delete [] hPos; if (hVel) delete [] hVel; if (hColor) delete [] hColor; cutilSafeCall(cudaEventDestroy(startEvent)); cutilSafeCall(cudaEventDestroy(stopEvent)); cutilCheckError(cutDeleteTimer(demoTimer)); return 0; }
int main(int argc, char **argv) { uchar *h_Data; uint *h_HistogramCPU, *h_HistogramGPU; uchar *d_Data; uint *d_Histogram; uint hTimer; int PassFailFlag = 1; uint byteCount = 64 * 1048576; uint uiSizeMult = 1; cudaDeviceProp deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("histogram.txt"); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { dev = cutilDeviceInit(argc, argv); if (dev < 0) { printf("No CUDA Capable Devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } } else { cudaSetDevice( dev = cutGetMaxGflopsDeviceId() ); cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) ); } cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) ); printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = deviceProp.major * 0x10 + deviceProp.minor; if(version < 0x11) { printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } cutilCheckError(cutCreateTimer(&hTimer)); // Optional Command-line multiplier to increase size of array to histogram if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) { uiSizeMult = CLAMP(uiSizeMult, 1, 10); byteCount *= uiSizeMult; } shrLog("Initializing data...\n"); shrLog("...allocating CPU memory.\n"); h_Data = (uchar *)malloc(byteCount); h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); shrLog("...generating input data\n"); srand(2009); for(uint i = 0; i < byteCount; i++) h_Data[i] = rand() % 256; shrLog("...allocating GPU memory and copying input data\n\n"); cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount ) ); cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) ); cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) ); { shrLog("Starting up 64-bin histogram...\n\n"); initHistogram64(); shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram64(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram64CPU()\n"); histogram64CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results...\n"); for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 64-bin histogram...\n\n\n"); closeHistogram64(); } { shrLog("Initializing 256-bin histogram...\n"); initHistogram256(); shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram256(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram256CPU()\n"); histogram256CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results\n"); for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 256-bin histogram...\n\n\n"); closeHistogram256(); } shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); cutilSafeCall( cudaFree(d_Histogram) ); cutilSafeCall( cudaFree(d_Data) ); free(h_HistogramGPU); free(h_HistogramCPU); free(h_Data); cutilDeviceReset(); shrLog("%s - Test Summary\n", sSDKsample); // pass or fail (for both 64 bit and 256 bit histograms) shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED)); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { int retVal = 0; retVal = xnInit( argc, argv ); printf("[ %s ]\n", sSDKsample); if (argc > 1) { cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int *) &numParticles); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt") ) { g_bQAReadback = true; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; } } if (g_bQAReadback) { // For Automated testing, we do not use OpenGL/CUDA interop if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { cutilDeviceInit (argc, argv); } else { cudaSetDevice (cutGetMaxGflopsDeviceId() ); } g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); // This code path is used for Automated Testing initParticles(numParticles, false, false); initParams(); if (emitterOn) { runEmitter(); } SimParams ¶ms = psystem->getParams(); params.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z); psystem->step(timestep); float4 *pos = NULL, *vel = NULL; int g_TotalErrors = 0; psystem->dumpBin(&pos, &vel); g_CheckRender->dumpBin(pos, numParticles*sizeof(float4), "smokeParticles_pos.bin"); g_CheckRender->dumpBin(vel, numParticles*sizeof(float4), "smokeParticles_vel.bin"); if (!g_CheckRender->compareBin2BinFloat("smokeParticles_pos.bin", sRefBin[0], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD)) g_TotalErrors++; if (!g_CheckRender->compareBin2BinFloat("smokeParticles_vel.bin", sRefBin[1], numParticles*sizeof(float4), MAX_EPSILON_ERROR, THRESHOLD)) g_TotalErrors++; delete psystem; delete g_CheckRender; printf("%s\n", (g_TotalErrors > 0) ? "FAILED" : "PASSED"); cudaThreadExit(); } else { // Normal smokeParticles rendering path // 1st initialize OpenGL context, so we can properly set the GL for CUDA. // This is needed to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { cutilGLDeviceInit (argc, argv); } else { cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); } if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(winWidth, winHeight, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } // This is the normal code path for SmokeParticles initParticles(numParticles, true, true); initParams(); initMenus(); glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutKeyboardUpFunc(keyUp); glutSpecialFunc(special); glutIdleFunc(idle); glutMainLoop(); } cutilExit(argc, argv); return retVal; }
////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { bool bTestResults = true; shrQAStart(argc, argv); if( cutCheckCmdLineFlag(argc, (const char**)argv, "help") ) { showHelp(); return 0; } shrLog("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n"); shrLog("\t-fullscreen (run n-body simulation in fullscreen mode)\n"); shrLog("\t-fp64 (use double precision floating point values for simulation)\n"); shrLog("\t-numdevices=N (use first N CUDA devices for simulation)\n"); // shrLog("\t-hostmem (stores simulation data in host memory)\n"); // shrLog("\t-cpu (performs simulation on the host)\n"); shrLog("\n"); bFullscreen = (cutCheckCmdLineFlag(argc, (const char**) argv, "fullscreen") != 0); if (bFullscreen) bShowSliders = false; benchmark = (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0); compareToCPU = ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) || (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest") != 0)); QATest = (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest") != 0); useHostMem = (cutCheckCmdLineFlag(argc, (const char**) argv, "hostmem") != 0); fp64 = (cutCheckCmdLineFlag(argc, (const char**) argv, "fp64") != 0); flopsPerInteraction = fp64 ? 30 : 20; useCpu = (cutCheckCmdLineFlag(argc, (const char**) argv, "cpu") != 0); cutGetCmdLineArgumenti(argc, (const char**) argv, "numdevices", &numDevsRequested); // for multi-device we currently require using host memory -- the devices share // data via the host if (numDevsRequested > 1) useHostMem = true; int numDevsAvailable = 0; bool customGPU = false; cudaGetDeviceCount(&numDevsAvailable); if (numDevsAvailable < numDevsRequested) { shrLog("Error: only %d Devices available, %d requested. Exiting.\n", numDevsAvailable, numDevsRequested); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } shrLog("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); shrLog("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video" ); shrLog("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); shrLog("> %d Devices used for simulation\n", numDevsRequested); int devID; cudaDeviceProp props; // Initialize GL and GLUT if necessary if (!benchmark && !compareToCPU) { initGL(&argc, argv); initParameters(); } if (useCpu) { useHostMem = true; compareToCPU = false; bSupportDouble = true; #ifdef OPENMP shrLog("> Simulation with CPU using OpenMP\n"); #else shrLog("> Simulation with CPU\n"); #endif } else { // Now choose the CUDA Device // Either without GL interop: if (benchmark || compareToCPU || useHostMem) { // Note if we are using host memory for the body system, we // don't use CUDA-GL interop. if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { devID = cutilDeviceInit(argc, argv); if (devID < 0) { printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } customGPU = true; } else { devID = cutGetMaxGflopsDeviceId(); cudaSetDevice( devID ); } } else // or with GL interop: { if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); customGPU = true; } else { devID = cutGetMaxGflopsDeviceId(); cudaGLSetGLDevice( devID ); } } cutilSafeCall(cudaGetDevice(&devID)); cutilSafeCall(cudaGetDeviceProperties(&props, devID)); bSupportDouble = true; #if CUDART_VERSION < 4000 if (numDevsRequested > 1) { shrLog("MultiGPU n-body requires CUDA 4.0 or later\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char**)argv, QA_PASSED); } #endif // Initialize devices if(numDevsRequested > 1 && customGPU) { printf("You can't use --numdevices and --device at the same time.\n"); shrQAFinishExit(argc, (const char**)argv, QA_PASSED); } if(customGPU) { cudaDeviceProp props; cutilSafeCall(cudaGetDeviceProperties(&props, devID)); shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); } else { for (int i = 0; i < numDevsRequested; i++) { cudaDeviceProp props; cutilSafeCall(cudaGetDeviceProperties(&props, i)); shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); if (useHostMem) { #if CUDART_VERSION >= 2020 if(!props.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n", devID); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } if (numDevsRequested > 1) cutilSafeCall(cudaSetDevice(i)); cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost)); #else fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); #endif } } // CC 1.2 and earlier do not support double precision if (props.major*10 + props.minor <= 12) bSupportDouble = false; } //if(numDevsRequested > 1) // cutilSafeCall(cudaSetDevice(devID)); if (fp64 && !bSupportDouble) { fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } } numIterations = 0; p = 0; q = 1; cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations); cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p); cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q); if (p == 0) // p not set on command line { p = 256; if (q * p > 256) { p = 256 / q; shrLog("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256); } } // default number of bodies is #SMs * 4 * CTA size if (useCpu) #ifdef OPENMP numBodies = 8192; #else numBodies = 4096; #endif else if (numDevsRequested == 1)
//------------------------------------------------------------------------------ int main(int argc, char ** argv) { glutInit(&argc, argv); glutInitDisplayMode(GLUT_RGBA |GLUT_DOUBLE | GLUT_DEPTH); glutInitWindowSize(1024, 1024); glutCreateWindow("OpenSubdiv test"); std::string str; if (argc > 1) { std::ifstream ifs(argv[1]); if (ifs) { std::stringstream ss; ss << ifs.rdbuf(); ifs.close(); str = ss.str(); g_defaultShapes.push_back(SimpleShape(str.c_str(), argv[1], kCatmark)); } } initializeShapes(); int smenu = glutCreateMenu(modelMenu); for(int i = 0; i < (int)g_defaultShapes.size(); ++i){ glutAddMenuEntry( g_defaultShapes[i].name.c_str(), i); } int lmenu = glutCreateMenu(levelMenu); for(int i = 1; i < 8; ++i){ char level[16]; sprintf(level, "Level %d\n", i); glutAddMenuEntry(level, i); } // Register Osd compute kernels OpenSubdiv::OsdCpuKernelDispatcher::Register(); OpenSubdiv::OsdGlslKernelDispatcher::Register(); #if OPENSUBDIV_HAS_OPENCL OpenSubdiv::OsdClKernelDispatcher::Register(); #endif #if OPENSUBDIV_HAS_CUDA OpenSubdiv::OsdCudaKernelDispatcher::Register(); // Note: This function randomly crashes with linux 5.0-dev driver. // cudaGetDeviceProperties overrun stack..? cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); #endif int kmenu = glutCreateMenu(kernelMenu); int nKernels = OpenSubdiv::OsdKernelDispatcher::kMAX; for(int i = 0; i < nKernels; ++i) if(OpenSubdiv::OsdKernelDispatcher::HasKernelType( OpenSubdiv::OsdKernelDispatcher::KernelType(i))) glutAddMenuEntry(getKernelName(i), i); glutCreateMenu(menu); glutAddSubMenu("Level", lmenu); glutAddSubMenu("Model", smenu); glutAddSubMenu("Kernel", kmenu); glutAttachMenu(GLUT_RIGHT_BUTTON); glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutKeyboardFunc(keyboard); glutMotionFunc(motion); glewInit(); initGL(); const char *filename = NULL; for (int i = 1; i < argc; ++i) { if (!strcmp(argv[i], "-d")) g_level = atoi(argv[++i]); else if (!strcmp(argv[i], "-c")) g_repeatCount = atoi(argv[++i]); else filename = argv[i]; } modelMenu(0); glutIdleFunc(idle); glutMainLoop(); quit(); }
//------------------------------------------------------------------------------ int main(int argc, char ** argv) { glutInit(&argc, argv); glutInitDisplayMode(GLUT_RGBA |GLUT_DOUBLE | GLUT_DEPTH); glutInitWindowSize(1024, 1024); glutCreateWindow("OpenSubdiv ptexViewer"); int lmenu = glutCreateMenu(levelMenu); for(int i = 1; i < 8; ++i){ char level[16]; sprintf(level, "Level %d\n", i); glutAddMenuEntry(level, i); } int smenu = glutCreateMenu(schemeMenu); glutAddMenuEntry("Catmark", 0); glutAddMenuEntry("Bilinear", 1); // Register Osd compute kernels OpenSubdiv::OsdCpuKernelDispatcher::Register(); #if OPENSUBDIV_HAS_GLSL OpenSubdiv::OsdGlslKernelDispatcher::Register(); #endif #if OPENSUBDIV_HAS_OPENCL OpenSubdiv::OsdClKernelDispatcher::Register(); #endif #if OPENSUBDIV_HAS_CUDA OpenSubdiv::OsdCudaKernelDispatcher::Register(); // Note: This function randomly crashes with linux 5.0-dev driver. // cudaGetDeviceProperties overrun stack..? cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); #endif int kmenu = glutCreateMenu(kernelMenu); int nKernels = OpenSubdiv::OsdKernelDispatcher::kMAX; for(int i = 0; i < nKernels; ++i) if(OpenSubdiv::OsdKernelDispatcher::HasKernelType( OpenSubdiv::OsdKernelDispatcher::KernelType(i))) glutAddMenuEntry(getKernelName(i), i); glutCreateMenu(menu); glutAddSubMenu("Level", lmenu); glutAddSubMenu("Scheme", smenu); glutAddSubMenu("Kernel", kmenu); glutAttachMenu(GLUT_RIGHT_BUTTON); glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutKeyboardFunc(keyboard); glutMotionFunc(motion); glewInit(); initGL(); for (int i = 1; i < argc; ++i) { if (!strcmp(argv[i], "-d")) g_level = atoi(argv[++i]); else if (!strcmp(argv[i], "-c")) g_repeatCount = atoi(argv[++i]); else if (g_ptexColorFile == NULL) g_ptexColorFile = argv[i]; else if (g_ptexDisplacementFile == NULL) g_ptexDisplacementFile = argv[i]; else if (g_ptexOcclusionFile == NULL) g_ptexOcclusionFile = argv[i]; } if (g_ptexColorFile == NULL) { printf("Usage: %s <color.ptx> [<displacement.ptx>] [<occlusion.ptx>] \n", argv[0]); return 1; } createOsdMesh(g_level, g_kernel); fitFrame(); glutIdleFunc(idle); glutMainLoop(); quit(); }
int main(int argc, char** argv) { ModelParameters model_params; fillCalculationParameters(model_params); fillDerivedParameters(model_params, params); if (CUTFalse == initGL(argc, argv, params)) return CUTFalse; // use command-line specified CUDA device, otherwise use device with highest Gflops/s if(cutCheckCmdLineFlag(argc, (const char**)argv, "device")) cutilDeviceInit(argc, argv); else cudaSetDevice(cutGetMaxGflopsDeviceId()); // initialize calculations initConstants(params); timeval init_start, init_stop; // calculate steady state value_pair *steady_state = new value_pair[params.cells]; initSpectre(); initWaveVectors(params); gettimeofday(&init_start, NULL); calculateSteadyState(steady_state, params); gettimeofday(&init_stop, NULL); printf("Steady state calculation: %.3f s\n", time_diff(init_start, init_stop)); /* FILE *f = fopen("plot_gs_mu.txt", "w"); int shift = (params.nvz / 2) * params.nvx * params.nvy + (params.nvy / 2) * params.nvx; for(int i = 0; i < params.nvx; i++) { value_pair val = steady_state[shift + i]; fprintf(f, "%f %f\n", (-params.xmax + params.dx * i) * 1000000, (val.x * val.x + val.y * val.y)); } fclose(f); */ gettimeofday(&init_start, NULL); state.init(params); initEvolution(steady_state, params, state); gettimeofday(&init_stop, NULL); printf("Evolution init: %.3f s\n", time_diff(init_start, init_stop)); delete[] steady_state; // measure propagation time, for testing purposes calculateEvolution(params, state, 0.0); // warm-up gettimeofday(&init_start, NULL); calculateEvolution(params, state, 0.0); // zero time step - because we are just measuring speed here gettimeofday(&init_stop, NULL); printf("Propagation time: %.3f ms\n", time_diff(init_start, init_stop) * 1000.0f); // prepare textures a_xy.init(params.nvx, params.nvy); b_xy.init(params.nvx, params.nvy); a_zy.init(params.nvz, params.nvy); b_zy.init(params.nvz, params.nvy); // remember starting time gettimeofday(&time_start, NULL); // start main application cycle atexit(cleanup); glutMainLoop(); return 0; }
int main() { int width = 512; int height = 512; // Creation du device cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) ); // Creation des buffers sur CPU int * a = new int[width*height]; int * b = new int[width*height]; int * res = new int[width*height]; for(int i = 0; i < width*height; i++) { a[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100); b[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100); } // Allocation des objects on the device // *** data unsigned int size = width * height * sizeof(int); std::cout << "Allocation d'un buffer de taille " << width*height << "\n"; int* d_a = NULL; int* d_b = NULL; int* d_res = NULL; cutilSafeCall( cudaMalloc( (void**) &d_a, size)); cutilSafeCall( cudaMalloc( (void**) &d_b, size)); cutilSafeCall( cudaMalloc( (void**) &d_res, size)); // Copy des donnees cutilSafeCall( cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice)); cutilSafeCall( cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice)); // Lancer le calcul std::cout << "Lancer le kernel ... \n"; runKernel(d_a, d_b, d_res, width*height); cutilSafeCall( cutilDeviceSynchronize() ); // Copie DeviceHost cutilSafeCall( cudaMemcpy(res, d_res, size, cudaMemcpyDeviceToHost)); // Verification du test int i = 0; for(;i < width*height;i++) if(res[i] != a[i] + b[i]) std::cout << "Error : [" << i << "] " << res[i] << " != " << a[i] << " + " << b[i] << std::endl; // Liberation des ressources // *** Device cutilSafeCall(cudaFree(d_a)); cutilSafeCall(cudaFree(d_b)); cutilSafeCall(cudaFree(d_res)); // *** CPU delete[] res; delete[] a; delete[] b; // Close device cutilDeviceReset(); std::cout << "Test result : " << ((i == width*height) ? "Succes" : "Error" ) << std::endl; std::cout.flush(); return 0; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { shrQAStart(argc, argv); // start logs shrSetLogFileName ("bilateralFilter.txt"); shrLog("%s Starting...\n\n", argv[0]); // use command-line specified CUDA device, otherwise use device with highest Gflops/s cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads ); cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius); // load image to process loadImageData(argc, argv); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { // Running CUDA kernel (bilateralFilter) without visualization (QA Testing/Verification) runAutoTest(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark")) { // Running CUDA kernel (bilateralFilter) in Benchmarking Mode runBenchmark(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else { // Running CUDA kernel (bilateralFilter) in CUDA + OpenGL Visualization Mode if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( argc, argv ); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); } initCuda(); initOpenGL(); } atexit(cleanup); printf("Running Standard Demonstration with GLUT loop...\n\n"); printf("Press '+' and '-' to change number of iterations\n" "Press LEFT and RIGHT change euclidean delta\n" "Press UP and DOWN to change gaussian delta\n" "Press '1' to show original image\n" "Press '2' to show result\n\n"); glutMainLoop(); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //start logs shrSetLogFileName ("volumeRender.txt"); shrLog("%s Starting...\n\n", argv[0]); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; fpsLimit = frameCheckNumber; } if (g_bQAReadback) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } /* int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { shrLog("This sample needs a card capable of OpenGL and display.\n"); shrLog("Please choose a different device with the -device=x argument.\n"); cutilExit(argc, argv); } */ } // parse arguments char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) { volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) { volumeSize.width = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) { volumeSize.height = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) { volumeSize.depth = n; } // load volume data char* path = shrFindFilePath(volumeFilename, argv[0]); if (path == 0) { shrLog("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); free(h_volume); cutilCheckError( cutCreateTimer( &timer)); shrLog("Press '=' and '-' to change density\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); uint *d_output; cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint))); cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // Start timer 0 and process n loops on the GPU int nIter = 10; for (int i = -1; i < nIter; i++) { if( i == 0 ) { cudaThreadSynchronize(); cutStartTimer(timer); } render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaThreadSynchronize(); cutStopTimer(timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0); shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); cutilCheckMsg("Error: render_kernel() execution FAILED"); cutilSafeCall( cudaThreadSynchronize() ); cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { shrLog("\nFAILED\n\n"); } else { shrLog("\nPASSED\n\n"); } cudaFree(d_output); freeCudaBuffers(); if (g_CheckRender) { delete g_CheckRender; g_CheckRender = NULL; } } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } atexit(cleanup); glutMainLoop(); } cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
int main(int argc, char **argv) { GpuProfiling::initProf(); // Start logs shrSetLogFileName ("scan.txt"); shrLog("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; uint hTimer; const uint N = 13 * 1048576 / 2; shrLog("Allocating and initializing host arrays...\n"); cutCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_Input[i] = rand(); shrLog("Allocating and initializing CUDA arrays...\n"); cutilSafeCall( cudaMalloc((void **)&d_Input, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_Output, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice) ); shrLog("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; const int iCycles = 100; shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize()); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } shrLog("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize() ); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } // pass or fail (cumulative... all tests in the loop) shrLog(globalFlag ? "PASSED\n\n" : "FAILED\n\n"); GpuProfiling::printResults(); shrLog("Shutting down...\n"); closeScan(); cutilSafeCall( cudaFree(d_Output)); cutilSafeCall( cudaFree(d_Input)); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); exit(0); shrEXIT(argc, (const char**)argv); }