void initGL(int *argc, char** argv) { glutInit( argc, argv ); glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); glutInitWindowSize(wWidth, wHeight); glutCreateWindow("Function Pointers [CUDA Edge Detection]n"); glewInit(); if (g_bFBODisplay) { if (!glewIsSupported( "GL_VERSION_2_0 GL_ARB_fragment_program GL_EXT_framebuffer_object" )) { fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); fprintf(stderr, "This sample requires:\n"); fprintf(stderr, " OpenGL version 2.0\n"); fprintf(stderr, " GL_ARB_fragment_program\n"); fprintf(stderr, " GL_EXT_framebuffer_object\n"); cutilDeviceReset(); shrQAFinishExit(*argc, (const char **)argv, QA_WAIVED); } } else { if (!glewIsSupported( "GL_VERSION_1_5 GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object" )) { fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); fprintf(stderr, "This sample requires:\n"); fprintf(stderr, " OpenGL version 1.5\n"); fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); cutilDeviceReset(); shrQAFinishExit(*argc, (const char **)argv, QA_WAIVED); } } }
void initializeData(char *file, int argc, char **argv) { GLint bsize; unsigned int w, h; size_t file_length= strlen(file); if (!strcmp(&file[file_length-3], "pgm")) { if (cutLoadPGMub(file, &pixels, &w, &h) != CUTTrue) { printf("Failed to load image file: %s\n", file); exit(-1); } g_Bpp = 1; } else if (!strcmp(&file[file_length-3], "ppm")) { if (cutLoadPPM4ub(file, &pixels, &w, &h) != CUTTrue) { printf("Failed to load image file: %s\n", file); exit(-1); } g_Bpp = 4; } else { cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } imWidth = (int)w; imHeight = (int)h; setupTexture(imWidth, imHeight, pixels, g_Bpp); // copy function pointer tables to host side for later use setupFunctionTables(); memset(pixels, 0x0, g_Bpp * sizeof(Pixel) * imWidth * imHeight); if (!g_bQAReadback) { // use OpenGL Path glGenBuffers(1, &pbo_buffer); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); glBufferData(GL_PIXEL_UNPACK_BUFFER, g_Bpp * sizeof(Pixel) * imWidth * imHeight, pixels, GL_STREAM_DRAW); glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &bsize); if ((GLuint)bsize != (g_Bpp * sizeof(Pixel) * imWidth * imHeight)) { printf("Buffer object (%d) has incorrect size (%d).\n", (unsigned)pbo_buffer, (unsigned)bsize); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); // register this buffer object with CUDA cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo_buffer, cudaGraphicsMapFlagsWriteDiscard)); glGenTextures(1, &texid); glBindTexture(GL_TEXTURE_2D, texid); glTexImage2D(GL_TEXTURE_2D, 0, ((g_Bpp==1) ? GL_LUMINANCE : GL_BGRA), imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL); glBindTexture(GL_TEXTURE_2D, 0); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glPixelStorei(GL_PACK_ALIGNMENT, 1); } }
void runAutoTest(int argc, char **argv) { int devID = 0; printf("[%s] - (automated testing w/ readback)\n", sSDKsample); devID = cutilChooseCudaDevice(argc, argv); // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = cutFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf( "imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); cutilSafeCall( CUDA_MallocArray(&h_Src, imageW, imageH) ); g_CheckRender = new CheckBackBuffer(imageW, imageH, sizeof(TColor), false); g_CheckRender->setExecPath(argv[0]); TColor *d_dst = NULL; cutilSafeCall( cudaMalloc( (void **)&d_dst, imageW*imageH*sizeof(TColor)) ); while (g_Kernel <= 3) { printf("[AutoTest]: %s <%s>\n", sSDKsample, filterMode[g_Kernel]); cutilSafeCall( CUDA_Bind2TextureArray() ); runImageFilters(d_dst); cutilSafeCall( CUDA_UnbindTexture() ); cutilSafeCall( cutilDeviceSynchronize() ); cudaMemcpy(g_CheckRender->imageData(), d_dst, imageW*imageH*sizeof(TColor), cudaMemcpyDeviceToHost); g_CheckRender->savePPM(sOriginal[g_Kernel], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Kernel++; } cutilSafeCall( CUDA_FreeArray() ); free(h_Src); cutilSafeCall( cudaFree( d_dst ) ); delete g_CheckRender; printf("\n[%s] -> Test Results: %d errors\n", sSDKsample, g_TotalErrors); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED)); }
void checkDeviceMeetComputeSpec( int argc, char **argv ) { int device = 0; cudaGetDevice( &device ); if( checkCUDAProfile( device, MIN_RUNTIME_VERSION, MIN_COMPUTE_VERSION) ) { fprintf(stderr,"\nCUDA Capable Device %d, meets minimum required specs.\n", device ); } else { fprintf(stderr, "\nNo configuration with minimum compute capabilities found. Exiting...\n"); fprintf(stderr, "This sample requires:\n"); fprintf(stderr, "\tCUDA Compute Capability >= %d.%d is required\n", MIN_COMPUTE_VERSION/16, MIN_COMPUTE_VERSION%16); fprintf(stderr, "\tCUDA Runtime Version >= %d.%d is required\n", MIN_RUNTIME_VERSION/1000, (MIN_RUNTIME_VERSION%100)/10); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } }
void initGL(int *argc, char** argv) { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. glutInit(argc, argv); glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); glutInitWindowSize(720, 480); glutCreateWindow("CUDA n-body system"); if (bFullscreen) glutFullScreen(); GLenum err = glewInit(); if (GLEW_OK != err) { shrLog("GLEW Error: %s\n", glewGetErrorString(err)); cutilDeviceReset(); exit(-1); } else if (!glewIsSupported("GL_VERSION_2_0 " "GL_VERSION_1_5 " "GL_ARB_multitexture " "GL_ARB_vertex_buffer_object")) { fprintf(stderr, "Required OpenGL extensions missing."); exit(-1); } else { #if defined(WIN32) wglSwapIntervalEXT(0); #elif defined(LINUX) glxSwapIntervalSGI(0); #endif } glEnable(GL_DEPTH_TEST); glClearColor(0.0, 0.0, 0.0, 1.0); checkGLErrors("initGL"); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // start logs shrSetLogFileName ("boxFilter.txt"); shrLog("%s Starting...\n\n", argv[0]); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads ); cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius); if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; } } // load image to process loadImageData(argc, argv); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { // Running CUDA kernel (boxFilter) without visualization (QA Testing/Verification) runAutoTest(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark")) { // Running CUDA kernels (boxfilter) in Benchmarking mode runBenchmark(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else { // Running CUDA kernels (boxFilter) with OpenGL visualization if (g_bFBODisplay) shrLog("[FBO Display] "); if (g_bOpenGLQA) shrLog("[OpenGL Readback Comparisons] "); shrLog("\n"); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n -qatest\n\n", argv[0]); printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); int dev = findCapableDevice(argc, argv); if( dev != -1 ) { cudaGLSetGLDevice( dev ); } else { cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } // Now we can create a CUDA context and bind it to the OpenGL context initCuda(); initGLResources(); if (g_bOpenGLQA) { if (g_bFBODisplay) { g_CheckRender = new CheckFBO(width, height, 4, g_FrameBufferObject); } else { g_CheckRender = new CheckBackBuffer(width, height, 4); } g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } } // sets the callback function so it will call cleanup upon exit atexit(cleanup); shrLog("Running Standard Demonstration with GLUT loop...\n\n"); shrLog("Press '+' and '-' to change filter width\n" "Press ']' and '[' to change number of iterations\n\n"); // Main OpenGL loop that will run visualization for every vsync glutMainLoop(); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv){ uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; uint hTimer; const uint N = 4 * 1048576; const uint DIR = 1; const uint numValues = 65536; shrQAStart(argc, argv); printf("Allocating and initializing host arrays...\n\n"); cutCreateTimer(&hTimer); h_SrcKey = (uint *)malloc(N * sizeof(uint)); h_SrcVal = (uint *)malloc(N * sizeof(uint)); h_DstKey = (uint *)malloc(N * sizeof(uint)); h_DstVal = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_SrcKey[i] = rand() % numValues; fillValues(h_SrcVal, N); printf("Allocating and initializing CUDA arrays...\n\n"); cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) ); printf("Initializing GPU merge sort...\n"); initMergeSort(); printf("Running GPU merge sort...\n"); cutilSafeCall( cutilDeviceSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); mergeSort( d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR ); cutilSafeCall( cutilDeviceSynchronize() ); cutStopTimer(hTimer); printf("Time: %f ms\n", cutGetTimerValue(hTimer)); printf("Reading back GPU merge sort results...\n"); cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) ); cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) ); printf("Inspecting the results...\n"); uint keysFlag = validateSortedKeys( h_DstKey, h_SrcKey, 1, N, numValues, DIR ); uint valuesFlag = validateSortedValues( h_DstKey, h_DstVal, h_SrcKey, 1, N ); printf("Shutting down...\n"); closeMergeSort(); cutilCheckError( cutDeleteTimer(hTimer) ); cutilSafeCall( cudaFree(d_SrcVal) ); cutilSafeCall( cudaFree(d_SrcKey) ); cutilSafeCall( cudaFree(d_BufVal) ); cutilSafeCall( cudaFree(d_BufKey) ); cutilSafeCall( cudaFree(d_DstVal) ); cutilSafeCall( cudaFree(d_DstKey) ); free(h_DstVal); free(h_DstKey); free(h_SrcVal); free(h_SrcKey); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, (keysFlag && valuesFlag) ? QA_PASSED : QA_FAILED); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { shrQAStart(argc, argv); // start logs shrSetLogFileName ("bilateralFilter.txt"); shrLog("%s Starting...\n\n", argv[0]); // use command-line specified CUDA device, otherwise use device with highest Gflops/s cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &nthreads ); cutGetCmdLineArgumenti( argc, (const char**) argv, "radius", &filter_radius); // load image to process loadImageData(argc, argv); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { // Running CUDA kernel (bilateralFilter) without visualization (QA Testing/Verification) runAutoTest(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else if (cutCheckCmdLineFlag(argc, (const char **)argv, "benchmark")) { // Running CUDA kernel (bilateralFilter) in Benchmarking Mode runBenchmark(argc, argv); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); } else { // Running CUDA kernel (bilateralFilter) in CUDA + OpenGL Visualization Mode if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( argc, argv ); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); } initCuda(); initOpenGL(); } atexit(cleanup); printf("Running Standard Demonstration with GLUT loop...\n\n"); printf("Press '+' and '-' to change number of iterations\n" "Press LEFT and RIGHT change euclidean delta\n" "Press UP and DOWN to change gaussian delta\n" "Press '1' to show original image\n" "Press '2' to show result\n\n"); glutMainLoop(); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, (g_TotalErrors == 0 ? QA_PASSED : QA_FAILED)); }
int main(int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n -qatest\n\n", argv[0]); printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); //cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); int dev = findCapableDevice(argc, argv); if( dev != -1 ) { cudaGLSetGLDevice( dev ); } else { shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED); } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); if (g_bOpenGLQA) { loadDefaultImage( argc, argv ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } // If code is not printing the USage, then we execute this path. if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display Image (no filtering)\n"); printf("T: display Sobel Edge Detection (Using Texture)\n"); printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); printf("b: switch block filter operation (mean/Sobel)\n"); printf("p: switch point filter operation (threshold on/off)\n"); fflush(stdout); atexit(cleanup); glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); } } cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
void runAutoTest(int argc, char **argv) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { int device = cutilDeviceInit(argc, argv); if (device < 0) { printf("No CUDA Capable devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } checkDeviceMeetComputeSpec( argc, argv ); } else { int dev = findCapableDevice(argc, argv); if( dev != -1 ) cudaSetDevice( dev ); else { cutilDeviceReset(); shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED); } } loadDefaultImage( argc, argv ); if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } g_CheckRender = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false); g_CheckRender->setExecPath(argv[0]); Pixel *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) ); while (g_SobelDisplayMode <= 2) { printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp ); cutilSafeCall( cutilDeviceSynchronize() ); cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost); g_CheckRender->savePGM(sOriginal[g_Index], false, NULL); if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Index++; g_SobelDisplayMode = (SobelDisplayMode)g_Index; } cutilSafeCall( cudaFree( d_result ) ); delete g_CheckRender; shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED) ); }
void shutDown(unsigned char k, int /*x*/, int /*y*/) { switch (k){ case '\033': case 'q': case 'Q': printf("Shutting down...\n"); cutilCheckError( cutStopTimer(hTimer) ); cutilCheckError( cutDeleteTimer(hTimer) ); // DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(gl_PBO) ); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, gl_PBO, cudaGraphicsMapFlagsWriteDiscard)); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glDeleteBuffers(1, &gl_PBO); glDeleteTextures(1, &gl_Tex); cutilSafeCall( CUDA_FreeArray() ); free(h_Src); printf("Shutdown done.\n"); cutilDeviceReset(); exit(0); break; case '1': printf("Passthrough.\n"); g_Kernel = 0; break; case '2': printf("KNN method \n"); g_Kernel = 1; break; case '3': printf("NLM method\n"); g_Kernel = 2; break; case '4': printf("Quick NLM(NLM2) method\n"); g_Kernel = 3; break; case ' ': printf(g_Diag ? "LERP highlighting mode.\n" : "Normal mode.\n"); g_Diag = !g_Diag; break; case 'n': printf("Decrease noise level.\n"); knnNoise -= noiseStep; nlmNoise -= noiseStep; break; case 'N': printf("Increase noise level.\n"); knnNoise += noiseStep; nlmNoise += noiseStep; break; case 'l': printf("Decrease LERP quotent.\n"); lerpC = MAX(lerpC - lerpStep, 0.0f); break; case 'L': printf("Increase LERP quotent.\n"); lerpC = MIN(lerpC + lerpStep, 1.0f); break; case 'f' : case 'F': g_FPS = true; break; case '?': printf("lerpC = %5.5f\n", lerpC); printf("knnNoise = %5.5f\n", knnNoise); printf("nlmNoise = %5.5f\n", nlmNoise); break; } }
////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { bool bTestResults = true; shrQAStart(argc, argv); if( cutCheckCmdLineFlag(argc, (const char**)argv, "help") ) { showHelp(); return 0; } shrLog("Run \"nbody -benchmark [-n=<numBodies>]\" to measure perfomance.\n"); shrLog("\t-fullscreen (run n-body simulation in fullscreen mode)\n"); shrLog("\t-fp64 (use double precision floating point values for simulation)\n"); shrLog("\t-numdevices=N (use first N CUDA devices for simulation)\n"); // shrLog("\t-hostmem (stores simulation data in host memory)\n"); // shrLog("\t-cpu (performs simulation on the host)\n"); shrLog("\n"); bFullscreen = (cutCheckCmdLineFlag(argc, (const char**) argv, "fullscreen") != 0); if (bFullscreen) bShowSliders = false; benchmark = (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0); compareToCPU = ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) || (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest") != 0)); QATest = (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest") != 0); useHostMem = (cutCheckCmdLineFlag(argc, (const char**) argv, "hostmem") != 0); fp64 = (cutCheckCmdLineFlag(argc, (const char**) argv, "fp64") != 0); flopsPerInteraction = fp64 ? 30 : 20; useCpu = (cutCheckCmdLineFlag(argc, (const char**) argv, "cpu") != 0); cutGetCmdLineArgumenti(argc, (const char**) argv, "numdevices", &numDevsRequested); // for multi-device we currently require using host memory -- the devices share // data via the host if (numDevsRequested > 1) useHostMem = true; int numDevsAvailable = 0; bool customGPU = false; cudaGetDeviceCount(&numDevsAvailable); if (numDevsAvailable < numDevsRequested) { shrLog("Error: only %d Devices available, %d requested. Exiting.\n", numDevsAvailable, numDevsRequested); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } shrLog("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); shrLog("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video" ); shrLog("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); shrLog("> %d Devices used for simulation\n", numDevsRequested); int devID; cudaDeviceProp props; // Initialize GL and GLUT if necessary if (!benchmark && !compareToCPU) { initGL(&argc, argv); initParameters(); } if (useCpu) { useHostMem = true; compareToCPU = false; bSupportDouble = true; #ifdef OPENMP shrLog("> Simulation with CPU using OpenMP\n"); #else shrLog("> Simulation with CPU\n"); #endif } else { // Now choose the CUDA Device // Either without GL interop: if (benchmark || compareToCPU || useHostMem) { // Note if we are using host memory for the body system, we // don't use CUDA-GL interop. if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { devID = cutilDeviceInit(argc, argv); if (devID < 0) { printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } customGPU = true; } else { devID = cutGetMaxGflopsDeviceId(); cudaSetDevice( devID ); } } else // or with GL interop: { if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); customGPU = true; } else { devID = cutGetMaxGflopsDeviceId(); cudaGLSetGLDevice( devID ); } } cutilSafeCall(cudaGetDevice(&devID)); cutilSafeCall(cudaGetDeviceProperties(&props, devID)); bSupportDouble = true; #if CUDART_VERSION < 4000 if (numDevsRequested > 1) { shrLog("MultiGPU n-body requires CUDA 4.0 or later\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char**)argv, QA_PASSED); } #endif // Initialize devices if(numDevsRequested > 1 && customGPU) { printf("You can't use --numdevices and --device at the same time.\n"); shrQAFinishExit(argc, (const char**)argv, QA_PASSED); } if(customGPU) { cudaDeviceProp props; cutilSafeCall(cudaGetDeviceProperties(&props, devID)); shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); } else { for (int i = 0; i < numDevsRequested; i++) { cudaDeviceProp props; cutilSafeCall(cudaGetDeviceProperties(&props, i)); shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); if (useHostMem) { #if CUDART_VERSION >= 2020 if(!props.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n", devID); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } if (numDevsRequested > 1) cutilSafeCall(cudaSetDevice(i)); cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost)); #else fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); #endif } } // CC 1.2 and earlier do not support double precision if (props.major*10 + props.minor <= 12) bSupportDouble = false; } //if(numDevsRequested > 1) // cutilSafeCall(cudaSetDevice(devID)); if (fp64 && !bSupportDouble) { fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } } numIterations = 0; p = 0; q = 1; cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations); cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p); cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q); if (p == 0) // p not set on command line { p = 256; if (q * p > 256) { p = 256 / q; shrLog("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256); } } // default number of bodies is #SMs * 4 * CTA size if (useCpu) #ifdef OPENMP numBodies = 8192; #else numBodies = 4096; #endif else if (numDevsRequested == 1)
// commented out to remove unused parameter warnings in Linux void key(unsigned char key, int /*x*/, int /*y*/) { switch (key) { case ' ': bPause = !bPause; break; case 27: // escape case 'q': case 'Q': cutilDeviceReset(); exit(0); break; case 13: // return if (bSupportDouble) { if (fp64) switchDemoPrecision<float, double>(); else switchDemoPrecision<double, float>(); shrLog("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); } break; case '`': bShowSliders = !bShowSliders; break; case 'g': case 'G': bDispInteractions = !bDispInteractions; break; case 'p': case 'P': displayMode = (ParticleRenderer::DisplayMode) ((displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES); break; case 'c': case 'C': cycleDemo = !cycleDemo; shrLog("Cycle Demo Parameters: %s\n", cycleDemo ? "ON" : "OFF"); break; case '[': activeDemo = (activeDemo == 0) ? numDemos - 1 : (activeDemo - 1) % numDemos; selectDemo(activeDemo); break; case ']': activeDemo = (activeDemo + 1) % numDemos; selectDemo(activeDemo); break; case 'd': case 'D': displayEnabled = !displayEnabled; break; case 'o': case 'O': activeParams.print(); break; case '1': if (fp64) NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL); else NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL); break; case '2': if (fp64) NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_RANDOM); else NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_RANDOM); break; case '3': if (fp64) NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_EXPAND); else NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_EXPAND); break; } glutPostRedisplay(); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { shrQAStart( argc, argv ); shrSetLogFileName ("reduction.txt"); char *reduceMethod; cutGetCmdLineArgumentstr( argc, (const char**) argv, "method", &reduceMethod); char *typeChoice; cutGetCmdLineArgumentstr( argc, (const char**) argv, "type", &typeChoice); if (0 == typeChoice) { typeChoice = (char*)malloc(4 * sizeof(char)); strcpy(typeChoice, "int"); } ReduceType datatype = REDUCE_INT; if (!strcasecmp(typeChoice, "float")) datatype = REDUCE_FLOAT; else if (!strcasecmp(typeChoice, "double")) datatype = REDUCE_DOUBLE; else datatype = REDUCE_INT; cudaDeviceProp deviceProp; deviceProp.major = 1; deviceProp.minor = 0; int minimumComputeVersion = 10; if (datatype == REDUCE_DOUBLE) { deviceProp.minor = 3; minimumComputeVersion = 13; } int dev; if(!cutCheckCmdLineFlag(argc, (const char**)argv, "method") ) { fprintf(stderr, "MISSING --method FLAG.\nYou must provide --method={ SUM | MIN | MAX }.\n"); exit(1); } if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); cutilSafeCallNoSync(cudaGetDevice(&dev)); } else { cutilSafeCallNoSync(cudaChooseDevice(&dev, &deviceProp)); } cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev)); if((deviceProp.major * 10 + deviceProp.minor) >= minimumComputeVersion) { shrLog("Using Device %d: %s\n\n", dev, deviceProp.name); cutilSafeCallNoSync(cudaSetDevice(dev)); } else { shrLog("Error: the selected device does not support the minimum compute capability of %d.%d.\n\n", minimumComputeVersion / 10, minimumComputeVersion % 10); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } shrLog("Reducing array of type %s\n\n", typeChoice); bool bResult = false; switch (datatype) { default: case REDUCE_INT: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<int>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<int>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<int>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; case REDUCE_FLOAT: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<float>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<float>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<float>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; case REDUCE_DOUBLE: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<double>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<double>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<double>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; } cutilDeviceReset(); shrQAFinishExit(argc, (const char**)argv, (bResult ? QA_PASSED : QA_FAILED)); }
int main(int argc, char **argv) { shrQAStart(argc, argv); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; g_bFBODisplay = false; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { printf("[%s] ", sSDKsample); if (g_bFBODisplay) printf("[FBO Display] "); if (g_bOpenGLQA) printf("[OpenGL Readback Comparisons] "); printf("\n"); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = cutFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf( "imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); cutilSafeCall( CUDA_MallocArray(&h_Src, imageW, imageH) ); initOpenGLBuffers(); // Creating the Auto-Validation Code if (g_bOpenGLQA) { if (g_bFBODisplay) { g_CheckRender = new CheckFBO(imageW, imageH, 4); } else { g_CheckRender = new CheckBackBuffer(imageW, imageH, 4); } g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(g_bOpenGLQA); } } printf("Starting GLUT main loop...\n"); printf("Press [1] to view noisy image\n"); printf("Press [2] to view image restored with knn filter\n"); printf("Press [3] to view image restored with nlm filter\n"); printf("Press [4] to view image restored with modified nlm filter\n"); printf("Press [ ] to view smooth/edgy areas [RED/BLUE] Ct's\n"); printf("Press [f] to print frame rate\n"); printf("Press [?] to print Noise and Lerp Ct's\n"); printf("Press [q] to exit\n"); glutDisplayFunc(displayFunc); glutKeyboardFunc(shutDown); cutilCheckError( cutCreateTimer(&hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
int main() { int width = 512; int height = 512; // Creation du device cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) ); // Creation des buffers sur CPU int * a = new int[width*height]; int * b = new int[width*height]; int * res = new int[width*height]; for(int i = 0; i < width*height; i++) { a[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100); b[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100); } // Allocation des objects on the device // *** data unsigned int size = width * height * sizeof(int); std::cout << "Allocation d'un buffer de taille " << width*height << "\n"; int* d_a = NULL; int* d_b = NULL; int* d_res = NULL; cutilSafeCall( cudaMalloc( (void**) &d_a, size)); cutilSafeCall( cudaMalloc( (void**) &d_b, size)); cutilSafeCall( cudaMalloc( (void**) &d_res, size)); // Copy des donnees cutilSafeCall( cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice)); cutilSafeCall( cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice)); // Lancer le calcul std::cout << "Lancer le kernel ... \n"; runKernel(d_a, d_b, d_res, width*height); cutilSafeCall( cutilDeviceSynchronize() ); // Copie DeviceHost cutilSafeCall( cudaMemcpy(res, d_res, size, cudaMemcpyDeviceToHost)); // Verification du test int i = 0; for(;i < width*height;i++) if(res[i] != a[i] + b[i]) std::cout << "Error : [" << i << "] " << res[i] << " != " << a[i] << " + " << b[i] << std::endl; // Liberation des ressources // *** Device cutilSafeCall(cudaFree(d_a)); cutilSafeCall(cudaFree(d_b)); cutilSafeCall(cudaFree(d_res)); // *** CPU delete[] res; delete[] a; delete[] b; // Close device cutilDeviceReset(); std::cout << "Test result : " << ((i == width*height) ? "Succes" : "Error" ) << std::endl; std::cout.flush(); return 0; }
int main(int argc, char **argv) { uchar *h_Data; uint *h_HistogramCPU, *h_HistogramGPU; uchar *d_Data; uint *d_Histogram; uint hTimer; int PassFailFlag = 1; uint byteCount = 64 * 1048576; uint uiSizeMult = 1; cudaDeviceProp deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("histogram.txt"); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { dev = cutilDeviceInit(argc, argv); if (dev < 0) { printf("No CUDA Capable Devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } } else { cudaSetDevice( dev = cutGetMaxGflopsDeviceId() ); cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) ); } cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) ); printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = deviceProp.major * 0x10 + deviceProp.minor; if(version < 0x11) { printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } cutilCheckError(cutCreateTimer(&hTimer)); // Optional Command-line multiplier to increase size of array to histogram if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) { uiSizeMult = CLAMP(uiSizeMult, 1, 10); byteCount *= uiSizeMult; } shrLog("Initializing data...\n"); shrLog("...allocating CPU memory.\n"); h_Data = (uchar *)malloc(byteCount); h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); shrLog("...generating input data\n"); srand(2009); for(uint i = 0; i < byteCount; i++) h_Data[i] = rand() % 256; shrLog("...allocating GPU memory and copying input data\n\n"); cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount ) ); cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) ); cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) ); { shrLog("Starting up 64-bin histogram...\n\n"); initHistogram64(); shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram64(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram64CPU()\n"); histogram64CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results...\n"); for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 64-bin histogram...\n\n\n"); closeHistogram64(); } { shrLog("Initializing 256-bin histogram...\n"); initHistogram256(); shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram256(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram256CPU()\n"); histogram256CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results\n"); for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 256-bin histogram...\n\n\n"); closeHistogram256(); } shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); cutilSafeCall( cudaFree(d_Histogram) ); cutilSafeCall( cudaFree(d_Data) ); free(h_HistogramGPU); free(h_HistogramCPU); free(h_Data); cutilDeviceReset(); shrLog("%s - Test Summary\n", sSDKsample); // pass or fail (for both 64 bit and 256 bit histograms) shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED)); }
int main() { int width = 800; int height = 600; int mesh_width = 256; int mesh_height = 256; // Creation du device cutilSafeCall( cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ) ); // Creation d'une fenetre C3::Window OpenGLWin; OpenGLWin.Create(C3::WindowMode(width,height),"CudaC3"); // Glew init GLenum err = glewInit(); if(err != GLEW_OK) std::cout << "Error on GLEW initialization.\n"; // Configuration OpenGL glClearColor(0.0, 0.0, 0.0, 1.0); glDisable(GL_DEPTH_TEST); glViewport(0, 0, width, height); glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(60.0, (GLfloat)width / (GLfloat) height, 0.1, 10.0); // VBO // *** Create GLuint vbo; glGenBuffers(1, &vbo); glBindBuffer(GL_ARRAY_BUFFER, vbo); // *** Initialize unsigned int size = mesh_width * mesh_height * 4 * sizeof(float); glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); // *** Register in CUDA cudaGraphicsResource *cuda_vbo_resource = NULL; cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard)); float g_fAnim = 0.f; int nbFrame = 0; float timeFPS = 0.f; while(OpenGLWin.IsOpened()) { // Events C3::Event event; while(OpenGLWin.PoolEvent(event)) { //std::cout << "Event !" << std::endl; if(event.Type == C3::Event::Closed) { std::cout << "Close ... " << std::endl; OpenGLWin.Close(); } else if(event.Type == C3::Event::KeyPressed) { if(event.Key.Code == C3::Key::Escape) { std::cout << "Close ... " << std::endl; OpenGLWin.Close(); } } } // Mise a jour du temps g_fAnim += OpenGLWin.GetFrameTime() / 1000.f; timeFPS += OpenGLWin.GetFrameTime() / 1000.f; nbFrame++; if(timeFPS > 1.0f) { std::stringstream ss; ss << "CudaC3 [" << (int)ceil( nbFrame / timeFPS ) << " FPS]"; OpenGLWin.SetTitle(ss.str()); timeFPS = 0.f; nbFrame = 0; } // Draw the scene glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT ); // Lancer le calcul CUDA // *** map OpenGL buffer object for writing from CUDA float4 *dptr; cutilSafeCall(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0)); size_t num_bytes; cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, cuda_vbo_resource)); // *** Run kernel runKernel(dptr, mesh_width, mesh_height,g_fAnim); cutilSafeCall( cutilDeviceSynchronize() ); // *** Unmap cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); // OpenGL // *** Make some transformation glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glTranslatef(0.0, 0.0, -3.0); glRotatef(0.0, 1.0, 0.0, 0.0); glRotatef(0.0, 0.0, 1.0, 0.0); // *** Render VBO // --- Bind glBindBuffer(GL_ARRAY_BUFFER, vbo); glVertexPointer(4, GL_FLOAT, 0, 0); // --- Draw glEnableClientState(GL_VERTEX_ARRAY); glColor3f(1.0, 0.0, 0.0); glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); glDisableClientState(GL_VERTEX_ARRAY); // Swap buffers OpenGLWin.Display(); } // Liberation des ressources cudaGraphicsUnregisterResource(cuda_vbo_resource); glBindBuffer(1, vbo); glDeleteBuffers(1, &vbo); // Close device cutilDeviceReset(); return 0; }