void runAutoTest(int argc, char **argv) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } loadDefaultImage( argv[0] ); if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename); } } else { loadDefaultImage( argv[0]); } g_CheckRender = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false); g_CheckRender->setExecPath(argv[0]); Pixel *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) ); while (g_SobelDisplayMode <= 2) { printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale ); cutilSafeCall( cudaThreadSynchronize() ); cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost); g_CheckRender->savePGM(sOriginal[g_Index], false, NULL); if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Index++; g_SobelDisplayMode = (SobelDisplayMode)g_Index; } cutilSafeCall( cudaFree( d_result ) ); delete g_CheckRender; if (!g_TotalErrors) printf("TEST PASSED!\n"); else printf("TEST FAILED!\n"); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //start logs shrSetLogFileName ("volumeRender.txt"); shrLog("%s Starting...\n\n", argv[0]); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; fpsLimit = frameCheckNumber; } if (g_bQAReadback) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } /* int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { shrLog("This sample needs a card capable of OpenGL and display.\n"); shrLog("Please choose a different device with the -device=x argument.\n"); cutilExit(argc, argv); } */ } // parse arguments char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) { volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) { volumeSize.width = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) { volumeSize.height = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) { volumeSize.depth = n; } // load volume data char* path = shrFindFilePath(volumeFilename, argv[0]); if (path == 0) { shrLog("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); free(h_volume); cutilCheckError( cutCreateTimer( &timer)); shrLog("Press '=' and '-' to change density\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); uint *d_output; cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint))); cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // Start timer 0 and process n loops on the GPU int nIter = 10; for (int i = -1; i < nIter; i++) { if( i == 0 ) { cudaThreadSynchronize(); cutStartTimer(timer); } render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaThreadSynchronize(); cutStopTimer(timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0); shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); cutilCheckMsg("Error: render_kernel() execution FAILED"); cutilSafeCall( cudaThreadSynchronize() ); cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { shrLog("\nFAILED\n\n"); } else { shrLog("\nPASSED\n\n"); } cudaFree(d_output); freeCudaBuffers(); if (g_CheckRender) { delete g_CheckRender; g_CheckRender = NULL; } } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } atexit(cleanup); glutMainLoop(); } cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
int main(int argc, char** argv) { printf("[%s]\n", sSDKsample); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device if possible, otherwise search for capable device if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); int device; cudaGetDevice( &device ); if( checkCUDAProfile( device ) == false ) { cudaThreadExit(); cutilExit(argc, argv); } } else { //cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); int dev = findCapableDevice(argc, argv); if( dev != -1 ) cudaGLSetGLDevice( dev ); else { cudaThreadExit(); cutilExit(argc, argv); } } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); glutIdleFunc(idle); if (g_bOpenGLQA) { loadDefaultImage( argc, argv ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } // If code is not printing the USage, then we execute this path. if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display image\n"); printf("T: display Sobel edge detection (computed with tex)\n"); printf("S: display Sobel edge detection (computed with tex+shared memory)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); printf("b: switch block filter operation (mean/Sobel)\n"); printf("p: swtich point filter operation (threshold on/off)\n"); fflush(stdout); atexit(cleanup); glutMainLoop(); } } cudaThreadExit(); cutilExit(argc, argv); }
int main(int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n -qatest\n\n", argv[0]); printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); //cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); int dev = findCapableDevice(argc, argv); if( dev != -1 ) { cudaGLSetGLDevice( dev ); } else { shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED); } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); if (g_bOpenGLQA) { loadDefaultImage( argc, argv ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } // If code is not printing the USage, then we execute this path. if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display Image (no filtering)\n"); printf("T: display Sobel Edge Detection (Using Texture)\n"); printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); printf("b: switch block filter operation (mean/Sobel)\n"); printf("p: switch point filter operation (threshold on/off)\n"); fflush(stdout); atexit(cleanup); glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); } } cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
void runAutoTest(int argc, char **argv) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { int device = cutilDeviceInit(argc, argv); if (device < 0) { printf("No CUDA Capable devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } checkDeviceMeetComputeSpec( argc, argv ); } else { int dev = findCapableDevice(argc, argv); if( dev != -1 ) cudaSetDevice( dev ); else { cutilDeviceReset(); shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED); } } loadDefaultImage( argc, argv ); if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } g_CheckRender = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false); g_CheckRender->setExecPath(argv[0]); Pixel *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) ); while (g_SobelDisplayMode <= 2) { printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp ); cutilSafeCall( cutilDeviceSynchronize() ); cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost); g_CheckRender->savePGM(sOriginal[g_Index], false, NULL); if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Index++; g_SobelDisplayMode = (SobelDisplayMode)g_Index; } cutilSafeCall( cudaFree( d_result ) ); delete g_CheckRender; shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED) ); }
int main(int argc, char **argv) { char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } const int MAX_GPU_COUNT = 8; const int OPT_N = 256; const int PATH_N = 1 << 18; const unsigned int SEED = 777; //Input data array TOptionData optionData[OPT_N]; //Final GPU MC results TOptionValue callValueGPU[OPT_N]; //"Theoretical" call values by Black-Scholes formula float callValueBS[OPT_N]; //Solver config TOptionPlan optionSolver[MAX_GPU_COUNT]; //OS thread ID CUTThread threadID[MAX_GPU_COUNT]; //GPU number present in the system int GPU_N; int gpuBase, gpuIndex; int i; //Timer unsigned int hTimer; float time; double delta, ref, sumDelta, sumRef, sumReserve; cutilSafeCall( cudaGetDeviceCount(&GPU_N) ); cutilCheckError( cutCreateTimer(&hTimer) ); #ifdef _EMU GPU_N = 1; #endif printf("main(): generating input data...\n"); srand(123); for(i = 0; i < OPT_N; i++) { optionData[i].S = randFloat(5.0f, 50.0f); optionData[i].X = randFloat(10.0f, 25.0f); optionData[i].T = randFloat(1.0f, 5.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; callValueGPU[i].Expected = -1.0f; callValueGPU[i].Confidence = -1.0f; } printf("main(): starting %i host threads...\n", GPU_N); //Get option count for each GPU for(i = 0; i < GPU_N; i++) optionSolver[i].optionCount = OPT_N / GPU_N; //Take into account cases with "odd" option counts for(i = 0; i < (OPT_N % GPU_N); i++) optionSolver[i].optionCount++; //Assign GPU option ranges gpuBase = 0; for(i = 0; i < GPU_N; i++) { optionSolver[i].device = i; optionSolver[i].optionData = optionData + gpuBase; optionSolver[i].callValue = callValueGPU + gpuBase; optionSolver[i].seed = SEED; optionSolver[i].pathN = PATH_N; gpuBase += optionSolver[i].optionCount; } //Start the timer cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); //Start CPU thread for each GPU for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); //Stop the timer cutilCheckError( cutStopTimer(hTimer) ); time = cutGetTimerValue(hTimer); printf("main(): waiting for GPU results...\n"); cutWaitForThreads(threadID, GPU_N); printf("main(): GPU statistics\n"); for(i = 0; i < GPU_N; i++) { printf("GPU #%i\n", optionSolver[i].device); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); } printf("\nTotal time (ms.): %f\n", time); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); #ifdef DO_CPU printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++) { MonteCarloCPU( callValueCPU, optionData[i], NULL, PATH_N ); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); ref = callValueCPU.Expected; sumDelta += delta; sumRef += fabs(ref); printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } printf("L1 norm: %E\n", sumDelta / sumRef); #endif printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for(i = 0; i < OPT_N; i++) { BlackScholesCall( callValueBS[i], optionData[i] ); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta; #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; printf("L1 norm : %E\n", sumDelta / sumRef); printf("Average reserve: %f\n", sumReserve); printf((sumReserve > 1.0f) ? "PASSED\n" : "FAILED.\n"); printf("Shutting down...\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cutilExit(argc, argv); }
int main(int argc, char **argv) { // Start logs shrSetLogFileName ("quasirandomGenerator.txt"); shrLog("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else{ if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU; float *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; unsigned int hTimer; if(sizeof(INT64) != 8){ shrLog("sizeof(INT64) != 8\n"); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); cutilCheckError(cutCreateTimer(&hTimer)); int deviceIndex; cutilSafeCall(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ shrLog("Double precision not supported.\n"); cudaThreadExit(); return 0; } shrLog("Allocating GPU memory...\n"); cutilSafeCall( cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)) ); shrLog("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); shrLog("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if(useDoublePrecision) initTable_SM13(tableCPU); else initTable_SM10(tableCPU); shrLog("Testing QRNG...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) quasirandomGenerator_SM13(d_Output, 0, N); else quasirandomGenerator_SM10(d_Output, 0, N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); shrLog("\nReading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for(dim = 0; dim < QRNG_DIMENSIONS; dim++) for(pos = 0; pos < N; pos++){ ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n", sumDelta / sumRef); shrLog("\nTesting inverseCNDgpu()...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); else inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); shrLog("Reading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; for(pos = 0; pos < QRNG_DIMENSIONS * N; pos++){ double p = (double)(pos + 1) / (double)(QRNG_DIMENSIONS * N + 1); ref = MoroInvCNDcpu(p); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); shrLog((L1norm < 1E-6) ? "PASSED\n\n" : "FAILED\n\n"); shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); free(h_OutputGPU); cutilSafeCall( cudaFree(d_Output) ); cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { shrQAStart( argc, argv ); shrSetLogFileName ("reduction.txt"); char *reduceMethod; cutGetCmdLineArgumentstr( argc, (const char**) argv, "method", &reduceMethod); char *typeChoice; cutGetCmdLineArgumentstr( argc, (const char**) argv, "type", &typeChoice); if (0 == typeChoice) { typeChoice = (char*)malloc(4 * sizeof(char)); strcpy(typeChoice, "int"); } ReduceType datatype = REDUCE_INT; if (!strcasecmp(typeChoice, "float")) datatype = REDUCE_FLOAT; else if (!strcasecmp(typeChoice, "double")) datatype = REDUCE_DOUBLE; else datatype = REDUCE_INT; cudaDeviceProp deviceProp; deviceProp.major = 1; deviceProp.minor = 0; int minimumComputeVersion = 10; if (datatype == REDUCE_DOUBLE) { deviceProp.minor = 3; minimumComputeVersion = 13; } int dev; if(!cutCheckCmdLineFlag(argc, (const char**)argv, "method") ) { fprintf(stderr, "MISSING --method FLAG.\nYou must provide --method={ SUM | MIN | MAX }.\n"); exit(1); } if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); cutilSafeCallNoSync(cudaGetDevice(&dev)); } else { cutilSafeCallNoSync(cudaChooseDevice(&dev, &deviceProp)); } cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev)); if((deviceProp.major * 10 + deviceProp.minor) >= minimumComputeVersion) { shrLog("Using Device %d: %s\n\n", dev, deviceProp.name); cutilSafeCallNoSync(cudaSetDevice(dev)); } else { shrLog("Error: the selected device does not support the minimum compute capability of %d.%d.\n\n", minimumComputeVersion / 10, minimumComputeVersion % 10); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } shrLog("Reducing array of type %s\n\n", typeChoice); bool bResult = false; switch (datatype) { default: case REDUCE_INT: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<int>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<int>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<int>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; case REDUCE_FLOAT: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<float>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<float>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<float>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; case REDUCE_DOUBLE: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<double>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<double>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<double>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; } cutilDeviceReset(); shrQAFinishExit(argc, (const char**)argv, (bResult ? QA_PASSED : QA_FAILED)); }
//////////////////////////////////////////////////////////////////////////////// // initialize marching cubes //////////////////////////////////////////////////////////////////////////////// void initMC(int argc, char** argv) { // parse command line arguments int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", &n)) { gridSizeLog2.x = gridSizeLog2.y = gridSizeLog2.z = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridx", &n)) { gridSizeLog2.x = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridy", &n)) { gridSizeLog2.y = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridz", &n)) { gridSizeLog2.z = n; } char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } gridSize = make_uint3(1<<gridSizeLog2.x, 1<<gridSizeLog2.y, 1<<gridSizeLog2.z); gridSizeMask = make_uint3(gridSize.x-1, gridSize.y-1, gridSize.z-1); gridSizeShift = make_uint3(0, gridSizeLog2.x, gridSizeLog2.x+gridSizeLog2.y); numVoxels = gridSize.x*gridSize.y*gridSize.z; voxelSize = make_float3(2.0f / gridSize.x, 2.0f / gridSize.y, 2.0f / gridSize.z); maxVerts = gridSize.x*gridSize.y*100; printf("grid: %d x %d x %d = %d voxels\n", gridSize.x, gridSize.y, gridSize.z, numVoxels); printf("max verts = %d\n", maxVerts); #if SAMPLE_VOLUME // load volume data char* path = cutFindFilePath(volumeFilename, argv[0]); if (path == 0) { fprintf(stderr, "Error finding file '%s'\n", volumeFilename); cudaThreadExit(); exit(EXIT_FAILURE); } int size = gridSize.x*gridSize.y*gridSize.z*sizeof(uchar); uchar *volume = loadRawFile(path, size); cutilSafeCall(cudaMalloc((void**) &d_volume, size)); cutilSafeCall(cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice) ); free(volume); bindVolumeTexture(d_volume); #endif if (g_bQAReadback) { cudaMalloc((void **)&(d_pos), maxVerts*sizeof(float)*4); cudaMalloc((void **)&(d_normal), maxVerts*sizeof(float)*4); } else { // create VBOs createVBO(&posVbo, maxVerts*sizeof(float)*4); // DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(posVbo) ); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_posvbo_resource, posVbo, cudaGraphicsMapFlagsWriteDiscard)); createVBO(&normalVbo, maxVerts*sizeof(float)*4); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(normalVbo)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_normalvbo_resource, normalVbo, cudaGraphicsMapFlagsWriteDiscard)); } // allocate textures allocateTextures( &d_edgeTable, &d_triTable, &d_numVertsTable ); // allocate device memory unsigned int memSize = sizeof(uint) * numVoxels; cutilSafeCall(cudaMalloc((void**) &d_voxelVerts, memSize)); cutilSafeCall(cudaMalloc((void**) &d_voxelVertsScan, memSize)); cutilSafeCall(cudaMalloc((void**) &d_voxelOccupied, memSize)); cutilSafeCall(cudaMalloc((void**) &d_voxelOccupiedScan, memSize)); cutilSafeCall(cudaMalloc((void**) &d_compVoxelArray, memSize)); // initialize CUDPP scan CUDPPConfiguration config; config.algorithm = CUDPP_SCAN; config.datatype = CUDPP_UINT; config.op = CUDPP_ADD; config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE; cudppPlan(&scanplan, config, numVoxels, 1, 0); }
int main(int argc, char** argv) { // EDISON ////////////////////////////////////////////////////////////////// sigmaS = 7.0f; sigmaR = 6.5f; edison.minRegion = 20.0f; cutLoadPPMub("image.ppm", &edison.inputImage_, &width, &height); edison.meanShift(); cutSavePPMub("segmimage.ppm", edison.segmImage_, width, height); cutSavePPMub("filtimage.ppm", edison.filtImage_, width, height); unsigned char data[height * width]; memset(data, 0, height * width * sizeof(unsigned char)); for(int i = 0; i < edison.numBoundaries_; i++) { data[edison.boundaries_[i]] = 255; } cutSavePGMub("bndyimage.pgm", data, width, height); //return 0; // EDISON ////////////////////////////////////////////////////////////////// if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); } int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if(!strncmp( "Tesla", prop.name, 5 )) { printf("This sample needs a card capable of OpenGL and display.\n"); printf("Please choose a different device with the -device=x argument.\n"); cudaThreadExit(); cutilExit(argc, argv); } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); glutIdleFunc(idle); if (g_bOpenGLQA) { loadDefaultImage( argv[0] ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename); } } else { loadDefaultImage( argv[0]); } // If code is not printing the USage, then we execute this path. if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display image\n"); printf("T: display Sobel edge detection (computed with tex)\n"); printf("S: display Sobel edge detection (computed with tex+shared memory)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); fflush(stdout); atexit(cleanup); glutMainLoop(); } } cudaThreadExit(); cutilExit(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv){ const unsigned int OPT_N_MAX = 512; unsigned int useDoublePrecision; printf("[binomialOptions]\n"); int devID = cutilDeviceInit(argc, argv); if (devID < 0) { printf("exiting...\n"); cutilExit(argc, argv); exit(0); } cutilSafeCall(cudaGetDevice(&devID)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devID)); char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) { useDoublePrecision = 0; } else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } printf(useDoublePrecision ? "Using double precision...\n" : "Using single precision...\n"); const int OPT_N = deviceEmulation() ? 1 : OPT_N_MAX; TOptionData optionData[OPT_N_MAX]; float callValueBS[OPT_N_MAX], callValueGPU[OPT_N_MAX], callValueCPU[OPT_N_MAX]; double sumDelta, sumRef, gpuTime, errorVal; unsigned int hTimer; int i; cutilCheckError( cutCreateTimer(&hTimer) ); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ printf("Double precision is not supported.\n"); return 0; } printf("Generating input data...\n"); //Generate options set srand(123); for(i = 0; i < OPT_N; i++){ optionData[i].S = randData(5.0f, 30.0f); optionData[i].X = randData(1.0f, 100.0f); optionData[i].T = randData(0.25f, 10.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; BlackScholesCall(callValueBS[i], optionData[i]); } printf("Running GPU binomial tree...\n"); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); if(useDoublePrecision) binomialOptions_SM13(callValueGPU, optionData, OPT_N); else binomialOptions_SM10(callValueGPU, optionData, OPT_N); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStopTimer(hTimer) ); gpuTime = cutGetTimerValue(hTimer); printf("Options count : %i \n", OPT_N); printf("Time steps : %i \n", NUM_STEPS); printf("binomialOptionsGPU() time: %f msec\n", gpuTime); printf("Options per second : %f \n", OPT_N / (gpuTime * 0.001)); printf("Running CPU binomial tree...\n"); for(i = 0; i < OPT_N; i++) binomialOptionsCPU(callValueCPU[i], optionData[i]); printf("Comparing the results...\n"); sumDelta = 0; sumRef = 0; printf("GPU binomial vs. Black-Scholes\n"); for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i] - callValueGPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. Black-Scholes\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i]- callValueCPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. GPU binomial\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueGPU[i] - callValueCPU[i]); sumRef += callValueCPU[i]; } if(sumRef > 1E-5) printf("L1 norm: %E\n", errorVal = sumDelta / sumRef); else printf("Avg. diff: %E\n", errorVal = sumDelta / (double)OPT_N); printf("Shutting down...\n"); printf("\n[binomialOptions] - Test Summary:\n"); printf((errorVal < 5e-4) ? "PASSED\n" : "FAILED\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); cutilExit(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noqatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } } printf("[%s] ", sSDKsample); if (g_bQAReadback) printf("(Automated Testing)\n"); if (g_bOpenGLQA) printf("(OpenGL Readback)\n"); // Get the path of the filename char *filename; if (cutGetCmdLineArgumentstr(argc, (const char**) argv, "image", &filename)) { image_filename = filename; } // load image char* image_path = cutFindFilePath(image_filename, argv[0]); if (image_path == 0) { fprintf(stderr, "Error finding image file '%s'\n", image_filename); cudaThreadExit(); exit(EXIT_FAILURE); } cutilCheckError( cutLoadPPM4ub(image_path, (unsigned char **) &h_img, &width, &height)); if (!h_img) { printf("Error opening file '%s'\n", image_path); cudaThreadExit(); exit(-1); } printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); cutGetCmdLineArgumenti(argc, (const char**) argv, "threads", &nthreads); cutGetCmdLineArgumentf(argc, (const char**) argv, "sigma", &sigma); runBenchmark = cutCheckCmdLineFlag(argc, (const char**) argv, "bench"); int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n"); // runBenchmark = CUTTrue; g_bQAReadback = true; } // Benchmark or AutoTest mode detected, no OpenGL if (runBenchmark == CUTTrue || g_bQAReadback) { if( cutCheckCmdLineFlag( argc, (const char **)argv, "device" ) ) cutilDeviceInit( argc, argv ); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(argc, argv); if( cutCheckCmdLineFlag( argc, (const char **)argv, "device" ) ) cutilGLDeviceInit( argc, argv ); else cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } initCudaBuffers(); if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } if (g_bQAReadback) { // This is the automated testing path g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); runAutoTest(argc, argv); cleanup(); cudaThreadExit(); cutilExit(argc, argv); } if (runBenchmark) { benchmark(100); cleanup(); cudaThreadExit(); exit(0); } initGLBuffers(); atexit(cleanup); glutMainLoop(); cudaThreadExit(); cutilExit(argc, argv); }