void benchmark(int iterations) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); cutilSafeCall( cudaMalloc( (void**) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStartTimer( timer)); // execute the kernel for(int i=0; i<iterations; i++) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStopTimer( timer)); // check if kernel execution generated an error cutilCheckMsg("Kernel execution failed"); printf("Processing time: %f (ms)\n", cutGetTimerValue( timer)); printf("%.2f Mpixels/sec\n", (width*height*iterations / (cutGetTimerValue( timer) / 1000.0f)) / 1e6); cutilSafeCall(cudaFree(d_result)); }
void idle() { float time = cutGetTimerValue(timer); if (animate) { animTime += (time - prevTime) * animationRate; } glutPostRedisplay(); prevTime = time; }
void runAutoTest(int argc, char **argv) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); cutilSafeCall( cudaMalloc( (void**) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStartTimer( timer)); while (sigma <= 22) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); // check if kernel execution generated an error cutilCheckMsg("Kernel execution failed"); cudaMemcpy(g_CheckRender->imageData(), d_result, width*height*4, cudaMemcpyDeviceToHost); g_CheckRender->savePPM(sOriginal[g_Index], false, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.50f)) { g_TotalErrors++; } g_Index++; sigma += 4; } cutilCheckError( cutStopTimer( timer)); printf("Processing time: %f (ms)\n", cutGetTimerValue( timer)); printf("%.2f Mpixels/sec\n", (width*height*g_Index / (cutGetTimerValue( timer) / 1000.0f)) / 1e6); printf("Summary: %d errors!\n", g_TotalErrors); printf("Test %s!\n", (g_TotalErrors==0) ? "PASSED" : "FAILED"); cutilSafeCall(cudaFree(d_result)); }
void disp(void){ glClear(GL_COLOR_BUFFER_BIT); update_phi(); its++; if(its<ITERATIONS){ glutPostRedisplay(); if(its%50==0){ printf("Iteration %3d Total Time: %3.2f ReInit Time: %3.2f\n", its, 0.001*cutGetTimerValue(Timer), 0.001*cutGetTimerValue(ReInitTimer)); cutStartTimer(ReInitTimer); // ReInit Timer Start reinit_phi(); // ReInit glDrawPixels(imageW, imageH, GL_GREEN, GL_FLOAT, phi); glutSwapBuffers(); cutStopTimer(ReInitTimer); // ReInit Timer Stop } } else { printf("Iteration %3d Total Time: %3.2f ReInit Time: %3.2f\n", its, 0.001*cutGetTimerValue(Timer), 0.001*cutGetTimerValue(ReInitTimer)); glDrawPixels(imageW, imageH, GL_GREEN, GL_FLOAT, phi); glutSwapBuffers(); } }
int main(int argc, char** argv) { printHeader("Initializare"); initCUDA(); init(); printHeader("Calcul CPU"); cutilCheckError(cutStartTimer(timer)); // Calculeaza sampleul de control - CPU printf("Asteptati: Se calculeaza controlul pe CPU ... "); computeControl(); printf("DONE\n"); float time = cutGetTimerValue(timer); printf("Timp de calcul pe CPU = %f milisecunde\n",time); cutilCheckError(cutResetTimer(timer)); printHeader("Calcul CUDA"); // Se calculeaza pe CUDA printf("Asteptati: Se calculeaza pe CUDA ... "); runCUDA(); printf("DONE\n"); time = cutGetTimerValue(timer); printf("Timp de calcul pe GPU = %f milisecunde\n",time); printHeader("Verificare calcule"); // Se verifica daca s-a calculat corect pe CUDA printf("Se verifica daca rezultatul pe CUDA corespunde cu rezultatul pe CPU : "); verificaCalcule(); printHeader(""); cleanup(); printf("Apasa ENTER pentru a termina programul\n"); getchar(); return 0; }
void runBenchmark(int iterations) { cutilCheckError(cutStartTimer(timer)); for (int i = 0; i < iterations; ++i) { psystem->update(timestep); } cutilCheckError(cutStopTimer(timer)); float milliseconds = cutGetTimerValue(timer); printf("%d particles, total time for %d iterations: %0.3f ms\n", numParticles, iterations, milliseconds); printf("Test PASSED\n"); }
void _runBenchmark(int iterations) { // once without timing to prime the device if (!useCpu) m_nbody->update(activeParams.m_timestep); if (useCpu) { cutCreateTimer(&timer); cutStartTimer(timer); } else { cutilSafeCall(cudaEventRecord(startEvent, 0)); } for (int i = 0; i < iterations; ++i) { m_nbody->update(activeParams.m_timestep); } float milliseconds = 0; if (useCpu) { cutStopTimer(timer); milliseconds = cutGetTimerValue(timer); cutDeleteTimer(timer); } else { cutilSafeCall(cudaEventRecord(stopEvent, 0)); cutilSafeCall(cudaEventSynchronize(stopEvent)); cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); shrLog("%d bodies, total time for %d iterations: %.3f ms\n", numBodies, iterations, milliseconds); shrLog("= %.3f billion interactions per second\n", interactionsPerSecond); shrLog("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction); }
//////////////////////////////////////////////////////////////////////////////// //! Run test //////////////////////////////////////////////////////////////////////////////// void runAutoTest(int argc, char** argv) { printf("[%s]\n", sSDKsample); // Cuda init int dev = cutilChooseCudaDevice(argc, argv); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, dev)); printf("Compute capability %d.%d\n", deviceProp.major, deviceProp.minor); int version = deviceProp.major*10 + deviceProp.minor; g_hasDouble = (version >= 13); if (inEmulationMode()) { // workaround since SM13 kernel doesn't produce correct output in emulation mode g_hasDouble = false; } // create FFT plan CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) ); // allocate memory fftInputW = (meshW / 2)+1; fftInputH = meshH; fftInputSize = (fftInputW*fftInputH)*sizeof(float2); cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) ); cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) ); h_h0 = (float2 *) malloc(fftInputSize); generate_h0(); cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) ); cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) ); cutCreateTimer(&timer); cutStartTimer(timer); prevTime = cutGetTimerValue(timer); // Creating the Auto-Validation Code g_CheckRender = new CheckBackBuffer(windowH, windowH, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); runCudaTest(g_hasDouble); cudaThreadExit(); }
int main(int argc, char **argv) { char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } const int MAX_GPU_COUNT = 8; const int OPT_N = 256; const int PATH_N = 1 << 18; const unsigned int SEED = 777; //Input data array TOptionData optionData[OPT_N]; //Final GPU MC results TOptionValue callValueGPU[OPT_N]; //"Theoretical" call values by Black-Scholes formula float callValueBS[OPT_N]; //Solver config TOptionPlan optionSolver[MAX_GPU_COUNT]; //OS thread ID CUTThread threadID[MAX_GPU_COUNT]; //GPU number present in the system int GPU_N; int gpuBase, gpuIndex; int i; //Timer unsigned int hTimer; float time; double delta, ref, sumDelta, sumRef, sumReserve; cutilSafeCall( cudaGetDeviceCount(&GPU_N) ); cutilCheckError( cutCreateTimer(&hTimer) ); #ifdef _EMU GPU_N = 1; #endif printf("main(): generating input data...\n"); srand(123); for(i = 0; i < OPT_N; i++) { optionData[i].S = randFloat(5.0f, 50.0f); optionData[i].X = randFloat(10.0f, 25.0f); optionData[i].T = randFloat(1.0f, 5.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; callValueGPU[i].Expected = -1.0f; callValueGPU[i].Confidence = -1.0f; } printf("main(): starting %i host threads...\n", GPU_N); //Get option count for each GPU for(i = 0; i < GPU_N; i++) optionSolver[i].optionCount = OPT_N / GPU_N; //Take into account cases with "odd" option counts for(i = 0; i < (OPT_N % GPU_N); i++) optionSolver[i].optionCount++; //Assign GPU option ranges gpuBase = 0; for(i = 0; i < GPU_N; i++) { optionSolver[i].device = i; optionSolver[i].optionData = optionData + gpuBase; optionSolver[i].callValue = callValueGPU + gpuBase; optionSolver[i].seed = SEED; optionSolver[i].pathN = PATH_N; gpuBase += optionSolver[i].optionCount; } //Start the timer cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); //Start CPU thread for each GPU for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); //Stop the timer cutilCheckError( cutStopTimer(hTimer) ); time = cutGetTimerValue(hTimer); printf("main(): waiting for GPU results...\n"); cutWaitForThreads(threadID, GPU_N); printf("main(): GPU statistics\n"); for(i = 0; i < GPU_N; i++) { printf("GPU #%i\n", optionSolver[i].device); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); } printf("\nTotal time (ms.): %f\n", time); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); #ifdef DO_CPU printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++) { MonteCarloCPU( callValueCPU, optionData[i], NULL, PATH_N ); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); ref = callValueCPU.Expected; sumDelta += delta; sumRef += fabs(ref); printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } printf("L1 norm: %E\n", sumDelta / sumRef); #endif printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for(i = 0; i < OPT_N; i++) { BlackScholesCall( callValueBS[i], optionData[i] ); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta; #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; printf("L1 norm : %E\n", sumDelta / sumRef); printf("Average reserve: %f\n", sumReserve); printf((sumReserve > 1.0f) ? "PASSED\n" : "FAILED.\n"); printf("Shutting down...\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cutilExit(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// //! Run test //////////////////////////////////////////////////////////////////////////////// void runGraphicsTest(int argc, char** argv) { printf("[%s] ", sSDKsample); if (g_bOpenGLQA) printf("[OpenGL Readback Comparisons] "); printf("\n"); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device") ) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if(CUTFalse == initGL( &argc, argv )) { cudaThreadExit(); return; } cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); // create FFT plan CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) ); // allocate memory fftInputW = (meshW / 2)+1; fftInputH = meshH; fftInputSize = (fftInputW*fftInputH)*sizeof(float2); cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) ); cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) ); h_h0 = (float2 *) malloc(fftInputSize); generate_h0(); cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) ); cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) ); cutCreateTimer(&timer); cutStartTimer(timer); prevTime = cutGetTimerValue(timer); // create vertex buffers and register with CUDA createVBO(&heightVertexBuffer, meshW*meshH*sizeof(float)); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(heightVertexBuffer)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_heightVB_resource, heightVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); createVBO(&slopeVertexBuffer, meshW*meshH*sizeof(float2)); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(slopeVertexBuffer)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_slopeVB_resource, slopeVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); // create vertex and index buffer for mesh createMeshPositionVBO(&posVertexBuffer, meshW, meshH); createMeshIndexBuffer(&indexBuffer, meshW, meshH); // Creating the Auto-Validation Code if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(windowH, windowH, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } runCuda(); // register callbacks glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); // start rendering mainloop glutMainLoop(); cudaThreadExit(); }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv){ const unsigned int OPT_N_MAX = 512; unsigned int useDoublePrecision; printf("[binomialOptions]\n"); int devID = cutilDeviceInit(argc, argv); if (devID < 0) { printf("exiting...\n"); cutilExit(argc, argv); exit(0); } cutilSafeCall(cudaGetDevice(&devID)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devID)); char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) { useDoublePrecision = 0; } else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } printf(useDoublePrecision ? "Using double precision...\n" : "Using single precision...\n"); const int OPT_N = deviceEmulation() ? 1 : OPT_N_MAX; TOptionData optionData[OPT_N_MAX]; float callValueBS[OPT_N_MAX], callValueGPU[OPT_N_MAX], callValueCPU[OPT_N_MAX]; double sumDelta, sumRef, gpuTime, errorVal; unsigned int hTimer; int i; cutilCheckError( cutCreateTimer(&hTimer) ); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ printf("Double precision is not supported.\n"); return 0; } printf("Generating input data...\n"); //Generate options set srand(123); for(i = 0; i < OPT_N; i++){ optionData[i].S = randData(5.0f, 30.0f); optionData[i].X = randData(1.0f, 100.0f); optionData[i].T = randData(0.25f, 10.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; BlackScholesCall(callValueBS[i], optionData[i]); } printf("Running GPU binomial tree...\n"); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); if(useDoublePrecision) binomialOptions_SM13(callValueGPU, optionData, OPT_N); else binomialOptions_SM10(callValueGPU, optionData, OPT_N); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStopTimer(hTimer) ); gpuTime = cutGetTimerValue(hTimer); printf("Options count : %i \n", OPT_N); printf("Time steps : %i \n", NUM_STEPS); printf("binomialOptionsGPU() time: %f msec\n", gpuTime); printf("Options per second : %f \n", OPT_N / (gpuTime * 0.001)); printf("Running CPU binomial tree...\n"); for(i = 0; i < OPT_N; i++) binomialOptionsCPU(callValueCPU[i], optionData[i]); printf("Comparing the results...\n"); sumDelta = 0; sumRef = 0; printf("GPU binomial vs. Black-Scholes\n"); for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i] - callValueGPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. Black-Scholes\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i]- callValueCPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. GPU binomial\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueGPU[i] - callValueCPU[i]); sumRef += callValueCPU[i]; } if(sumRef > 1E-5) printf("L1 norm: %E\n", errorVal = sumDelta / sumRef); else printf("Avg. diff: %E\n", errorVal = sumDelta / (double)OPT_N); printf("Shutting down...\n"); printf("\n[binomialOptions] - Test Summary:\n"); printf((errorVal < 5e-4) ? "PASSED\n" : "FAILED\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); cutilExit(argc, argv); }
int main(int argc, char **argv) { GpuProfiling::initProf(); // Start logs shrSetLogFileName ("scan.txt"); shrLog("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; uint hTimer; const uint N = 13 * 1048576 / 2; shrLog("Allocating and initializing host arrays...\n"); cutCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_Input[i] = rand(); shrLog("Allocating and initializing CUDA arrays...\n"); cutilSafeCall( cudaMalloc((void **)&d_Input, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_Output, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice) ); shrLog("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; const int iCycles = 100; shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize()); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } shrLog("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize() ); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } // pass or fail (cumulative... all tests in the loop) shrLog(globalFlag ? "PASSED\n\n" : "FAILED\n\n"); GpuProfiling::printResults(); shrLog("Shutting down...\n"); closeScan(); cutilSafeCall( cudaFree(d_Output)); cutilSafeCall( cudaFree(d_Input)); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); exit(0); shrEXIT(argc, (const char**)argv); }
int main(int argc, char **argv) { uchar *h_Data; uint *h_HistogramCPU, *h_HistogramGPU; uchar *d_Data; uint *d_Histogram; uint hTimer; int PassFailFlag = 1; uint byteCount = 64 * 1048576; uint uiSizeMult = 1; cudaDeviceProp deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("histogram.txt"); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { dev = cutilDeviceInit(argc, argv); if (dev < 0) { printf("No CUDA Capable Devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } } else { cudaSetDevice( dev = cutGetMaxGflopsDeviceId() ); cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) ); } cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) ); printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = deviceProp.major * 0x10 + deviceProp.minor; if(version < 0x11) { printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } cutilCheckError(cutCreateTimer(&hTimer)); // Optional Command-line multiplier to increase size of array to histogram if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) { uiSizeMult = CLAMP(uiSizeMult, 1, 10); byteCount *= uiSizeMult; } shrLog("Initializing data...\n"); shrLog("...allocating CPU memory.\n"); h_Data = (uchar *)malloc(byteCount); h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); shrLog("...generating input data\n"); srand(2009); for(uint i = 0; i < byteCount; i++) h_Data[i] = rand() % 256; shrLog("...allocating GPU memory and copying input data\n\n"); cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount ) ); cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) ); cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) ); { shrLog("Starting up 64-bin histogram...\n\n"); initHistogram64(); shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram64(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram64CPU()\n"); histogram64CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results...\n"); for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 64-bin histogram...\n\n\n"); closeHistogram64(); } { shrLog("Initializing 256-bin histogram...\n"); initHistogram256(); shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram256(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram256CPU()\n"); histogram256CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results\n"); for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 256-bin histogram...\n\n\n"); closeHistogram256(); } shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); cutilSafeCall( cudaFree(d_Histogram) ); cutilSafeCall( cudaFree(d_Data) ); free(h_HistogramGPU); free(h_HistogramCPU); free(h_Data); cutilDeviceReset(); shrLog("%s - Test Summary\n", sSDKsample); // pass or fail (for both 64 bit and 256 bit histograms) shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED)); }
void display() { static double gflops = 0; static double ifps = 0; static double interactionsPerSecond = 0; // update the simulation if (!bPause) { if (cycleDemo && (cutGetTimerValue(demoTimer) > demoTime)) { activeDemo = (activeDemo + 1) % numDemos; selectDemo(activeDemo); } updateSimulation(); if (!useCpu) cudaEventRecord(hostMemSyncEvent, 0); // insert an event to wait on before rendering } glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); if (displayEnabled) { // view transform { glMatrixMode(GL_MODELVIEW); glLoadIdentity(); for (int c = 0; c < 3; ++c) { camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; } glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); } displayNBodySystem(); // display user interface if (bShowSliders) { glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); paramlist->Render(0, 0); glDisable(GL_BLEND); } if (bFullscreen) { beginWinCoords(); char msg0[256], msg1[256], msg2[256]; if (bDispInteractions) { sprintf(msg1, "%0.2f billion interactions per second", interactionsPerSecond); } else { sprintf(msg1, "%0.2f GFLOP/s", gflops); } sprintf(msg0, "%s", deviceName); sprintf(msg2, "%0.2f FPS [%s | %d bodies]", ifps, fp64 ? "double precision" : "single precision", numBodies); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); glColor3f(0.46f, 0.73f, 0.0f); glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 122, msg0, GLUT_BITMAP_TIMES_ROMAN_24); glColor3f(1.0f, 1.0f, 1.0f); glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 96, msg2, GLUT_BITMAP_TIMES_ROMAN_24); glColor3f(1.0f, 1.0f, 1.0f); glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 70, msg1, GLUT_BITMAP_TIMES_ROMAN_24); glDisable(GL_BLEND); endWinCoords(); } glutSwapBuffers(); } fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float milliseconds = 1; // stop timer if (useCpu) { milliseconds = cutGetTimerValue(timer); cutilCheckError(cutResetTimer(timer)); } else { cutilSafeCall(cudaEventRecord(stopEvent, 0)); cutilSafeCall(cudaEventSynchronize(stopEvent)); cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } milliseconds /= (float)fpsCount; computePerfStats(interactionsPerSecond, gflops, milliseconds, 1); ifps = 1.f / (milliseconds / 1000.f); sprintf(fps, "CUDA N-Body (%d bodies): " "%0.1f fps | %0.1f BIPS | %0.1f GFLOP/s | %s", numBodies, ifps, interactionsPerSecond, gflops, fp64 ? "double precision" : "single precision"); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? (int)ifps : 1; if (bPause) fpsLimit = 0; // restart timer if (!useCpu) { cutilSafeCall(cudaEventRecord(startEvent, 0)); } } glutReportErrors(); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv){ uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; uint hTimer; const uint N = 4 * 1048576; const uint DIR = 1; const uint numValues = 65536; printf("Allocating and initializing host arrays...\n\n"); cutCreateTimer(&hTimer); h_SrcKey = (uint *)malloc(N * sizeof(uint)); h_SrcVal = (uint *)malloc(N * sizeof(uint)); h_DstKey = (uint *)malloc(N * sizeof(uint)); h_DstVal = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_SrcKey[i] = rand() % numValues; fillValues(h_SrcVal, N); printf("Allocating and initializing CUDA arrays...\n\n"); cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) ); printf("Initializing GPU merge sort...\n"); initMergeSort(); printf("Running GPU merge sort...\n"); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); mergeSort( d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR ); cutilSafeCall( cudaThreadSynchronize() ); cutStopTimer(hTimer); printf("Time: %f ms\n", cutGetTimerValue(hTimer)); printf("Reading back GPU merge sort results...\n"); cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) ); cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) ); printf("Inspecting the results...\n"); uint keysFlag = validateSortedKeys( h_DstKey, h_SrcKey, 1, N, numValues, DIR ); uint valuesFlag = validateSortedValues( h_DstKey, h_DstVal, h_SrcKey, 1, N ); printf( (keysFlag && valuesFlag) ? "TEST PASSED\n" : "TEST FAILED\n"); printf("Shutting down...\n"); closeMergeSort(); cutilCheckError( cutDeleteTimer(hTimer) ); cutilSafeCall( cudaFree(d_SrcVal) ); cutilSafeCall( cudaFree(d_SrcKey) ); cutilSafeCall( cudaFree(d_BufVal) ); cutilSafeCall( cudaFree(d_BufKey) ); cutilSafeCall( cudaFree(d_DstVal) ); cutilSafeCall( cudaFree(d_DstKey) ); free(h_DstVal); free(h_DstKey); free(h_SrcVal); free(h_SrcKey); cudaThreadExit(); cutilExit(argc, argv); }
void displayFunc(void){ cutStartTimer(hTimer); TColor *d_dst = NULL; size_t num_bytes; if(frameCounter++ == 0) cutResetTimer(hTimer); // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); cutilCheckMsg("cudaGraphicsMapResources failed"); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &num_bytes, cuda_pbo_resource)); cutilCheckMsg("cudaGraphicsResourceGetMappedPointer failed"); cutilSafeCall( CUDA_Bind2TextureArray() ); runImageFilters(d_dst); cutilSafeCall( CUDA_UnbindTexture() ); // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(gl_PBO)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); if (g_bFBODisplay) { g_FrameBufferObject->bindRenderPath(); } // Common display code path { glClear(GL_COLOR_BUFFER_BIT); glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0) ); glBegin(GL_TRIANGLES); glTexCoord2f(0, 0); glVertex2f(-1, -1); glTexCoord2f(2, 0); glVertex2f(+3, -1); glTexCoord2f(0, 2); glVertex2f(-1, +3); glEnd(); glFinish(); } if (g_bFBODisplay) { g_FrameBufferObject->unbindRenderPath(); glBindTexture(GL_TEXTURE_2D, 0); } if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { printf("> (Frame %d) readback BackBuffer\n", frameCount); if (g_bFBODisplay) { g_CheckRender->readback( imageW, imageH, g_FrameBufferObject->getFbo() ); } else { g_CheckRender->readback( imageW, imageH ); } g_CheckRender->savePPM ( sOriginal[g_Kernel], true, NULL ); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Verify = false; } if(frameCounter == frameN){ frameCounter = 0; if(g_FPS){ printf("FPS: %3.1f\n", frameN / (cutGetTimerValue(hTimer) * 0.001) ); g_FPS = false; } } glutSwapBuffers(); cutStopTimer(hTimer); computeFPS(); }
// Host code int main(int argc, char** argv) { ParseArguments(argc, argv); float s_SobelMatrix[25]; s_SobelMatrix[0] = 1; s_SobelMatrix[1] = 2; s_SobelMatrix[2]= 0; s_SobelMatrix[3] = -2; s_SobelMatrix[4] = -1; s_SobelMatrix[5] = 4; s_SobelMatrix[6] = 8; s_SobelMatrix[7] = 0; s_SobelMatrix[8] = -8; s_SobelMatrix[9] = -4; s_SobelMatrix[10] = 6; s_SobelMatrix[11] = 12; s_SobelMatrix[12] = 0; s_SobelMatrix[13] = -12; s_SobelMatrix[14] = -6; s_SobelMatrix[15] = 4; s_SobelMatrix[16] = 8; s_SobelMatrix[17] = 0; s_SobelMatrix[18] = -8; s_SobelMatrix[19] =-4; s_SobelMatrix[20] =1; s_SobelMatrix[21] =2; s_SobelMatrix[22] =0; s_SobelMatrix[23] =-2; s_SobelMatrix[24] =-1; unsigned char *palete = NULL; unsigned char *data = NULL, *out = NULL; PPMImage *input_image=NULL, *output_image=NULL; output_image = (PPMImage *)malloc(sizeof(PPMImage)); input_image = readPPM(PPMInFileL); printf("Running %s filter\n", Filter); out = (unsigned char *)malloc(); printf("Computing the CPU output\n"); printf("Image details: %d by %d = %d , imagesize = %d\n", input_image->x, input_image->y, input_image->x * input_image->y, input_image->x * input_image->y); cutilCheckError(cutStartTimer(time_CPU)); if(FilterMode == SOBEL_FILTER){ printf("Running Sobel\n"); CPU_Sobel(intput_image->data, output_image, input_image->x, input_image->y); } else if(FilterMode == HIGH_BOOST_FILTER){ printf("Running boost\n"); CPU_Boost(data, out, dib.width, dib.height); } cutilCheckError(cutStopTimer(time_CPU)); if(FilterMode == SOBEL_FILTER || FilterMode == SOBEL_FILTER5) BitMapWrite("CPU_sobel.bmp", &bmp, &dib, out, palete); else if(FilterMode == AVERAGE_FILTER) BitMapWrite("CPU_average.bmp", &bmp, &dib, out, palete); else if(FilterMode == HIGH_BOOST_FILTER) BitMapWrite("CPU_boost.bmp", &bmp, &dib, out, palete); printf("Done with CPU output\n"); printf("CPU execution time %f \n", cutGetTimerValue(time_CPU)); printf("Allocating %d bytes for image \n", dib.image_size); cutilSafeCall( cudaMalloc( (void **)&d_In, dib.image_size*sizeof(unsigned char)) ); cutilSafeCall( cudaMalloc( (void **)&d_Out, dib.image_size*sizeof(unsigned char)) ); // creating space for filter matrix cutilSafeCall( cudaMalloc( (void **)&sobel_matrix, 25*sizeof(float)) ); cutilCheckError(cutStartTimer(time_mem)); cudaMemcpy(d_In, data, dib.image_size*sizeof(unsigned char), cudaMemcpyHostToDevice); cudaMemcpy(sobel_matrix, s_SobelMatrix, 25*sizeof(float), cudaMemcpyHostToDevice); cutilCheckError(cutStopTimer(time_mem)); FilterWrapper(data, dib.width, dib.height); // Copy image back to host cutilCheckError(cutStartTimer(time_mem)); cudaMemcpy(out, d_Out, dib.image_size*sizeof(unsigned char), cudaMemcpyDeviceToHost); cutilCheckError(cutStopTimer(time_mem)); printf("GPU execution time %f Memtime %f \n", cutGetTimerValue(time_GPU), cutGetTimerValue(time_mem)); printf("Total GPU = %f \n", (cutGetTimerValue(time_GPU) + cutGetTimerValue(time_mem))); // Write output image BitMapWrite(BMPOutFile, &bmp, &dib, out, palete); Cleanup(); }
void display() { // update the simulation if (!bPause) { if (cycleDemo && (cutGetTimerValue(demoTimer) > demoTime)) { activeDemo = (activeDemo + 1) % numDemos; selectDemo(activeDemo); } nbody->update(activeParams.m_timestep); renderer->setPBO(nbody->getCurrentReadBuffer(), nbody->getNumBodies()); } glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); if (displayEnabled) { // view transform glMatrixMode(GL_MODELVIEW); glLoadIdentity(); for (int c = 0; c < 3; ++c) { camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; } glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); renderer->setSpriteSize(activeParams.m_pointSize); renderer->display(displayMode); // display user interface. if (bShowSliders) { glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); paramlist->Render(0, 0); glDisable(GL_BLEND); } glutSwapBuffers(); } fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float milliseconds = 0; // stop timer /* cutilSafeCall(cudaEventRecord(stopEvent, 0)); cutilSafeCall(cudaEventSynchronize(stopEvent)); cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); */ gettimeofday(&stopTime, 0); milliseconds = (float)(stopTime.tv_sec - startTime.tv_sec) * 1000.0f + (float)(stopTime.tv_usec - startTime.tv_usec) / 1000.0f; milliseconds /= (float)fpsCount; double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, 1); float ifps = 1.f / (milliseconds / 1000.f); sprintf(fps, "CUDA N-Body (%d bodies): " "%0.1f fps | %0.1f BIPS | %0.1f GFLOP/s", numBodies, ifps, interactionsPerSecond, gflops); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? (int)ifps : 1; if (bPause) fpsLimit = 0; // restart timer //cutilSafeCall(cudaEventRecord(startEvent, 0)); gettimeofday(&startTime, 0); } glutReportErrors(); }
void quickshift(image_t im, float sigma, float tau, float * map, float * gaps, float * E) { int verb = 1 ; float *M = 0, *n = 0; float tau2; int K, d; int N1,N2, i1,i2, j1,j2, R, tR; int medoid = 0 ; float const * I = im.I; N1 = im.N1; N2 = im.N2; K = im.K; d = 2 + K ; /* Total dimensions include spatial component (x,y) */ tau2 = tau*tau; if (medoid) { /* n and M are only used in mediod shift */ M = (float *) calloc(N1*N2*d, sizeof(float)) ; n = (float *) calloc(N1*N2, sizeof(float)) ; } R = (int) ceil (3 * sigma) ; tR = (int) ceil (tau) ; if (verb) { printf("quickshift: [N1,N2,K]: [%d,%d,%d]\n", N1,N2,K) ; printf("quickshift: type: %s\n", medoid ? "medoid" : "quick"); printf("quickshift: sigma: %g\n", sigma) ; /* R is ceil(3 * sigma) and determines the window size to accumulate * similarity */ printf("quickshift: R: %d\n", R) ; printf("quickshift: tau: %g\n", tau) ; printf("quickshift: tR: %d\n", tR) ; } /* ----------------------------------------------------------------- * n * -------------------------------------------------------------- */ /* If we are doing medoid shift, initialize n to the inner product of the * image with itself */ if (n) { for (i2 = 0 ; i2 < N2 ; ++ i2) { for (i1 = 0 ; i1 < N1 ; ++ i1) { n [i1 + N1 * i2] = inner(I,N1,N2,K, i1,i2, i1,i2) ; } } } unsigned int Etimer; cutilCheckError( cutCreateTimer(&Etimer) ); cutilCheckError( cutResetTimer(Etimer) ); cutilCheckError( cutStartTimer(Etimer) ); /* ----------------------------------------------------------------- * E = - [oN'*F]', M * -------------------------------------------------------------- */ /* D_ij = d(x_i,x_j) E_ij = exp(- .5 * D_ij / sigma^2) ; F_ij = - E_ij E_i = sum_j E_ij M_di = sum_j X_j F_ij E is the parzen window estimate of the density 0 = dissimilar to everything, windowsize = identical */ for (i2 = 0 ; i2 < N2 ; ++ i2) { for (i1 = 0 ; i1 < N1 ; ++ i1) { float Ei = 0; int j1min = VL_MAX(i1 - R, 0 ) ; int j1max = VL_MIN(i1 + R, N1-1) ; int j2min = VL_MAX(i2 - R, 0 ) ; int j2max = VL_MIN(i2 + R, N2-1) ; /* For each pixel in the window compute the distance between it and the * source pixel */ for (j2 = j2min ; j2 <= j2max ; ++ j2) { for (j1 = j1min ; j1 <= j1max ; ++ j1) { float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ; /* Make distance a similarity */ float Fij = exp(- Dij / (2*sigma*sigma)) ; /* E is E_i above */ Ei += Fij; if (M) { /* Accumulate votes for the median */ int k ; M [i1 + N1*i2 + (N1*N2) * 0] += j1 * Fij ; M [i1 + N1*i2 + (N1*N2) * 1] += j2 * Fij ; for (k = 0 ; k < K ; ++k) { M [i1 + N1*i2 + (N1*N2) * (k+2)] += I [j1 + N1*j2 + (N1*N2) * k] * Fij ; } } } /* j1 */ } /* j2 */ /* Normalize */ E [i1 + N1 * i2] = Ei / ((j1max-j1min)*(j2max-j2min)); /*E [i1 + N1 * i2] = Ei ; */ } /* i1 */ } /* i2 */ cutilCheckError( cutStopTimer(Etimer) ); float ETime = cutGetTimerValue(Etimer); printf("ComputeE: %fms\n", ETime); unsigned int Ntimer; cutilCheckError( cutCreateTimer(&Ntimer) ); cutilCheckError( cutResetTimer(Ntimer) ); cutilCheckError( cutStartTimer(Ntimer) ); /* ----------------------------------------------------------------- * Find best neighbors * -------------------------------------------------------------- */ if (medoid) { /* Qij = - nj Ei - 2 sum_k Gjk Mik n is I.^2 */ /* medoid shift */ for (i2 = 0 ; i2 < N2 ; ++i2) { for (i1 = 0 ; i1 < N1 ; ++i1) { float sc_best = 0 ; /* j1/j2 best are the best indicies for each i */ float j1_best = i1 ; float j2_best = i2 ; int j1min = VL_MAX(i1 - R, 0 ) ; int j1max = VL_MIN(i1 + R, N1-1) ; int j2min = VL_MAX(i2 - R, 0 ) ; int j2max = VL_MIN(i2 + R, N2-1) ; for (j2 = j2min ; j2 <= j2max ; ++ j2) { for (j1 = j1min ; j1 <= j1max ; ++ j1) { float Qij = - n [j1 + j2 * N1] * E [i1 + i2 * N1] ; int k ; Qij -= 2 * j1 * M [i1 + i2 * N1 + (N1*N2) * 0] ; Qij -= 2 * j2 * M [i1 + i2 * N1 + (N1*N2) * 1] ; for (k = 0 ; k < K ; ++k) { Qij -= 2 * I [j1 + j2 * N1 + (N1*N2) * k] * M [i1 + i2 * N1 + (N1*N2) * (k + 2)] ; } if (Qij > sc_best) { sc_best = Qij ; j1_best = j1 ; j2_best = j2 ; } } } /* map_i is the linear index of j which is the best pair (in matlab * notation * gaps_i is the score of the best match */ map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /*+ 1 ; */ gaps[i1 + N1 * i2] = sc_best ; } } } else { /* Quickshift assigns each i to the closest j which has an increase in the * density (E). If there is no j s.t. Ej > Ei, then gaps_i == inf (a root * node in one of the trees of merges). */ for (i2 = 0 ; i2 < N2 ; ++i2) { for (i1 = 0 ; i1 < N1 ; ++i1) { float E0 = E [i1 + N1 * i2] ; float d_best = INF ; float j1_best = i1 ; float j2_best = i2 ; int j1min = VL_MAX(i1 - tR, 0 ) ; int j1max = VL_MIN(i1 + tR, N1-1) ; int j2min = VL_MAX(i2 - tR, 0 ) ; int j2max = VL_MIN(i2 + tR, N2-1) ; for (j2 = j2min ; j2 <= j2max ; ++ j2) { for (j1 = j1min ; j1 <= j1max ; ++ j1) { if (E [j1 + N1 * j2] > E0) { float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ; if (Dij <= tau2 && Dij < d_best) { d_best = Dij ; j1_best = j1 ; j2_best = j2 ; } } } } /* map is the index of the best pair */ /* gaps_i is the minimal distance, inf implies no Ej > Ei within * distance tau from the point */ map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /* + 1 ; */ if (map[i1 + N1 * i2] != i1 + N1 * i2) gaps[i1 + N1 * i2] = sqrt(d_best) ; else gaps[i1 + N1 * i2] = d_best; /* inf */ } } } if (M) free(M) ; if (n) free(n) ; cutilCheckError( cutStopTimer(Ntimer) ); float NTime = cutGetTimerValue(Ntimer); printf("ComputeN: %fms\n", NTime); }
int main(int argc, char **argv) { // Start logs shrSetLogFileName ("quasirandomGenerator.txt"); shrLog("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else{ if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU; float *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; unsigned int hTimer; if(sizeof(INT64) != 8){ shrLog("sizeof(INT64) != 8\n"); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); cutilCheckError(cutCreateTimer(&hTimer)); int deviceIndex; cutilSafeCall(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ shrLog("Double precision not supported.\n"); cudaThreadExit(); return 0; } shrLog("Allocating GPU memory...\n"); cutilSafeCall( cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)) ); shrLog("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); shrLog("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if(useDoublePrecision) initTable_SM13(tableCPU); else initTable_SM10(tableCPU); shrLog("Testing QRNG...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) quasirandomGenerator_SM13(d_Output, 0, N); else quasirandomGenerator_SM10(d_Output, 0, N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); shrLog("\nReading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for(dim = 0; dim < QRNG_DIMENSIONS; dim++) for(pos = 0; pos < N; pos++){ ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n", sumDelta / sumRef); shrLog("\nTesting inverseCNDgpu()...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); else inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); shrLog("Reading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; for(pos = 0; pos < QRNG_DIMENSIONS * N; pos++){ double p = (double)(pos + 1) / (double)(QRNG_DIMENSIONS * N + 1); ref = MoroInvCNDcpu(p); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); shrLog((L1norm < 1E-6) ? "PASSED\n\n" : "FAILED\n\n"); shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); free(h_OutputGPU); cutilSafeCall( cudaFree(d_Output) ); cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //start logs shrSetLogFileName ("volumeRender.txt"); shrLog("%s Starting...\n\n", argv[0]); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; fpsLimit = frameCheckNumber; } if (g_bQAReadback) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } /* int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { shrLog("This sample needs a card capable of OpenGL and display.\n"); shrLog("Please choose a different device with the -device=x argument.\n"); cutilExit(argc, argv); } */ } // parse arguments char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) { volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) { volumeSize.width = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) { volumeSize.height = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) { volumeSize.depth = n; } // load volume data char* path = shrFindFilePath(volumeFilename, argv[0]); if (path == 0) { shrLog("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); free(h_volume); cutilCheckError( cutCreateTimer( &timer)); shrLog("Press '=' and '-' to change density\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); uint *d_output; cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint))); cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // Start timer 0 and process n loops on the GPU int nIter = 10; for (int i = -1; i < nIter; i++) { if( i == 0 ) { cudaThreadSynchronize(); cutStartTimer(timer); } render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaThreadSynchronize(); cutStopTimer(timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0); shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); cutilCheckMsg("Error: render_kernel() execution FAILED"); cutilSafeCall( cudaThreadSynchronize() ); cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { shrLog("\nFAILED\n\n"); } else { shrLog("\nPASSED\n\n"); } cudaFree(d_output); freeCudaBuffers(); if (g_CheckRender) { delete g_CheckRender; g_CheckRender = NULL; } } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } atexit(cleanup); glutMainLoop(); } cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
int main(int, char* argv[]) { // In assignment 2.1, we will fix filter_radius as 1, // and therefore it's filter size is 3x3. const unsigned int filter_radius = 1; unsigned int width, height; unsigned char* h_img = NULL; unsigned char* your_result = NULL; unsigned char* gold_result = NULL; // This is the filename of the image file in PGM format // and the file should be put under the subdirectory, "data". const char* image_filename = "test.pgm"; loadImage(&h_img, &width, &height, image_filename, argv[0]); if (width <= 2*filter_radius || height <= 2*filter_radius) { fprintf(stderr, "Filter radius is too large.\n"); exit(-1); } your_result = (unsigned char*) malloc(width*height*sizeof(unsigned char)); gold_result = (unsigned char*) malloc(width*height*sizeof(unsigned char)); // Run your median filter { unsigned int timer = 0; cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutStartTimer(timer)); // You should implemnt medianFilter() in medianFilter_kernel.cu medianFilter(h_img, your_result, width, height, filter_radius); cutilCheckError(cutStopTimer(timer)); printf("[Yours] Processing time: %f (ms) \n", cutGetTimerValue(timer)); cutilCheckError(cutDeleteTimer(timer)); } // Run the reference median filter { unsigned int timer = 0; cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutStartTimer(timer)); medianFilter_gold(h_img, gold_result, width, height, filter_radius); cutilCheckError(cutStopTimer(timer)); printf("[Gold] Processing time: %f (ms) \n", cutGetTimerValue(timer)); cutilCheckError(cutDeleteTimer(timer)); } // You can use saveImage() to save the result. // Under Windows, you can use IrfanView (http://www.irfanview.com/) to view PGM files. // Or you can convert PGM into JPEG, PNG, and etc. // saveImage(your_result, width, height, "output.pgm"); // saveImage(gold_result, width, height, "gold.pgm"); // Compare your result and the reference result. { if (test(your_result, gold_result, width, height, filter_radius)) { printf("PASSED\n"); } else { printf("FAILED\n"); } } free(gold_result); free(your_result); return 0; }