void computeFPS() { fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f); sprintf(fps, "CUDA Bilateral Filter: %3.f fps (euclidean_delta=%.2f, gaussian_delta=%.2f, iterations=%.2f)", ifps, (double)euclidean_delta, (double)gaussian_delta, (double)iterations); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (int)MAX(ifps, 1.0f); cutilCheckError(cutResetTimer(timer)); } }
// Simple method to display the Frames Per Second in the window title void computeFPS() { static int fpsCount=0; static int fpsLimit=100; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "Cuda GL Interop Wrapper: %3.1f fps ", ifps); glutSetWindowTitle(fps); fpsCount = 0; cutilCheckError(cutResetTimer(timer)); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit-1) { g_Verify = true; } if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "%s %s (sigma=%4.2f): %3.1f fps", sSDKsample, ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "AutoTest: " : ""), sigma, ifps); glutSetWindowTitle(fps); fpsCount = 0; cutilCheckError(cutResetTimer(timer)); AutoQATest(); } }
void AutoQATest() { if (g_CheckRender && g_CheckRender->IsQAReadback()) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "%s %s (sigma=%4.2f): %3.1f fps", sSDKsample, ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "AutoTest: " : ""), sigma, ifps); glutSetWindowTitle(fps); g_Index++; sigma += 4; if (sigma > 22) { printf("Summary: %d errors!\n", g_TotalErrors); printf("Test %s!\n", (g_TotalErrors==0) ? "PASSED" : "FAILED"); exit(0); } } }
void computeFPS() { frameCount++; if (fpsCount++ == fpsLimit-1) { g_Verify = true; } if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f); sprintf(fps, "%sCUDA Box Filter (radius=%d): %3.1f fps", ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "[AutoTest]: " : ""), filter_radius, ifps); glutSetWindowTitle(fps); fpsCount = 0; if (g_CheckRender && !g_CheckRender->IsQAReadback()) fpsLimit = (int)MAX(ifps, 1.0f); cutilCheckError(cutResetTimer(timer)); AutoQATest(); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit-1) { g_Verify = true; } if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "%s Cuda Edge Detection (%s): %3.1f fps", ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "AutoTest:" : ""), filterMode[g_SobelDisplayMode], ifps); glutSetWindowTitle(fps); fpsCount = 0; if (g_CheckRender && !g_CheckRender->IsQAReadback()) fpsLimit = (int)MAX(ifps, 1.f); cutilCheckError(cutResetTimer(timer)); AutoQATest(); } }
// main rendering loop void display() { cutilCheckError(cutStartTimer(timer)); if( !gestures.m_bPause ) { //Read next available data gestures.m_Context.WaitAndUpdateAll(); } //Process the data gestures.m_DepthGenerator.GetMetaData( depthMD ); gestures.m_UserGenerator.GetUserPixels( 0, sceneMD ); // move camera if (cameraPos[1] > 0.0f) cameraPos[1] = 0.0f; cameraPosLag += (cameraPos - cameraPosLag) * inertia; cameraRotLag += (cameraRot - cameraRotLag) * inertia; cursorPosLag += (cursorPos - cursorPosLag) * inertia; // view transform glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glRotatef(cameraRotLag[0], 1.0, 0.0, 0.0); glRotatef(cameraRotLag[1], 0.0, 1.0, 0.0); glTranslatef(cameraPosLag[0], cameraPosLag[1], cameraPosLag[2]); glGetFloatv(GL_MODELVIEW_MATRIX, modelView); // update the simulation if (!paused) { if (emitterOn) { runEmitter(); } SimParams &p = psystem->getParams(); p.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z); psystem->step(timestep); currentTime += timestep; } renderer->calcVectors(); vec3f sortVector = renderer->getSortVector(); psystem->setSortVector(make_float3(sortVector.x, sortVector.y, sortVector.z)); psystem->setModelView(modelView); psystem->setSorting(sort); psystem->depthSort(); // render glClearColor(0.0, 0.0, 0.0, 1.0); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); renderScene(); // draw particles if (displayEnabled) { // render scene to offscreen buffers to get correct occlusion renderer->beginSceneRender(SmokeRenderer::LIGHT_BUFFER); renderScene(); renderer->endSceneRender(SmokeRenderer::LIGHT_BUFFER); renderer->beginSceneRender(SmokeRenderer::SCENE_BUFFER); renderScene(); renderer->endSceneRender(SmokeRenderer::SCENE_BUFFER); renderer->setPositionBuffer(psystem->getPosBuffer()); renderer->setVelocityBuffer(psystem->getVelBuffer()); renderer->setIndexBuffer(psystem->getSortedIndexBuffer()); renderer->setNumParticles(psystem->getNumParticles()); renderer->setParticleRadius(spriteSize); renderer->setDisplayLightBuffer(displayLightBuffer); renderer->setAlpha(alpha); renderer->setShadowAlpha(shadowAlpha); renderer->setLightPosition(lightPos); renderer->setColorAttenuation(colorAttenuation); renderer->setLightColor(lightColor); renderer->setNumSlices(numSlices); renderer->setNumDisplayedSlices(numDisplayedSlices); renderer->setBlurRadius(blurRadius); renderer->render(); if (drawVectors) { renderer->debugVectors(); } } // display sliders if (displaySliders) { glDisable(GL_DEPTH_TEST); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); params->Render(0, 0); glDisable(GL_BLEND); glEnable(GL_DEPTH_TEST); } glutSwapBuffers(); glutReportErrors(); cutilCheckError(cutStopTimer(timer)); // readback for verification//sw/devrel/SDK10/Compute/projects/recursiveGaussian/recursiveGaussian.cpp if (g_CheckRender && g_CheckRender->IsQAReadback() && (++frameNumber >= frameCheckNumber)) { printf("> (Frame %d) Readback BackBuffer\n", frameNumber); g_CheckRender->readback( winWidth, winHeight ); g_CheckRender->savePPM(sOriginal, true, NULL); bool passed = g_CheckRender->PPMvsPPM(sOriginal, sReference, MAX_EPSILON_ERROR, THRESHOLD); printf("Summary: %d errors!\n", passed ? 0 : 1); printf("%s\n", passed ? "PASSED" : "FAILED"); cleanup(); exit(0); } fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "CUDA Smoke Particles (%d particles): %3.1f fps", numParticles, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? (int)ifps : 1; if (paused) fpsLimit = 0; cutilCheckError(cutResetTimer(timer)); } }
void display() { cutilCheckError(cutStartTimer(timer)); // update the simulation if (!bPause) { psystem->setIterations(iterations); psystem->setDamping(damping); psystem->setGravity(-gravity); psystem->setCollideSpring(collideSpring); psystem->setCollideDamping(collideDamping); psystem->setCollideShear(collideShear); psystem->setCollideAttraction(collideAttraction); psystem->update(timestep); renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); } else { usleep(32666); } // render glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // view transform glMatrixMode(GL_MODELVIEW); glLoadIdentity(); for (int c = 0; c < 3; ++c) { camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; } glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); glGetFloatv(GL_MODELVIEW_MATRIX, modelView); // cube glColor3f(1.0, 1.0, 1.0); glutWireCube(2.0); // collider glPushMatrix(); float4 p = psystem->getColliderPos(); glTranslatef(p.x, p.y, p.z); glColor3f(1.0, 0.0, 0.0); glutSolidSphere(psystem->getColliderRadius(), 20, 10); glPopMatrix(); if (displayEnabled) { renderer->display(displayMode); } if (displaySliders) { glDisable(GL_DEPTH_TEST); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); params->Render(0, 0); glDisable(GL_BLEND); glEnable(GL_DEPTH_TEST); } cutilCheckError(cutStopTimer(timer)); glutSwapBuffers(); fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "CUDA particles (%d particles): %3.1f fps", numParticles, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? (int)ifps : 1; if (bPause) fpsLimit = 0; cutilCheckError(cutResetTimer(timer)); } glutReportErrors(); }
bool runTestMax( int argc, char** argv, ReduceType datatype) { int size = 1<<24; // number of elements to reduce int maxThreads = 256; // number of threads per block int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; cutGetCmdLineArgumenti( argc, (const char**) argv, "n", &size); cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads); cutGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel); cutGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks); shrLog("METHOD: MAX\n"); shrLog("%d elements\n", size); shrLog("%d threads (max)\n", maxThreads); cpuFinalReduction = (cutCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == CUTTrue); cutGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold); bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue); if (runShmoo) { shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype); } else { // create random input data on CPU unsigned int bytes = size * sizeof(T); T *h_idata = (T *) malloc(bytes); for(int i=0; i<size; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) cpuFinalThreshold = 1; // allocate mem for the result on host side T* h_odata = (T*) malloc(numBlocks*sizeof(T)); shrLog("%d blocks\n\n", numBlocks); // allocate device memory and data T* d_idata = NULL; T* d_odata = NULL; cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) ); cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) ); // copy data directly to device memory cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) ); cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) ); // warm-up maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata); int testIterations = 100; unsigned int timer = 0; cutilCheckError( cutCreateTimer( &timer)); T gpu_result = 0; gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, testIterations, cpuFinalReduction, cpuFinalThreshold, timer, h_odata, d_idata, d_odata); double reduceTime = cutGetAverageTimerValue(timer) * 1e-3; shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); // compute reference solution T cpu_result = maxreduceCPU<T>(h_idata, size); double threshold = 1e-12; double diff = 0; if (datatype == REDUCE_INT) { shrLog("\nGPU result = %d\n", gpu_result); shrLog("CPU result = %d\n\n", cpu_result); } else { shrLog("\nGPU result = %f\n", gpu_result); shrLog("CPU result = %f\n\n", cpu_result); if (datatype == REDUCE_FLOAT) threshold = 1e-8 * size; diff = fabs((double)gpu_result - (double)cpu_result); } // cleanup cutilCheckError( cutDeleteTimer(timer) ); free(h_idata); free(h_odata); cutilSafeCallNoSync(cudaFree(d_idata)); cutilSafeCallNoSync(cudaFree(d_odata)); if (datatype == REDUCE_INT) { return (gpu_result == cpu_result); } else { return (diff < threshold); } } return true; }
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype) { fprintf(stderr, "Shmoo wasn't implemented in this modified kernel!\n"); exit(1); // create random input data on CPU unsigned int bytes = maxN * sizeof(T); T *h_idata = (T*) malloc(bytes); for(int i = 0; i < maxN; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE); // allocate mem for the result on host side T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T)); // allocate device memory and data T* d_idata = NULL; T* d_odata = NULL; cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) ); cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, maxNumBlocks*sizeof(T)) ); // copy data directly to device memory cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) ); cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice) ); // warm-up for (int kernel = 0; kernel < 7; kernel++) { sumreduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata); } int testIterations = 100; unsigned int timer = 0; cutilCheckError( cutCreateTimer( &timer)); // print headers shrLog("Time in milliseconds for various numbers of elements for each kernel\n\n\n"); shrLog("Kernel"); for (int i = minN; i <= maxN; i *= 2) { shrLog(", %d", i); } for (int kernel = 0; kernel < 7; kernel++) { shrLog("\n%d", kernel); for (int i = minN; i <= maxN; i *= 2) { cutResetTimer(timer); int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads); float reduceTime; if( numBlocks <= MAX_BLOCK_DIM_SIZE ) { benchmarkReduceSum(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, testIterations, false, 1, timer, h_odata, d_idata, d_odata); reduceTime = cutGetAverageTimerValue(timer); } else { reduceTime = -1.0; } shrLog(", %.5f", reduceTime); } } // cleanup cutilCheckError(cutDeleteTimer(timer)); free(h_idata); free(h_odata); cutilSafeCallNoSync(cudaFree(d_idata)); cutilSafeCallNoSync(cudaFree(d_odata)); }