예제 #1
0
void computeFPS()
{
    fpsCount++;

    if (fpsCount == fpsLimit) {
        char fps[256];
        float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f);
        sprintf(fps, "CUDA Bilateral Filter: %3.f fps (euclidean_delta=%.2f, gaussian_delta=%.2f, iterations=%.2f)", 
                ifps, (double)euclidean_delta, (double)gaussian_delta, (double)iterations);

        glutSetWindowTitle(fps);
        fpsCount = 0; 
        fpsLimit = (int)MAX(ifps, 1.0f);

        cutilCheckError(cutResetTimer(timer));  
    }
}
예제 #2
0
// Simple method to display the Frames Per Second in the window title
void computeFPS()
{
  static int fpsCount=0;
  static int fpsLimit=100;
 
  fpsCount++;
   
  if (fpsCount == fpsLimit) {
    char fps[256];
    float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
    sprintf(fps, "Cuda GL Interop Wrapper: %3.1f fps ", ifps); 
     
    glutSetWindowTitle(fps);
    fpsCount = 0;
     
    cutilCheckError(cutResetTimer(timer)); 
  }
}
void computeFPS()
{
    frameCount++;
    fpsCount++;
    if (fpsCount == fpsLimit-1) {
        g_Verify = true;
    }
    if (fpsCount == fpsLimit) {
        char fps[256];
        float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
        sprintf(fps, "%s %s (sigma=%4.2f): %3.1f fps", sSDKsample,
                ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "AutoTest: " : ""), sigma, ifps);  

        glutSetWindowTitle(fps);
        fpsCount = 0; 
        cutilCheckError(cutResetTimer(timer));  

        AutoQATest();
    }
}
void AutoQATest()
{
    if (g_CheckRender && g_CheckRender->IsQAReadback()) {
        char fps[256];
        float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
        sprintf(fps, "%s %s (sigma=%4.2f): %3.1f fps", sSDKsample,
                ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "AutoTest: " : ""), sigma, ifps);  

	    glutSetWindowTitle(fps);

        g_Index++;
        sigma += 4;

		if (sigma > 22) {
			printf("Summary: %d errors!\n", g_TotalErrors);
			printf("Test %s!\n", (g_TotalErrors==0) ? "PASSED" : "FAILED");
			exit(0);
		}
    }
}
예제 #5
0
void computeFPS()
{
    frameCount++;
    if (fpsCount++ == fpsLimit-1) {
        g_Verify = true;
    }
    if (fpsCount == fpsLimit) {
        char fps[256];
        float ifps = 1.0f / (cutGetAverageTimerValue(timer) / 1000.0f);
        sprintf(fps, "%sCUDA Box Filter (radius=%d): %3.1f fps",
                ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "[AutoTest]: " : ""), filter_radius, ifps);

        glutSetWindowTitle(fps);
        fpsCount = 0;
        if (g_CheckRender && !g_CheckRender->IsQAReadback()) fpsLimit = (int)MAX(ifps, 1.0f);

        cutilCheckError(cutResetTimer(timer));

        AutoQATest();
    }
}
void computeFPS()
{
    frameCount++;
    fpsCount++;
    if (fpsCount == fpsLimit-1) {
        g_Verify = true;
    }
    if (fpsCount == fpsLimit) {
        char fps[256];
        float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
        sprintf(fps, "%s Cuda Edge Detection (%s): %3.1f fps", 
                ((g_CheckRender && g_CheckRender->IsQAReadback()) ? "AutoTest:" : ""),
				filterMode[g_SobelDisplayMode], ifps);  

        glutSetWindowTitle(fps);
        fpsCount = 0; 
        if (g_CheckRender && !g_CheckRender->IsQAReadback()) fpsLimit = (int)MAX(ifps, 1.f);

        cutilCheckError(cutResetTimer(timer));  
        AutoQATest();
    }
}
// main rendering loop
void display()
{
    cutilCheckError(cutStartTimer(timer));

	if( !gestures.m_bPause )
	{
		//Read next available data
		gestures.m_Context.WaitAndUpdateAll();
	}

	//Process the data
	gestures.m_DepthGenerator.GetMetaData( depthMD );
	gestures.m_UserGenerator.GetUserPixels( 0, sceneMD );

    // move camera
    if (cameraPos[1] > 0.0f) cameraPos[1] = 0.0f;
    cameraPosLag += (cameraPos - cameraPosLag) * inertia;
    cameraRotLag += (cameraRot - cameraRotLag) * inertia;
	cursorPosLag += (cursorPos - cursorPosLag) * inertia;

    // view transform
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    glRotatef(cameraRotLag[0], 1.0, 0.0, 0.0);
    glRotatef(cameraRotLag[1], 0.0, 1.0, 0.0);
    glTranslatef(cameraPosLag[0], cameraPosLag[1], cameraPosLag[2]);

    glGetFloatv(GL_MODELVIEW_MATRIX, modelView);

    // update the simulation
    if (!paused)
    {
        if (emitterOn) {
            runEmitter();
        }
        SimParams &p = psystem->getParams();
        p.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z);

        psystem->step(timestep); 
        currentTime += timestep;
    }

    renderer->calcVectors();
    vec3f sortVector = renderer->getSortVector();

    psystem->setSortVector(make_float3(sortVector.x, sortVector.y, sortVector.z));
    psystem->setModelView(modelView);
    psystem->setSorting(sort);
    psystem->depthSort();

    // render
    glClearColor(0.0, 0.0, 0.0, 1.0);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);  
    renderScene();

    // draw particles
    if (displayEnabled)
    {
        // render scene to offscreen buffers to get correct occlusion
        renderer->beginSceneRender(SmokeRenderer::LIGHT_BUFFER);
        renderScene();
        renderer->endSceneRender(SmokeRenderer::LIGHT_BUFFER);

        renderer->beginSceneRender(SmokeRenderer::SCENE_BUFFER);
        renderScene();
        renderer->endSceneRender(SmokeRenderer::SCENE_BUFFER);

        renderer->setPositionBuffer(psystem->getPosBuffer());
        renderer->setVelocityBuffer(psystem->getVelBuffer());
        renderer->setIndexBuffer(psystem->getSortedIndexBuffer());

        renderer->setNumParticles(psystem->getNumParticles());
        renderer->setParticleRadius(spriteSize);
        renderer->setDisplayLightBuffer(displayLightBuffer);
        renderer->setAlpha(alpha);
        renderer->setShadowAlpha(shadowAlpha);
		renderer->setLightPosition(lightPos);
		renderer->setColorAttenuation(colorAttenuation);
		renderer->setLightColor(lightColor);
		renderer->setNumSlices(numSlices);
		renderer->setNumDisplayedSlices(numDisplayedSlices);
		renderer->setBlurRadius(blurRadius);

        renderer->render();

		if (drawVectors) {
			renderer->debugVectors();
		}
    }

    // display sliders
    if (displaySliders) {
        glDisable(GL_DEPTH_TEST);
        glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color
        glEnable(GL_BLEND);
        params->Render(0, 0);
        glDisable(GL_BLEND);
        glEnable(GL_DEPTH_TEST);
    }

    glutSwapBuffers();
    glutReportErrors();
    cutilCheckError(cutStopTimer(timer));  

    // readback for verification//sw/devrel/SDK10/Compute/projects/recursiveGaussian/recursiveGaussian.cpp
    if (g_CheckRender && g_CheckRender->IsQAReadback() && (++frameNumber >= frameCheckNumber)) {
        printf("> (Frame %d) Readback BackBuffer\n", frameNumber);
        g_CheckRender->readback( winWidth, winHeight );
        g_CheckRender->savePPM(sOriginal, true, NULL);
        bool passed = g_CheckRender->PPMvsPPM(sOriginal, sReference, MAX_EPSILON_ERROR, THRESHOLD);
        printf("Summary: %d errors!\n", passed ? 0 : 1);
		printf("%s\n", passed ? "PASSED" : "FAILED");
        cleanup();
		exit(0);
    }

    fpsCount++;
    // this displays the frame rate updated every second (independent of frame rate)
    if (fpsCount >= fpsLimit) {
        char fps[256];
        float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
        sprintf(fps, "CUDA Smoke Particles (%d particles): %3.1f fps", numParticles, ifps);  
        glutSetWindowTitle(fps);
        fpsCount = 0; 
        fpsLimit = (ifps > 1.f) ? (int)ifps : 1;
        if (paused) fpsLimit = 0;
        cutilCheckError(cutResetTimer(timer));  
    }


}
예제 #8
0
void display()
{
    cutilCheckError(cutStartTimer(timer));  

    // update the simulation
    if (!bPause)
    {
        psystem->setIterations(iterations);
        psystem->setDamping(damping);
        psystem->setGravity(-gravity);
        psystem->setCollideSpring(collideSpring);
        psystem->setCollideDamping(collideDamping);
        psystem->setCollideShear(collideShear);
        psystem->setCollideAttraction(collideAttraction);

        psystem->update(timestep); 
        renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles());
    }
    else
    {
    	usleep(32666);
    }

    // render
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);  

    // view transform
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    for (int c = 0; c < 3; ++c)
    {
        camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia;
        camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia;
    }
    glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]);
    glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0);
    glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0);

    glGetFloatv(GL_MODELVIEW_MATRIX, modelView);

    // cube
    glColor3f(1.0, 1.0, 1.0);
    glutWireCube(2.0);

    // collider
    glPushMatrix();
    float4 p = psystem->getColliderPos();
    glTranslatef(p.x, p.y, p.z);
    glColor3f(1.0, 0.0, 0.0);
    glutSolidSphere(psystem->getColliderRadius(), 20, 10);
    glPopMatrix();

    if (displayEnabled)
    {
        renderer->display(displayMode);
    }

    if (displaySliders) {
        glDisable(GL_DEPTH_TEST);
        glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color
        glEnable(GL_BLEND);
        params->Render(0, 0);
        glDisable(GL_BLEND);
        glEnable(GL_DEPTH_TEST);
    }

    cutilCheckError(cutStopTimer(timer));  

    glutSwapBuffers();

    fpsCount++;
    // this displays the frame rate updated every second (independent of frame rate)
    if (fpsCount >= fpsLimit) {
        char fps[256];
        float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
        sprintf(fps, "CUDA particles (%d particles): %3.1f fps", numParticles, ifps);  
        glutSetWindowTitle(fps);
        fpsCount = 0; 
        fpsLimit = (ifps > 1.f) ? (int)ifps : 1;
        if (bPause) fpsLimit = 0;
        cutilCheckError(cutResetTimer(timer));  
    }

    glutReportErrors();
}
예제 #9
0
bool
runTestMax( int argc, char** argv, ReduceType datatype) 
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    cutGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
    
		shrLog("METHOD: MAX\n");
    shrLog("%d elements\n", size);
    shrLog("%d threads (max)\n", maxThreads);

    cpuFinalReduction = (cutCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == CUTTrue);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue);

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {

        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);

        T *h_idata = (T *) malloc(bytes);

        for(int i=0; i<size; i++) 
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
        if (numBlocks == 1) cpuFinalThreshold = 1;

        // allocate mem for the result on host side
        T* h_odata = (T*) malloc(numBlocks*sizeof(T));

        shrLog("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T* d_idata = NULL;
        T* d_odata = NULL;

        cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
        cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );

        // copy data directly to device memory
        cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
        cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );

        // warm-up
        maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);
        
        int testIterations = 100;

        unsigned int timer = 0;
        cutilCheckError( cutCreateTimer( &timer));
        
        T gpu_result = 0;

        gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction, 
                                        cpuFinalThreshold, timer,
                                        h_odata, d_idata, d_odata);

		double reduceTime = cutGetAverageTimerValue(timer) * 1e-3;
        shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = maxreduceCPU<T>(h_idata, size);

        double threshold = 1e-12;
        double diff = 0;
		
        if (datatype == REDUCE_INT)
        {
            shrLog("\nGPU result = %d\n", gpu_result);
            shrLog("CPU result = %d\n\n", cpu_result);
        }
        else
        {
            shrLog("\nGPU result = %f\n", gpu_result);
            shrLog("CPU result = %f\n\n", cpu_result);

            if (datatype == REDUCE_FLOAT)
                threshold = 1e-8 * size;
            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        cutilCheckError( cutDeleteTimer(timer) );
        free(h_idata);
        free(h_odata);

        cutilSafeCallNoSync(cudaFree(d_idata));
        cutilSafeCallNoSync(cudaFree(d_odata));

		if (datatype == REDUCE_INT) {
			return (gpu_result == cpu_result);
		} else {
			return (diff < threshold);
		}
	}
	return true;
}
예제 #10
0
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{ 
		fprintf(stderr, "Shmoo wasn't implemented in this modified kernel!\n");
		exit(1);
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);

    T *h_idata = (T*) malloc(bytes);

    for(int i = 0; i < maxN; i++) {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));

    // allocate device memory and data
    T* d_idata = NULL;
    T* d_odata = NULL;

    cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
    cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, maxNumBlocks*sizeof(T)) );

    // copy data directly to device memory
    cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
    cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice) );

    // warm-up
    for (int kernel = 0; kernel < 7; kernel++)
    {
        sumreduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata);
    }
    int testIterations = 100;

    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    
    // print headers
    shrLog("Time in milliseconds for various numbers of elements for each kernel\n\n\n");
    shrLog("Kernel");
    for (int i = minN; i <= maxN; i *= 2)
    {
        shrLog(", %d", i);
    }
   
    for (int kernel = 0; kernel < 7; kernel++)
    {
        shrLog("\n%d", kernel);
        for (int i = minN; i <= maxN; i *= 2)
        {
            cutResetTimer(timer);
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);
            
            float reduceTime;
            if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
                benchmarkReduceSum(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, 
                                testIterations, false, 1, timer, h_odata, d_idata, d_odata);
                reduceTime =  cutGetAverageTimerValue(timer);
            } else {                
                reduceTime = -1.0;
            }
            shrLog(", %.5f", reduceTime);
        }
    }

    // cleanup
    cutilCheckError(cutDeleteTimer(timer));
    free(h_idata);
    free(h_odata);

    cutilSafeCallNoSync(cudaFree(d_idata));
    cutilSafeCallNoSync(cudaFree(d_odata));    
}