void
benchmark(int iterations) 
{
    // allocate memory for result
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    cutilSafeCall( cudaMalloc( (void**) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStartTimer( timer));

    // execute the kernel
    for(int i=0; i<iterations; i++) {
        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    }

    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStopTimer( timer));

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    printf("%.2f Mpixels/sec\n", (width*height*iterations / (cutGetTimerValue( timer) / 1000.0f)) / 1e6);

    cutilSafeCall(cudaFree(d_result));
}
Esempio n. 2
0
// keplereq_wrapper_C:
//         C wrapper function to solve's Kepler's equation num times.  
// inputs: 
//         ph_ma:  pointer to beginning element of array of doubles containing mean anomaly in radians 
//         ph_ecc: pointer to beginning element of array of doubles containing eccentricity 
//         num:    integer size of input arrays 
//         ph_eccanom: pointer to beginning element of array of doubles eccentric anomaly in radians 
// outputs:
//         ph_eccanom: values overwritten with eccentric anomaly
// assumptions:
//         input mean anomalies between 0 and 2pi
//         input eccentricities between 0 and 1
//         all three arrays have at least num elements 
//
void keplereq_wrapper_c(double *ph_ma, double *ph_ecc, int num, double *ph_eccanom)
{
	int gpuid = init_cuda();
	// put vectors in thrust format from raw points
	thrust::host_vector<double> h_ecc(ph_ecc,ph_ecc+num);
	thrust::host_vector<double> h_ma(ph_ma,ph_ma+num);

	cutCreateTimer(&memoryTime);  	cutCreateTimer(&kernelTime);
	cutResetTimer(memoryTime);    	cutResetTimer(kernelTime);

	if(gpuid>=0)
	{
	cutStartTimer(memoryTime);
	// transfer input params to GPU
	thrust::device_vector<double> d_ecc = h_ecc;
	thrust::device_vector<double> d_ma = h_ma;
	// allocate mem on GPU
	thrust::device_vector<double> d_eccanom(num);
	cudaThreadSynchronize();
	cutStopTimer(memoryTime);
	
	// distribute the computation to the GPU
	cutStartTimer(kernelTime);
	thrust::for_each(
	   thrust::make_zip_iterator(thrust::make_tuple(d_ma.begin(),d_ecc.begin(),d_eccanom.begin())),
	   thrust::make_zip_iterator(thrust::make_tuple(d_ma.end(),  d_ecc.end(),  d_eccanom.end())), 
	   keplereq_functor() );
	cudaThreadSynchronize();
	cutStopTimer(kernelTime);

	// transfer results back to host
	cutStartTimer(memoryTime);
	thrust::copy(d_eccanom.begin(),d_eccanom.end(),ph_eccanom);
	cudaThreadSynchronize();
	cutStopTimer(memoryTime);
	}
	else
	{
	// distribute the computation to the CPU
	cutStartTimer(kernelTime);
	thrust::for_each(
	   thrust::make_zip_iterator(thrust::make_tuple(h_ma.begin(),h_ecc.begin(),ph_eccanom)),
	   thrust::make_zip_iterator(thrust::make_tuple(h_ma.end(),  h_ecc.end(),  ph_eccanom+num)), 
	   keplereq_functor() );
	cutStopTimer(kernelTime);	
	}
}
// This is the normal display path
void display(void) 
{  
    cutilCheckError(cutStartTimer(timer));  

    // Sobel operation
    Pixel *data = NULL;

    // map PBO to get CUDA device pointer
	cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes; 
    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&data, &num_bytes,  
						       cuda_pbo_resource));
    //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);
	
	sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp );
    cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    glClear(GL_COLOR_BUFFER_BIT);

    glBindTexture(GL_TEXTURE_2D, texid);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, 
                   GL_LUMINANCE, GL_UNSIGNED_BYTE, OFFSET(0));
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

    glDisable(GL_DEPTH_TEST);
    glEnable(GL_TEXTURE_2D);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
        
    glBegin(GL_QUADS);
    glVertex2f(0, 0); glTexCoord2f(0, 0);
    glVertex2f(0, 1); glTexCoord2f(1, 0);
    glVertex2f(1, 1); glTexCoord2f(1, 1);
    glVertex2f(1, 0); glTexCoord2f(0, 1);
    glEnd();
    glBindTexture(GL_TEXTURE_2D, 0);

    if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) {
        printf("> (Frame %d) readback BackBuffer\n", frameCount);
        g_CheckRender->readback( imWidth, imHeight );
        g_CheckRender->savePPM ( sOriginal_ppm[g_Index], true, NULL );
        if (!g_CheckRender->PPMvsPPM(sOriginal_ppm[g_Index], sReference_ppm[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Verify = false;
    }
    glutSwapBuffers();

    cutilCheckError(cutStopTimer(timer));  

    computeFPS();

    glutPostRedisplay();
}
Esempio n. 4
0
void fpsDisplay()
{
  cutilCheckError(cutStartTimer(timer)); 
   
  display();
   
  cutilCheckError(cutStopTimer(timer));
  computeFPS();
}
Esempio n. 5
0
File: main.cpp Progetto: elf11/GPGPU
void cleanup()
{
	cutilCheckError(cutStopTimer(timer));
	cutilCheckError(cutDeleteTimer( timer));

	cudaFree(a_d);cudaFree(b_d);cudaFree(r_d);

	cudaThreadExit();
      
}
// display results using OpenGL
void display()
{
    cutilCheckError(cutStartTimer(timer));  

    // execute filter, writing results to pbo
    unsigned int *d_result;
    //DEPRECATED: cutilSafeCall( cudaGLMapBufferObject((void**)&d_result, pbo) );
    cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes; 
    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&d_result, &num_bytes,  
						       cuda_pbo_resource));
    
    runSelect(d_result);
    // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(pbo));
    cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    // Common display code path
    {
        glClear(GL_COLOR_BUFFER_BIT);
	
        // load texture from pbo
        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
        glBindTexture(GL_TEXTURE_2D, texid);
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

        // fragment program is required to display floating point texture
        glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader);
        glEnable(GL_FRAGMENT_PROGRAM_ARB);
        glDisable(GL_DEPTH_TEST);

        glBegin(GL_QUADS);
        {
            glTexCoord2f(0, 0);          
            glVertex2f(0, 0);
            glTexCoord2f(1, 0);          
            glVertex2f(1, 0);
            glTexCoord2f(1, 1);          
            glVertex2f(1, 1);
            glTexCoord2f(0, 1);          
            glVertex2f(0, 1);
        }
        glEnd();
        glBindTexture(GL_TEXTURE_TYPE, 0);
        glDisable(GL_FRAGMENT_PROGRAM_ARB);
    }

	glutSwapBuffers();
    glutReportErrors();

    cutilCheckError(cutStopTimer(timer));  

    computeFPS();
}
Esempio n. 7
0
void runBenchmark(int iterations)
{
    cutilCheckError(cutStartTimer(timer));  
    for (int i = 0; i < iterations; ++i)
    {
        psystem->update(timestep);
    }
    cutilCheckError(cutStopTimer(timer));  
    float milliseconds = cutGetTimerValue(timer);

    printf("%d particles, total time for %d iterations: %0.3f ms\n", numParticles, iterations, milliseconds);
    printf("Test PASSED\n");
}
// display results using OpenGL
void display()
{
    cutilCheckError(cutStartTimer(timer));  

    // execute filter, writing results to pbo
    unsigned int *d_result;
    cutilSafeCall(cudaGLMapBufferObject((void**)&d_result, pbo));
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    cutilSafeCall(cudaGLUnmapBufferObject(pbo));

    // load texture from pbo
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glBindTexture(GL_TEXTURE_2D, texid);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    // display results
    glClear(GL_COLOR_BUFFER_BIT);

    glEnable(GL_TEXTURE_2D);
    glDisable(GL_DEPTH_TEST);

    glBegin(GL_QUADS);
    glTexCoord2f(0, 1); glVertex2f(0, 0);
    glTexCoord2f(1, 1); glVertex2f(1, 0);
    glTexCoord2f(1, 0); glVertex2f(1, 1);
    glTexCoord2f(0, 0); glVertex2f(0, 1);
    glEnd();

    glDisable(GL_TEXTURE_2D);

    if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) {
        // readback for QA testing
        printf("> (Frame %d) Readback BackBuffer\n", frameCount);
        g_CheckRender->readback( width, height );
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.50f )) {
            g_TotalErrors++;
        }
        g_Verify = false;
    }

    glutSwapBuffers();

    cutilCheckError(cutStopTimer(timer));  

    computeFPS();
}
Esempio n. 9
0
////////////////////////////////////////////////////////////////////////////////
//! Display callback
////////////////////////////////////////////////////////////////////////////////
void display()
{
    cutilCheckError(cutStartTimer(timer));  

    // run CUDA kernel to generate vertex positions
    runCuda(vbo);

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

    // set view matrix
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    glTranslatef(0.0, 0.0, translate_z);
    glRotatef(rotate_x, 1.0, 0.0, 0.0);
    glRotatef(rotate_y, 0.0, 1.0, 0.0);

    // render from the vbo
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glVertexPointer(4, GL_FLOAT, 0, 0);

    glEnableClientState(GL_VERTEX_ARRAY);
    glColor3f(1.0, 0.0, 0.0);
    glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
    glDisableClientState(GL_VERTEX_ARRAY);

    if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) {
        // readback for QA testing
        printf("> (Frame %d) Readback BackBuffer\n", frameCount);
        g_CheckRender->readback( window_width, window_height );
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        else
        {
        	printf( "TEST PASSED\n" );
        }
        g_Verify = false;
    }

    glutSwapBuffers();
    glutPostRedisplay();

    anim += 0.01;

    cutilCheckError(cutStopTimer(timer));  
    computeFPS();
}
Esempio n. 10
0
File: main.cpp Progetto: elf11/GPGPU
void cleanup()
{
	free(a_h);free(b_h);free(r_h);
	free(control);

	cutilCheckError(cutStopTimer(timer));
	cutilCheckError(cutDeleteTimer( timer));

	cudaFree(a_d);cudaFree(b_d);cudaFree(r_d);

	
	cutilSafeCall(release());
	checkCUDAError("release");
	cudaThreadExit();
      
}
Esempio n. 11
0
    void _runBenchmark(int iterations)
    {
        // once without timing to prime the device
        if (!useCpu)
            m_nbody->update(activeParams.m_timestep);

        if (useCpu)
        {
            cutCreateTimer(&timer);
            cutStartTimer(timer);
        }
        else
        {
            cutilSafeCall(cudaEventRecord(startEvent, 0));
        }

        for (int i = 0; i < iterations; ++i)
        {
            m_nbody->update(activeParams.m_timestep);
        }

        float milliseconds = 0;

        if (useCpu)
        {
            cutStopTimer(timer);
            milliseconds = cutGetTimerValue(timer);
            cutDeleteTimer(timer);
        }
        else
        {
            cutilSafeCall(cudaEventRecord(stopEvent, 0));  
            cutilSafeCall(cudaEventSynchronize(stopEvent));
            cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));
        }
        
        double interactionsPerSecond = 0;
        double gflops = 0;
        computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations);

        shrLog("%d bodies, total time for %d iterations: %.3f ms\n", 
            numBodies, iterations, milliseconds);
        shrLog("= %.3f billion interactions per second\n", interactionsPerSecond);
        shrLog("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, 
               (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction);   
    }
void
runAutoTest(int argc, char **argv) 
{
    // allocate memory for result
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    cutilSafeCall( cudaMalloc( (void**) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStartTimer( timer));
    
    while (sigma <= 22) {    
        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
        cutilSafeCall( cudaThreadSynchronize() );
        // check if kernel execution generated an error
        cutilCheckMsg("Kernel execution failed");

        cudaMemcpy(g_CheckRender->imageData(), d_result, width*height*4, cudaMemcpyDeviceToHost);
        g_CheckRender->savePPM(sOriginal[g_Index], false, NULL);
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.50f)) {
            g_TotalErrors++;
        }
        g_Index++;
        sigma += 4;
    }

    cutilCheckError( cutStopTimer( timer));

    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    printf("%.2f Mpixels/sec\n", (width*height*g_Index / (cutGetTimerValue( timer) / 1000.0f)) / 1e6);

    printf("Summary: %d errors!\n", g_TotalErrors);
	printf("Test %s!\n", (g_TotalErrors==0) ? "PASSED" : "FAILED");
    

    cutilSafeCall(cudaFree(d_result));
}
Esempio n. 13
0
void disp(void){
	
	
	glClear(GL_COLOR_BUFFER_BIT);

	
	update_phi();
	

	its++;

	if(its<ITERATIONS){
		glutPostRedisplay();
		
		if(its%50==0){
			
			printf("Iteration %3d Total Time: %3.2f ReInit Time: %3.2f\n", its, 0.001*cutGetTimerValue(Timer), 0.001*cutGetTimerValue(ReInitTimer));
			
			cutStartTimer(ReInitTimer); // ReInit Timer Start
			
			reinit_phi(); // ReInit

			glDrawPixels(imageW, imageH, GL_GREEN, GL_FLOAT, phi);
			glutSwapBuffers();
			cutStopTimer(ReInitTimer); // ReInit Timer Stop
		}

	} else {
		
		printf("Iteration %3d Total Time: %3.2f ReInit Time: %3.2f\n", its, 0.001*cutGetTimerValue(Timer), 0.001*cutGetTimerValue(ReInitTimer));

		glDrawPixels(imageW, imageH, GL_GREEN, GL_FLOAT, phi);
		glutSwapBuffers();



	}
	
}
void shutDown(unsigned char k, int /*x*/, int /*y*/)
{
    switch (k){
        case '\033':
        case 'q':
        case 'Q':
            printf("Shutting down...\n");
                cutilCheckError( cutStopTimer(hTimer)   );
                cutilCheckError( cutDeleteTimer(hTimer) );
				// DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(gl_PBO) );
				cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, gl_PBO, 
									   cudaGraphicsMapFlagsWriteDiscard));
                glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
                glDeleteBuffers(1, &gl_PBO);
                glDeleteTextures(1, &gl_Tex);

                cutilSafeCall( CUDA_FreeArray() );
                free(h_Src);
            printf("Shutdown done.\n");

            cutilDeviceReset();
            exit(0);
        break;

        case '1':
            printf("Passthrough.\n");
            g_Kernel = 0;
        break;

        case '2':
            printf("KNN method \n");
            g_Kernel = 1;
        break;

        case '3':
            printf("NLM method\n");
            g_Kernel = 2;
        break;

        case '4':
            printf("Quick NLM(NLM2) method\n");
            g_Kernel = 3;
        break;

        case ' ':
            printf(g_Diag ? "LERP highlighting mode.\n" : "Normal mode.\n");
            g_Diag = !g_Diag;
        break;

        case 'n':
            printf("Decrease noise level.\n");
            knnNoise -= noiseStep;
            nlmNoise -= noiseStep;
        break;

        case 'N':
            printf("Increase noise level.\n");
            knnNoise += noiseStep;
            nlmNoise += noiseStep;
        break;

        case 'l':
            printf("Decrease LERP quotent.\n");
            lerpC = MAX(lerpC - lerpStep, 0.0f);
        break;

        case 'L':
            printf("Increase LERP quotent.\n");
            lerpC = MIN(lerpC + lerpStep, 1.0f);
        break;

        case 'f' : case 'F':
            g_FPS = true;
        break;

        case '?':
            printf("lerpC = %5.5f\n", lerpC);
            printf("knnNoise = %5.5f\n", knnNoise);
            printf("nlmNoise = %5.5f\n", nlmNoise);
        break;
    }
}
Esempio n. 15
0
int main(int argc, char **argv)
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint hTimer;
    int PassFailFlag = 1;
    uint byteCount = 64 * 1048576;
    uint uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

	shrQAStart(argc, argv);

	// set logfile name and start logs
    shrSetLogFileName ("histogram.txt");

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
           printf("No CUDA Capable Devices found, exiting...\n");
           shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );
        cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
    }
    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );

	printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", 
		deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

	int version = deviceProp.major * 0x10 + deviceProp.minor;

	if(version < 0x11) 
    {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
		shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional Command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult))
    {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

    shrLog("Initializing data...\n");
        shrLog("...allocating CPU memory.\n");
            h_Data         = (uchar *)malloc(byteCount);
            h_HistogramCPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
            h_HistogramGPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

        shrLog("...generating input data\n");
            srand(2009);
            for(uint i = 0; i < byteCount; i++) 
                h_Data[i] = rand() % 256;

        shrLog("...allocating GPU memory and copying input data\n\n");
            cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount  ) );
            cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)  ) );
            cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );

    {
        shrLog("Starting up 64-bin histogram...\n\n");
            initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram64(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); 

        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram64CPU()\n");
               histogram64CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results...\n");
                for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 64-bin histogram...\n\n\n");
            closeHistogram64();
    }

    {
        shrLog("Initializing 256-bin histogram...\n");
            initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram256(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); 
                
        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram256CPU()\n");
                histogram256CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results\n");
                for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 256-bin histogram...\n\n\n");
            closeHistogram256();
    }

    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        cutilSafeCall( cudaFree(d_Histogram) );
        cutilSafeCall( cudaFree(d_Data) );
        free(h_HistogramGPU);
        free(h_HistogramCPU);
        free(h_Data);

    cutilDeviceReset();
	shrLog("%s - Test Summary\n", sSDKsample);
    // pass or fail (for both 64 bit and 256 bit histograms)
    shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED));
}
Esempio n. 16
0
// Host code
int main(int argc, char** argv)
{
    ParseArguments(argc, argv);
	
	float s_SobelMatrix[25];  
	s_SobelMatrix[0] = 1;
	s_SobelMatrix[1] = 2;
	s_SobelMatrix[2]= 0;
	s_SobelMatrix[3] = -2;
	s_SobelMatrix[4] = -1;
	s_SobelMatrix[5] = 4;
	s_SobelMatrix[6] = 8;
	s_SobelMatrix[7] = 0;
	s_SobelMatrix[8] = -8;
	s_SobelMatrix[9] = -4;
	s_SobelMatrix[10] = 6;
	s_SobelMatrix[11] = 12;
	s_SobelMatrix[12] = 0;
	s_SobelMatrix[13] = -12;
	s_SobelMatrix[14] = -6;
	s_SobelMatrix[15] = 4;
	s_SobelMatrix[16] = 8; 
	s_SobelMatrix[17] = 0;
	s_SobelMatrix[18] = -8;
	s_SobelMatrix[19] =-4;
	s_SobelMatrix[20] =1;
	s_SobelMatrix[21] =2;
	s_SobelMatrix[22] =0;
	s_SobelMatrix[23] =-2;
	s_SobelMatrix[24] =-1;
	
    unsigned char *palete = NULL;
    unsigned char *data = NULL, *out = NULL;
    PPMImage *input_image=NULL, *output_image=NULL;
    output_image = (PPMImage *)malloc(sizeof(PPMImage));
    input_image = readPPM(PPMInFileL);
    printf("Running %s filter\n", Filter);
    out = (unsigned char *)malloc();

    printf("Computing the CPU output\n");
    printf("Image details: %d by %d = %d , imagesize = %d\n", input_image->x, input_image->y, input_image->x * input_image->y, input_image->x * input_image->y);
    
	cutilCheckError(cutStartTimer(time_CPU));
	if(FilterMode == SOBEL_FILTER){
	printf("Running Sobel\n");
	CPU_Sobel(intput_image->data, output_image, input_image->x, input_image->y);
	}
	else if(FilterMode == HIGH_BOOST_FILTER){
	printf("Running boost\n");
	CPU_Boost(data, out, dib.width, dib.height);
	}
	cutilCheckError(cutStopTimer(time_CPU));
	if(FilterMode == SOBEL_FILTER || FilterMode == SOBEL_FILTER5)
    BitMapWrite("CPU_sobel.bmp", &bmp, &dib, out, palete);
	
	else if(FilterMode == AVERAGE_FILTER)
	BitMapWrite("CPU_average.bmp", &bmp, &dib, out, palete);
	
	else if(FilterMode == HIGH_BOOST_FILTER)
	BitMapWrite("CPU_boost.bmp", &bmp, &dib, out, palete);
	
    printf("Done with CPU output\n");
	printf("CPU execution time %f \n", cutGetTimerValue(time_CPU));
	
	
    printf("Allocating %d bytes for image \n", dib.image_size);
	
    cutilSafeCall( cudaMalloc( (void **)&d_In, dib.image_size*sizeof(unsigned char)) );
    cutilSafeCall( cudaMalloc( (void **)&d_Out, dib.image_size*sizeof(unsigned char)) );
    
	// creating space for filter matrix
	cutilSafeCall( cudaMalloc( (void **)&sobel_matrix, 25*sizeof(float)) );
	
	cutilCheckError(cutStartTimer(time_mem));
	
	cudaMemcpy(d_In, data, dib.image_size*sizeof(unsigned char), cudaMemcpyHostToDevice);
	
	cudaMemcpy(sobel_matrix, s_SobelMatrix, 25*sizeof(float), cudaMemcpyHostToDevice);
	
	cutilCheckError(cutStopTimer(time_mem));
    
	FilterWrapper(data, dib.width, dib.height);

    // Copy image back to host
	
	cutilCheckError(cutStartTimer(time_mem));
    cudaMemcpy(out, d_Out, dib.image_size*sizeof(unsigned char), cudaMemcpyDeviceToHost);
	cutilCheckError(cutStopTimer(time_mem));
	
	printf("GPU execution time %f Memtime %f \n", cutGetTimerValue(time_GPU), cutGetTimerValue(time_mem));
    printf("Total GPU = %f \n", (cutGetTimerValue(time_GPU) + cutGetTimerValue(time_mem)));
	// Write output image   
    BitMapWrite(BMPOutFile, &bmp, &dib, out, palete);

    Cleanup();
}
Esempio n. 17
0
int main(int argc, char **argv)
{
	GpuProfiling::initProf();
    // Start logs
    shrSetLogFileName ("scan.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    uint *d_Input, *d_Output;
    uint *h_Input, *h_OutputCPU, *h_OutputGPU;
    uint hTimer;
    const uint N = 13 * 1048576 / 2;

    shrLog("Allocating and initializing host arrays...\n");
        cutCreateTimer(&hTimer);
        h_Input     = (uint *)malloc(N * sizeof(uint));
        h_OutputCPU = (uint *)malloc(N * sizeof(uint));
        h_OutputGPU = (uint *)malloc(N * sizeof(uint));
        srand(2009);
        for(uint i = 0; i < N; i++)
            h_Input[i] = rand();

    shrLog("Allocating and initializing CUDA arrays...\n");
        cutilSafeCall( cudaMalloc((void **)&d_Input, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_Output, N * sizeof(uint)) );
        cutilSafeCall( cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice) );

    shrLog("Initializing CUDA-C scan...\n\n");
        initScan();

    int globalFlag = 1;
    size_t szWorkgroup;
    const int iCycles = 100;
    shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);
        for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){
            shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
                cutilSafeCall( cudaThreadSynchronize() );
                cutResetTimer(hTimer);
                cutStartTimer(hTimer);
                for(int i = 0; i < iCycles; i++)
                {
                    szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength);
                }
                cutilSafeCall( cudaThreadSynchronize());
                cutStopTimer(hTimer);
                double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles;

            shrLog("Validating the results...\n");
                shrLog("...reading back GPU results\n");
                    cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) );

                shrLog(" ...scanExclusiveHost()\n");
                    scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

                // Compare GPU results with CPU results and accumulate error for this test
                shrLog(" ...comparing the results\n");
                    int localFlag = 1;
                    for(uint i = 0; i < N; i++)
                    {
                        if(h_OutputCPU[i] != h_OutputGPU[i])
                        {
                            localFlag = 0;
                            break;
                        }
                    }

                // Log message on individual test result, then accumulate to global flag
                shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
                globalFlag = globalFlag && localFlag;

                // Data log
                if (arrayLength == MAX_SHORT_ARRAY_SIZE)
                {
                    shrLog("\n");
                    shrLogEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                           (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
                    shrLog("\n");
                }
        }

    shrLog("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles);
        for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){
            shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
                cutilSafeCall( cudaThreadSynchronize() );
                cutResetTimer(hTimer);
                cutStartTimer(hTimer);
                for(int i = 0; i < iCycles; i++)
                {
                    szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength);
                }
                cutilSafeCall( cudaThreadSynchronize() );
                cutStopTimer(hTimer);
                double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles;

            shrLog("Validating the results...\n");
                shrLog("...reading back GPU results\n");
                    cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) );

                shrLog("...scanExclusiveHost()\n");
                    scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

                // Compare GPU results with CPU results and accumulate error for this test
                shrLog(" ...comparing the results\n");
                    int localFlag = 1;
                    for(uint i = 0; i < N; i++)
                    {
                        if(h_OutputCPU[i] != h_OutputGPU[i])
                        {
                            localFlag = 0;
                            break;
                        }
                    }

                // Log message on individual test result, then accumulate to global flag
                shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
                globalFlag = globalFlag && localFlag;

                // Data log
                if (arrayLength == MAX_LARGE_ARRAY_SIZE)
                {
                    shrLog("\n");
                    shrLogEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                           (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
                    shrLog("\n");
                }
        }

    // pass or fail (cumulative... all tests in the loop)
    shrLog(globalFlag ? "PASSED\n\n" : "FAILED\n\n");
	GpuProfiling::printResults();

    shrLog("Shutting down...\n");
        closeScan();
        cutilSafeCall( cudaFree(d_Output));
        cutilSafeCall( cudaFree(d_Input));

        cutilCheckError( cutDeleteTimer(hTimer) );

        cudaThreadExit();
		exit(0);
        shrEXIT(argc, (const char**)argv);
}
Esempio n. 18
0
// display results using OpenGL
void display()
{
    cutilCheckError(cutStartTimer(timer));

    // execute filter, writing results to pbo
    unsigned int *d_result;
    //DEPRECATED: cutilSafeCall( cudaGLMapBufferObject((void**)&d_result, pbo) );
    cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&d_result, &num_bytes,
                  cuda_pbo_resource));
    boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius, iterations, nthreads);
    // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(pbo));
    cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    if (g_bFBODisplay) {
        g_FrameBufferObject->bindRenderPath();
    }

    // Common display code path
    {
        glClear(GL_COLOR_BUFFER_BIT);

        // load texture from pbo
        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
        glBindTexture(GL_TEXTURE_2D, texid);
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

        // fragment program is required to display floating point texture
        glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader);
        glEnable(GL_FRAGMENT_PROGRAM_ARB);
        glDisable(GL_DEPTH_TEST);

        glBegin(GL_QUADS);
        if (GL_TEXTURE_TYPE == GL_TEXTURE_2D) {
            glTexCoord2f(0.0f, 0.0f);
            glVertex2f(0.0f, 0.0f);
            glTexCoord2f(1.0f, 0.0f);
            glVertex2f(1.0f, 0.0f);
            glTexCoord2f(1.0f, 1.0f);
            glVertex2f(1.0f, 1.0f);
            glTexCoord2f(0.0f, 1.0f);
            glVertex2f(0.0f, 1.0f);
        } else {
            glTexCoord2f(0.0f, 0.0f);
            glVertex2f(0.0f, 0.0f);
            glTexCoord2f((float)width, 0.0f);
            glVertex2f(1.0f, 0.0f);
            glTexCoord2f((float)width, (float)height);
            glVertex2f(1.0f, 1.0f);
            glTexCoord2f(0.0f, (float)height);
            glVertex2f(0.0f, 1.0f);
        }
        glEnd();
        glBindTexture(GL_TEXTURE_TYPE, 0);
        glDisable(GL_FRAGMENT_PROGRAM_ARB);
    }

    if (g_bFBODisplay) {
        g_FrameBufferObject->unbindRenderPath();

    }

    if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) {
        // readback for QA testing
        if (g_bFBODisplay) {
            shrLog("> (Frame %d) Readback FBO\n", frameCount);
            g_CheckRender->readback( width, height, g_FrameBufferObject->getFbo() );
        } else {
            shrLog("> (Frame %d) Readback BackBuffer\n", frameCount);
            g_CheckRender->readback( width, height );
        }
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Verify = false;
    }

    glutSwapBuffers();
    glutReportErrors();

    cutilCheckError(cutStopTimer(timer));

    computeFPS();
}
int main(int argc, char **argv)
{
    // Start logs
    shrSetLogFileName ("quasirandomGenerator.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    
    unsigned int useDoublePrecision;

    char *precisionChoice;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL)
        useDoublePrecision = 0;
    else{
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }

    unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION];

    float
        *h_OutputGPU;

    float
        *d_Output;

    int
        dim, pos;

    double
        delta, ref, sumDelta, sumRef, L1norm, gpuTime;

    unsigned int hTimer;

    if(sizeof(INT64) != 8){
        shrLog("sizeof(INT64) != 8\n");
        return 0;
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    cutilCheckError(cutCreateTimer(&hTimer));

    int deviceIndex;
    cutilSafeCall(cudaGetDevice(&deviceIndex));
    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, deviceIndex));
    int version = deviceProp.major * 10 + deviceProp.minor;
    if(useDoublePrecision && version < 13){
        shrLog("Double precision not supported.\n");
        cudaThreadExit();
        return 0;
    }

    shrLog("Allocating GPU memory...\n");
        cutilSafeCall( cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)) );

    shrLog("Allocating CPU memory...\n");
        h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float));

    shrLog("Initializing QRNG tables...\n\n");
        initQuasirandomGenerator(tableCPU);
        if(useDoublePrecision)
            initTable_SM13(tableCPU);
        else
            initTable_SM10(tableCPU);

    shrLog("Testing QRNG...\n\n");
        cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) );
		int numIterations = 20;
		for (int i = -1; i < numIterations; i++)
		{
			if (i == 0)
			{
				cutilSafeCall( cudaThreadSynchronize() );
				cutilCheckError( cutResetTimer(hTimer) );
				cutilCheckError( cutStartTimer(hTimer) );
			}
            if(useDoublePrecision)
                quasirandomGenerator_SM13(d_Output, 0, N);
            else
                quasirandomGenerator_SM10(d_Output, 0, N);
		}
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError(cutStopTimer(hTimer));
        gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3;
	shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", 
                (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); 

    shrLog("\nReading GPU results...\n");
        cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) );

    shrLog("Comparing to the CPU results...\n\n");
        sumDelta = 0;
        sumRef = 0;
        for(dim = 0; dim < QRNG_DIMENSIONS; dim++)
            for(pos = 0; pos < N; pos++){
                ref       = getQuasirandomValue63(pos, dim);
                delta     = (double)h_OutputGPU[dim * N + pos] - ref;
                sumDelta += fabs(delta);
                sumRef   += fabs(ref);
            }
    shrLog("L1 norm: %E\n", sumDelta / sumRef);

    shrLog("\nTesting inverseCNDgpu()...\n\n");
        cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) );
		for (int i = -1; i < numIterations; i++)
		{
			if (i == 0) 
			{
				cutilSafeCall( cudaThreadSynchronize() );
				cutilCheckError( cutResetTimer(hTimer) );
				cutilCheckError( cutStartTimer(hTimer) );
			}
            if(useDoublePrecision)
                inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N);
            else
                inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N);
		}
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError(cutStopTimer(hTimer));
        gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3;
	shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", 
                (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); 

    shrLog("Reading GPU results...\n");
        cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) );

    shrLog("\nComparing to the CPU results...\n");
        sumDelta = 0;
        sumRef = 0;
        for(pos = 0; pos < QRNG_DIMENSIONS * N; pos++){
            double  p = (double)(pos + 1) / (double)(QRNG_DIMENSIONS * N + 1);
            ref       = MoroInvCNDcpu(p);
            delta     = (double)h_OutputGPU[pos] - ref;
            sumDelta += fabs(delta);
            sumRef   += fabs(ref);
        }
    shrLog("L1 norm: %E\n\n", L1norm = sumDelta / sumRef);
    shrLog((L1norm < 1E-6) ? "PASSED\n\n" : "FAILED\n\n");

    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        free(h_OutputGPU);
        cutilSafeCall( cudaFree(d_Output) );

    cudaThreadExit();

    shrEXIT(argc, (const char**)argv);
}
Esempio n. 20
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    //start logs
    shrSetLogFileName ("volumeRender.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
	{
        g_bQAReadback = true;
        fpsLimit = frameCheckNumber;
    }

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) 
	{
        g_bQAGLVerify = true;
        fpsLimit = frameCheckNumber;
    }

    if (g_bQAReadback) {
	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilDeviceInit(argc, argv);
        } else {
            cudaSetDevice( cutGetMaxGflopsDeviceId() );
        }

    } else {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
        }
/*
        int device;
        struct cudaDeviceProp prop;
        cudaGetDevice( &device );
        cudaGetDeviceProperties( &prop, device );
        if( !strncmp( "Tesla", prop.name, 5 ) ) {
            shrLog("This sample needs a card capable of OpenGL and display.\n");
            shrLog("Please choose a different device with the -device=x argument.\n");
            cutilExit(argc, argv);
        }
*/
	}

    // parse arguments
    char *filename;
    if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) {
        volumeFilename = filename;
    }
    int n;
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) {
        volumeSize.width = volumeSize.height = volumeSize.depth = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) {
        volumeSize.width = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) {
        volumeSize.height = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) {
         volumeSize.depth = n;
    }

    // load volume data
    char* path = shrFindFilePath(volumeFilename, argv[0]);
    if (path == 0) {
        shrLog("Error finding file '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType);
    void *h_volume = loadRawFile(path, size);
    
    initCuda(h_volume, volumeSize);
    free(h_volume);

    cutilCheckError( cutCreateTimer( &timer));

    shrLog("Press '=' and '-' to change density\n"
           "      ']' and '[' to change brightness\n"
           "      ';' and ''' to modify transfer function offset\n"
           "      '.' and ',' to modify transfer function scale\n\n");

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));

    if (g_bQAReadback) {
        g_CheckRender = new CheckBackBuffer(width, height, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        uint *d_output;
        cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint)));
        cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint)));

        float modelView[16] = 
        {
            1.0f, 0.0f, 0.0f, 0.0f,
            0.0f, 1.0f, 0.0f, 0.0f,
            0.0f, 0.0f, 1.0f, 0.0f,
            0.0f, 0.0f, 4.0f, 1.0f
        };

        invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12];
        invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13];
        invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14];

        // call CUDA kernel, writing results to PBO
	    copyInvViewMatrix(invViewMatrix, sizeof(float4)*3);
        
        // Start timer 0 and process n loops on the GPU 
        int nIter = 10;
        for (int i = -1; i < nIter; i++)
        {
            if( i == 0 ) {
                cudaThreadSynchronize();
                cutStartTimer(timer); 
            }
            
            render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale);
        }
        cudaThreadSynchronize();
        cutStopTimer(timer);
        // Get elapsed time and throughput, then log to sample and master logs
        double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0);
        shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", 
               (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); 
        

        cutilCheckMsg("Error: render_kernel() execution FAILED");
        cutilSafeCall( cudaThreadSynchronize() );

        cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) );
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);

        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) {
            shrLog("\nFAILED\n\n");
        } else {
            shrLog("\nPASSED\n\n");
        }

        cudaFree(d_output);
    	freeCudaBuffers();

        if (g_CheckRender) {
            delete g_CheckRender; g_CheckRender = NULL;
        }

    } else {
        // This is the normal rendering path for VolumeRender
        glutDisplayFunc(display);
        glutKeyboardFunc(keyboard);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutReshapeFunc(reshape);
        glutIdleFunc(idle);

        initPixelBuffer();

        if (g_bQAGLVerify) {
            g_CheckRender = new CheckBackBuffer(width, height, 4);
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }
        atexit(cleanup);

        glutMainLoop();
    }

    cudaThreadExit();
    shrEXIT(argc, (const char**)argv);
}
void quickshift(image_t im, float sigma, float tau, float * map, float * gaps, float * E)
{
  int verb = 1 ;

  float *M = 0, *n = 0;
  float tau2;
  
  int K, d;
  int N1,N2, i1,i2, j1,j2, R, tR;

  int medoid = 0 ;

  float const * I = im.I;
  N1 = im.N1;
  N2 = im.N2;
  K = im.K;

  d = 2 + K ; /* Total dimensions include spatial component (x,y) */
  
  tau2  = tau*tau;

  
  if (medoid) { /* n and M are only used in mediod shift */
    M = (float *) calloc(N1*N2*d, sizeof(float)) ;
    n = (float *) calloc(N1*N2,   sizeof(float)) ;
  }

  R = (int) ceil (3 * sigma) ;
  tR = (int) ceil (tau) ;
  
  if (verb) {
    printf("quickshift: [N1,N2,K]: [%d,%d,%d]\n", N1,N2,K) ;
    printf("quickshift: type: %s\n", medoid ? "medoid" : "quick");
    printf("quickshift: sigma:   %g\n", sigma) ;
    /* R is ceil(3 * sigma) and determines the window size to accumulate
     * similarity */
    printf("quickshift: R:       %d\n", R) ; 
    printf("quickshift: tau:     %g\n", tau) ;
    printf("quickshift: tR:      %d\n", tR) ;
  }

  /* -----------------------------------------------------------------
   *                                                                 n 
   * -------------------------------------------------------------- */

  /* If we are doing medoid shift, initialize n to the inner product of the
   * image with itself
   */
  if (n) { 
    for (i2 = 0 ; i2 < N2 ; ++ i2) {
      for (i1 = 0 ; i1 < N1 ; ++ i1) {        
        n [i1 + N1 * i2] = inner(I,N1,N2,K,
                                 i1,i2,
                                 i1,i2) ;
      }
    }
  }
  
  unsigned int Etimer;
  cutilCheckError( cutCreateTimer(&Etimer) );
  cutilCheckError( cutResetTimer(Etimer) );
  cutilCheckError( cutStartTimer(Etimer) );

  /* -----------------------------------------------------------------
   *                                                 E = - [oN'*F]', M
   * -------------------------------------------------------------- */
  
  /* 
     D_ij = d(x_i,x_j)
     E_ij = exp(- .5 * D_ij / sigma^2) ;
     F_ij = - E_ij             
     E_i  = sum_j E_ij
     M_di = sum_j X_j F_ij

     E is the parzen window estimate of the density
     0 = dissimilar to everything, windowsize = identical
  */
  
  for (i2 = 0 ; i2 < N2 ; ++ i2) {
    for (i1 = 0 ; i1 < N1 ; ++ i1) {
      
      float Ei = 0;
      int j1min = VL_MAX(i1 - R, 0   ) ;
      int j1max = VL_MIN(i1 + R, N1-1) ;
      int j2min = VL_MAX(i2 - R, 0   ) ;
      int j2max = VL_MIN(i2 + R, N2-1) ;      
      
      /* For each pixel in the window compute the distance between it and the
       * source pixel */
      for (j2 = j2min ; j2 <= j2max ; ++ j2) {
        for (j1 = j1min ; j1 <= j1max ; ++ j1) {
          float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ;          
          /* Make distance a similarity */ 
          float Fij = exp(- Dij / (2*sigma*sigma)) ;

          /* E is E_i above */
          Ei += Fij;
          
          if (M) {
            /* Accumulate votes for the median */
            int k ;
            M [i1 + N1*i2 + (N1*N2) * 0] += j1 * Fij ;
            M [i1 + N1*i2 + (N1*N2) * 1] += j2 * Fij ;
            for (k = 0 ; k < K ; ++k) {
              M [i1 + N1*i2 + (N1*N2) * (k+2)] += 
                I [j1 + N1*j2 + (N1*N2) * k] * Fij ;
            }
          } 
          
        } /* j1 */ 
      } /* j2 */
      /* Normalize */
      E [i1 + N1 * i2] = Ei / ((j1max-j1min)*(j2max-j2min));
      
      /*E [i1 + N1 * i2] = Ei ; */

    }  /* i1 */
  } /* i2 */
  
  cutilCheckError( cutStopTimer(Etimer) );
  float ETime = cutGetTimerValue(Etimer);
  printf("ComputeE: %fms\n", ETime);

  unsigned int Ntimer;
  cutilCheckError( cutCreateTimer(&Ntimer) );
  cutilCheckError( cutResetTimer(Ntimer) );
  cutilCheckError( cutStartTimer(Ntimer) );
 
  /* -----------------------------------------------------------------
   *                                               Find best neighbors
   * -------------------------------------------------------------- */
  
  if (medoid) {
    
    /* 
       Qij = - nj Ei - 2 sum_k Gjk Mik
       n is I.^2
    */
    
    /* medoid shift */
    for (i2 = 0 ; i2 < N2 ; ++i2) {
      for (i1 = 0 ; i1 < N1 ; ++i1) {
        
        float sc_best = 0  ;
        /* j1/j2 best are the best indicies for each i */
        float j1_best = i1 ;
        float j2_best = i2 ; 
        
        int j1min = VL_MAX(i1 - R, 0   ) ;
        int j1max = VL_MIN(i1 + R, N1-1) ;
        int j2min = VL_MAX(i2 - R, 0   ) ;
        int j2max = VL_MIN(i2 + R, N2-1) ;      
        
        for (j2 = j2min ; j2 <= j2max ; ++ j2) {
          for (j1 = j1min ; j1 <= j1max ; ++ j1) {            
            
            float Qij = - n [j1 + j2 * N1] * E [i1 + i2 * N1] ;
            int k ;

            Qij -= 2 * j1 * M [i1 + i2 * N1 + (N1*N2) * 0] ;
            Qij -= 2 * j2 * M [i1 + i2 * N1 + (N1*N2) * 1] ;
            for (k = 0 ; k < K ; ++k) {
              Qij -= 2 * 
                I [j1 + j2 * N1 + (N1*N2) * k] *
                M [i1 + i2 * N1 + (N1*N2) * (k + 2)] ;
            }
            
            if (Qij > sc_best) {
              sc_best = Qij ;
              j1_best = j1 ;
              j2_best = j2 ;
            }
          }
        }

        /* map_i is the linear index of j which is the best pair (in matlab
         * notation
         * gaps_i is the score of the best match
         */
        map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /*+ 1 ; */
        gaps[i1 + N1 * i2] = sc_best ;
      }
    }  

  } else {
    
    /* Quickshift assigns each i to the closest j which has an increase in the
     * density (E). If there is no j s.t. Ej > Ei, then gaps_i == inf (a root
     * node in one of the trees of merges).
     */
    for (i2 = 0 ; i2 < N2 ; ++i2) {
      for (i1 = 0 ; i1 < N1 ; ++i1) {
        
        float E0 = E [i1 + N1 * i2] ;
        float d_best = INF ;
        float j1_best = i1   ;
        float j2_best = i2   ; 
        
        int j1min = VL_MAX(i1 - tR, 0   ) ;
        int j1max = VL_MIN(i1 + tR, N1-1) ;
        int j2min = VL_MAX(i2 - tR, 0   ) ;
        int j2max = VL_MIN(i2 + tR, N2-1) ;      
        
        for (j2 = j2min ; j2 <= j2max ; ++ j2) {
          for (j1 = j1min ; j1 <= j1max ; ++ j1) {            
            if (E [j1 + N1 * j2] > E0) {
              float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ;           
              if (Dij <= tau2 && Dij < d_best) {
                d_best = Dij ;
                j1_best = j1 ;
                j2_best = j2 ;
              }
            }
          }
        }
        
        /* map is the index of the best pair */
        /* gaps_i is the minimal distance, inf implies no Ej > Ei within
         * distance tau from the point */
        map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /* + 1 ; */
        if (map[i1 + N1 * i2] != i1 + N1 * i2)
          gaps[i1 + N1 * i2] = sqrt(d_best) ;
        else
          gaps[i1 + N1 * i2] = d_best; /* inf */
      }
    }  
  }
  
  if (M) free(M) ;
  if (n) free(n) ;
  
  cutilCheckError( cutStopTimer(Ntimer) );
  float NTime = cutGetTimerValue(Ntimer);
  printf("ComputeN: %fms\n", NTime);

}
void displayFunc(void){
	cutStartTimer(hTimer);
    TColor *d_dst = NULL;
	size_t num_bytes;

    if(frameCounter++ == 0) cutResetTimer(hTimer);
    // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_dst, gl_PBO));
    cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
	cutilCheckMsg("cudaGraphicsMapResources failed");
    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &num_bytes, cuda_pbo_resource));
	cutilCheckMsg("cudaGraphicsResourceGetMappedPointer failed");

    cutilSafeCall( CUDA_Bind2TextureArray()                      );

    runImageFilters(d_dst);

    cutilSafeCall( CUDA_UnbindTexture()     );
    // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(gl_PBO));
	cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

	if (g_bFBODisplay) {
		g_FrameBufferObject->bindRenderPath();
	}

    // Common display code path
	{
        glClear(GL_COLOR_BUFFER_BIT);

        glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0) );
        glBegin(GL_TRIANGLES);
            glTexCoord2f(0, 0); glVertex2f(-1, -1);
            glTexCoord2f(2, 0); glVertex2f(+3, -1);
            glTexCoord2f(0, 2); glVertex2f(-1, +3);
        glEnd();
        glFinish();
    }

	if (g_bFBODisplay) {
		g_FrameBufferObject->unbindRenderPath();
        glBindTexture(GL_TEXTURE_2D, 0);
    }

    if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) {
        printf("> (Frame %d) readback BackBuffer\n", frameCount);
        if (g_bFBODisplay) {
            g_CheckRender->readback( imageW, imageH, g_FrameBufferObject->getFbo() );
        } else {
            g_CheckRender->readback( imageW, imageH );
        }
        g_CheckRender->savePPM ( sOriginal[g_Kernel], true, NULL );
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Verify = false;
    }

    if(frameCounter == frameN){
        frameCounter = 0;
        if(g_FPS){
            printf("FPS: %3.1f\n", frameN / (cutGetTimerValue(hTimer) * 0.001) );
            g_FPS = false;
        }
    }

	glutSwapBuffers();

	cutStopTimer(hTimer);
	computeFPS();
}
Esempio n. 23
0
int main(int argc, char **argv) {
    char *precisionChoice;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL)
        useDoublePrecision = 0;
    else {
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }

    const int MAX_GPU_COUNT = 8;
    const int         OPT_N = 256;
    const int        PATH_N = 1 << 18;
    const unsigned int SEED = 777;

    //Input data array
    TOptionData optionData[OPT_N];
    //Final GPU MC results
    TOptionValue callValueGPU[OPT_N];
    //"Theoretical" call values by Black-Scholes formula
    float callValueBS[OPT_N];
    //Solver config
    TOptionPlan optionSolver[MAX_GPU_COUNT];
    //OS thread ID
    CUTThread threadID[MAX_GPU_COUNT];


    //GPU number present in the system
    int GPU_N;
    int gpuBase, gpuIndex;
    int i;

    //Timer
    unsigned int hTimer;
    float time;

    double
    delta, ref, sumDelta, sumRef, sumReserve;

    cutilSafeCall( cudaGetDeviceCount(&GPU_N) );
    cutilCheckError( cutCreateTimer(&hTimer) );

#ifdef _EMU
    GPU_N = 1;
#endif
    printf("main(): generating input data...\n");
    srand(123);
    for(i = 0; i < OPT_N; i++) {
        optionData[i].S = randFloat(5.0f, 50.0f);
        optionData[i].X = randFloat(10.0f, 25.0f);
        optionData[i].T = randFloat(1.0f, 5.0f);
        optionData[i].R = 0.06f;
        optionData[i].V = 0.10f;
        callValueGPU[i].Expected   = -1.0f;
        callValueGPU[i].Confidence = -1.0f;
    }

    printf("main(): starting %i host threads...\n", GPU_N);
    //Get option count for each GPU
    for(i = 0; i < GPU_N; i++)
        optionSolver[i].optionCount = OPT_N / GPU_N;
    //Take into account cases with "odd" option counts
    for(i = 0; i < (OPT_N % GPU_N); i++)
        optionSolver[i].optionCount++;

    //Assign GPU option ranges
    gpuBase = 0;
    for(i = 0; i < GPU_N; i++) {
        optionSolver[i].device     = i;
        optionSolver[i].optionData = optionData   + gpuBase;
        optionSolver[i].callValue  = callValueGPU + gpuBase;
        optionSolver[i].seed       = SEED;
        optionSolver[i].pathN      = PATH_N;
        gpuBase += optionSolver[i].optionCount;
    }

    //Start the timer
    cutilCheckError( cutResetTimer(hTimer) );
    cutilCheckError( cutStartTimer(hTimer) );

    //Start CPU thread for each GPU
    for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
        threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);

    //Stop the timer
    cutilCheckError( cutStopTimer(hTimer) );
    time = cutGetTimerValue(hTimer);

    printf("main(): waiting for GPU results...\n");
    cutWaitForThreads(threadID, GPU_N);

    printf("main(): GPU statistics\n");
    for(i = 0; i < GPU_N; i++) {
        printf("GPU #%i\n", optionSolver[i].device);
        printf("Options         : %i\n", optionSolver[i].optionCount);
        printf("Simulation paths: %i\n", optionSolver[i].pathN);
    }
    printf("\nTotal time (ms.): %f\n", time);
    printf("Options per sec.: %f\n", OPT_N / (time * 0.001));

#ifdef DO_CPU
    printf("main(): running CPU MonteCarlo...\n");
    TOptionValue callValueCPU;
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++) {
        MonteCarloCPU(
            callValueCPU,
            optionData[i],
            NULL,
            PATH_N
        );
        delta     = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
        ref       = callValueCPU.Expected;
        sumDelta += delta;
        sumRef   += fabs(ref);
        printf("Exp : %f | %f\t", callValueCPU.Expected,   callValueGPU[i].Expected);
        printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
    }
    printf("L1 norm: %E\n", sumDelta / sumRef);
#endif

    printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
    sumDelta   = 0;
    sumRef     = 0;
    sumReserve = 0;
    for(i = 0; i < OPT_N; i++) {
        BlackScholesCall(
            callValueBS[i],
            optionData[i]
        );
        delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
        ref       = callValueBS[i];
        sumDelta += delta;
        sumRef   += fabs(ref);
        if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta;
#ifdef PRINT_RESULTS
        printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
    }
    sumReserve /= OPT_N;
    printf("L1 norm        : %E\n", sumDelta / sumRef);
    printf("Average reserve: %f\n", sumReserve);
    printf((sumReserve > 1.0f) ? "PASSED\n" : "FAILED.\n");

    printf("Shutting down...\n");

    cutilCheckError( cutDeleteTimer(hTimer) );
    cutilExit(argc, argv);
}
Esempio n. 24
0
void display()
{
    cutilCheckError(cutStartTimer(timer));  

    // update the simulation
    if (!bPause)
    {
        psystem->setIterations(iterations);
        psystem->setDamping(damping);
        psystem->setGravity(-gravity);
        psystem->setCollideSpring(collideSpring);
        psystem->setCollideDamping(collideDamping);
        psystem->setCollideShear(collideShear);
        psystem->setCollideAttraction(collideAttraction);

        psystem->update(timestep); 
        renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles());
    }
    else
    {
    	usleep(32666);
    }

    // render
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);  

    // view transform
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    for (int c = 0; c < 3; ++c)
    {
        camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia;
        camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia;
    }
    glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]);
    glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0);
    glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0);

    glGetFloatv(GL_MODELVIEW_MATRIX, modelView);

    // cube
    glColor3f(1.0, 1.0, 1.0);
    glutWireCube(2.0);

    // collider
    glPushMatrix();
    float4 p = psystem->getColliderPos();
    glTranslatef(p.x, p.y, p.z);
    glColor3f(1.0, 0.0, 0.0);
    glutSolidSphere(psystem->getColliderRadius(), 20, 10);
    glPopMatrix();

    if (displayEnabled)
    {
        renderer->display(displayMode);
    }

    if (displaySliders) {
        glDisable(GL_DEPTH_TEST);
        glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color
        glEnable(GL_BLEND);
        params->Render(0, 0);
        glDisable(GL_BLEND);
        glEnable(GL_DEPTH_TEST);
    }

    cutilCheckError(cutStopTimer(timer));  

    glutSwapBuffers();

    fpsCount++;
    // this displays the frame rate updated every second (independent of frame rate)
    if (fpsCount >= fpsLimit) {
        char fps[256];
        float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
        sprintf(fps, "CUDA particles (%d particles): %3.1f fps", numParticles, ifps);  
        glutSetWindowTitle(fps);
        fpsCount = 0; 
        fpsLimit = (ifps > 1.f) ? (int)ifps : 1;
        if (bPause) fpsLimit = 0;
        cutilCheckError(cutResetTimer(timer));  
    }

    glutReportErrors();
}
Esempio n. 25
0
////////////////////////////////////////////////////////////////////////////////
//! Display callback
////////////////////////////////////////////////////////////////////////////////
void
display()
{
    cutilCheckError(cutStartTimer(timer));  

    // run CUDA kernel to generate geometry
    if (compute) {
        computeIsosurface();
    }

	if (g_bFBODisplay) {
		g_FrameBufferObject->bindRenderPath();
	}

    // Common display code path
	{
		glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

		// set view matrix
		glMatrixMode(GL_MODELVIEW);
		glLoadIdentity();
		glTranslatef(translate.x, translate.y, translate.z);
		glRotatef(rotate.x, 1.0, 0.0, 0.0);
		glRotatef(rotate.y, 0.0, 1.0, 0.0);

		glPolygonMode(GL_FRONT_AND_BACK, wireframe? GL_LINE : GL_FILL);
		if (lighting) {
			glEnable(GL_LIGHTING);
		}

		// render
		if (render) {
			glPushMatrix();
			glRotatef(180.0, 0.0, 1.0, 0.0);
			glRotatef(90.0, 1.0, 0.0, 0.0);
			renderIsosurface();
			glPopMatrix();
		}

		glDisable(GL_LIGHTING);
	} 

	if (g_bFBODisplay) {
		g_FrameBufferObject->unbindRenderPath();

        // now rebind the texture and renderQuad
//		g_FrameBufferObject->renderQuad(width, height, GL_TEXTURE_TYPE);
	}

    if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) {
        // readback for QA testing
 		if (g_bFBODisplay) {
            printf("> (Frame %d) Readback FBO\n", frameCount);
			g_CheckRender->readback( window_width, window_height, g_FrameBufferObject->getFbo() );
        } else {
            printf("> (Frame %d) Readback BackBuffer\n", frameCount);
            g_CheckRender->readback( window_width, window_height );
        }
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Verify = false;
	}

    glutSwapBuffers();
    glutReportErrors();

    cutilCheckError(cutStopTimer(timer));  

    computeFPS();
}
// main rendering loop
void display()
{
    cutilCheckError(cutStartTimer(timer));

	if( !gestures.m_bPause )
	{
		//Read next available data
		gestures.m_Context.WaitAndUpdateAll();
	}

	//Process the data
	gestures.m_DepthGenerator.GetMetaData( depthMD );
	gestures.m_UserGenerator.GetUserPixels( 0, sceneMD );

    // move camera
    if (cameraPos[1] > 0.0f) cameraPos[1] = 0.0f;
    cameraPosLag += (cameraPos - cameraPosLag) * inertia;
    cameraRotLag += (cameraRot - cameraRotLag) * inertia;
	cursorPosLag += (cursorPos - cursorPosLag) * inertia;

    // view transform
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    glRotatef(cameraRotLag[0], 1.0, 0.0, 0.0);
    glRotatef(cameraRotLag[1], 0.0, 1.0, 0.0);
    glTranslatef(cameraPosLag[0], cameraPosLag[1], cameraPosLag[2]);

    glGetFloatv(GL_MODELVIEW_MATRIX, modelView);

    // update the simulation
    if (!paused)
    {
        if (emitterOn) {
            runEmitter();
        }
        SimParams &p = psystem->getParams();
        p.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z);

        psystem->step(timestep); 
        currentTime += timestep;
    }

    renderer->calcVectors();
    vec3f sortVector = renderer->getSortVector();

    psystem->setSortVector(make_float3(sortVector.x, sortVector.y, sortVector.z));
    psystem->setModelView(modelView);
    psystem->setSorting(sort);
    psystem->depthSort();

    // render
    glClearColor(0.0, 0.0, 0.0, 1.0);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);  
    renderScene();

    // draw particles
    if (displayEnabled)
    {
        // render scene to offscreen buffers to get correct occlusion
        renderer->beginSceneRender(SmokeRenderer::LIGHT_BUFFER);
        renderScene();
        renderer->endSceneRender(SmokeRenderer::LIGHT_BUFFER);

        renderer->beginSceneRender(SmokeRenderer::SCENE_BUFFER);
        renderScene();
        renderer->endSceneRender(SmokeRenderer::SCENE_BUFFER);

        renderer->setPositionBuffer(psystem->getPosBuffer());
        renderer->setVelocityBuffer(psystem->getVelBuffer());
        renderer->setIndexBuffer(psystem->getSortedIndexBuffer());

        renderer->setNumParticles(psystem->getNumParticles());
        renderer->setParticleRadius(spriteSize);
        renderer->setDisplayLightBuffer(displayLightBuffer);
        renderer->setAlpha(alpha);
        renderer->setShadowAlpha(shadowAlpha);
		renderer->setLightPosition(lightPos);
		renderer->setColorAttenuation(colorAttenuation);
		renderer->setLightColor(lightColor);
		renderer->setNumSlices(numSlices);
		renderer->setNumDisplayedSlices(numDisplayedSlices);
		renderer->setBlurRadius(blurRadius);

        renderer->render();

		if (drawVectors) {
			renderer->debugVectors();
		}
    }

    // display sliders
    if (displaySliders) {
        glDisable(GL_DEPTH_TEST);
        glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color
        glEnable(GL_BLEND);
        params->Render(0, 0);
        glDisable(GL_BLEND);
        glEnable(GL_DEPTH_TEST);
    }

    glutSwapBuffers();
    glutReportErrors();
    cutilCheckError(cutStopTimer(timer));  

    // readback for verification//sw/devrel/SDK10/Compute/projects/recursiveGaussian/recursiveGaussian.cpp
    if (g_CheckRender && g_CheckRender->IsQAReadback() && (++frameNumber >= frameCheckNumber)) {
        printf("> (Frame %d) Readback BackBuffer\n", frameNumber);
        g_CheckRender->readback( winWidth, winHeight );
        g_CheckRender->savePPM(sOriginal, true, NULL);
        bool passed = g_CheckRender->PPMvsPPM(sOriginal, sReference, MAX_EPSILON_ERROR, THRESHOLD);
        printf("Summary: %d errors!\n", passed ? 0 : 1);
		printf("%s\n", passed ? "PASSED" : "FAILED");
        cleanup();
		exit(0);
    }

    fpsCount++;
    // this displays the frame rate updated every second (independent of frame rate)
    if (fpsCount >= fpsLimit) {
        char fps[256];
        float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
        sprintf(fps, "CUDA Smoke Particles (%d particles): %3.1f fps", numParticles, ifps);  
        glutSetWindowTitle(fps);
        fpsCount = 0; 
        fpsLimit = (ifps > 1.f) ? (int)ifps : 1;
        if (paused) fpsLimit = 0;
        cutilCheckError(cutResetTimer(timer));  
    }


}
Esempio n. 27
0
// display results using OpenGL (called by GLUT)
void display()
{
    cutilCheckError(cutStartTimer(timer));  

    // use OpenGL to build view matrix
    GLfloat modelView[16];
    glMatrixMode(GL_MODELVIEW);
    glPushMatrix();
        glLoadIdentity();
        glRotatef(-viewRotation.x, 1.0, 0.0, 0.0);
        glRotatef(-viewRotation.y, 0.0, 1.0, 0.0);
        glTranslatef(-viewTranslation.x, -viewTranslation.y, -viewTranslation.z);
    glGetFloatv(GL_MODELVIEW_MATRIX, modelView);
    glPopMatrix();

    invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12];
    invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13];
    invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14];

    render();

    // display results
    glClear(GL_COLOR_BUFFER_BIT);

    // draw image from PBO
    glDisable(GL_DEPTH_TEST);

    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
#if 0
    // draw using glDrawPixels (slower)
    glRasterPos2i(0, 0);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
#else
    // draw using texture

    // copy from pbo to texture
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glBindTexture(GL_TEXTURE_2D, tex);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    // draw textured quad
    glEnable(GL_TEXTURE_2D);
    glBegin(GL_QUADS);
    glTexCoord2f(0, 0); glVertex2f(0, 0);
    glTexCoord2f(1, 0); glVertex2f(1, 0);
    glTexCoord2f(1, 1); glVertex2f(1, 1);
    glTexCoord2f(0, 1); glVertex2f(0, 1);
    glEnd();

    glDisable(GL_TEXTURE_2D);
    glBindTexture(GL_TEXTURE_2D, 0);
#endif

    if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) {
        // readback for QA testing
        shrLog("\n> (Frame %d) Readback BackBuffer\n", frameCount);
        g_CheckRender->readback( width, height );
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) {
            g_TotalErrors++;
        }
        g_Verify = false;
    }
    glutSwapBuffers();
    glutReportErrors();

    cutilCheckError(cutStopTimer(timer));  

    computeFPS();
}
Esempio n. 28
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv){
    uint 
        *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint 
        *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    uint hTimer;

    const uint   N = 4 * 1048576;
    const uint DIR = 1;

    const uint numValues = 65536;


    printf("Allocating and initializing host arrays...\n\n");
        cutCreateTimer(&hTimer);
        h_SrcKey = (uint *)malloc(N * sizeof(uint));
        h_SrcVal = (uint *)malloc(N * sizeof(uint));
        h_DstKey = (uint *)malloc(N * sizeof(uint));
        h_DstVal = (uint *)malloc(N * sizeof(uint));

        srand(2009);
        for(uint i = 0; i < N; i++)
            h_SrcKey[i] = rand() % numValues;
        fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
        cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) );
        cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) );
        cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) );
        cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) );

    printf("Initializing GPU merge sort...\n");
        initMergeSort();

    printf("Running GPU merge sort...\n");
        cutilSafeCall( cudaThreadSynchronize() );
        cutResetTimer(hTimer);
        cutStartTimer(hTimer);
            mergeSort(
                d_DstKey,
                d_DstVal,
                d_BufKey,
                d_BufVal,
                d_SrcKey,
                d_SrcVal,
                N,
                DIR
            );
        cutilSafeCall( cudaThreadSynchronize() );
        cutStopTimer(hTimer);
    printf("Time: %f ms\n", cutGetTimerValue(hTimer));

    printf("Reading back GPU merge sort results...\n");
        cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) );
        cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) );

    printf("Inspecting the results...\n");
        uint keysFlag = validateSortedKeys(
            h_DstKey,
            h_SrcKey,
            1,
            N,
            numValues,
            DIR
        );

        uint valuesFlag = validateSortedValues(
            h_DstKey,
            h_DstVal,
            h_SrcKey,
            1,
            N
        );

    printf( (keysFlag && valuesFlag) ? "TEST PASSED\n" : "TEST FAILED\n");

    printf("Shutting down...\n");
        closeMergeSort();
        cutilCheckError( cutDeleteTimer(hTimer) );
        cutilSafeCall( cudaFree(d_SrcVal) );
        cutilSafeCall( cudaFree(d_SrcKey) );
        cutilSafeCall( cudaFree(d_BufVal) );
        cutilSafeCall( cudaFree(d_BufKey) );
        cutilSafeCall( cudaFree(d_DstVal) );
        cutilSafeCall( cudaFree(d_DstKey) );
        free(h_DstVal);
        free(h_DstKey);
        free(h_SrcVal);
        free(h_SrcKey);
        cudaThreadExit();
        cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv){
    const unsigned int OPT_N_MAX = 512;
    unsigned int useDoublePrecision;

    printf("[binomialOptions]\n");

    int devID = cutilDeviceInit(argc, argv);
    if (devID < 0) {
       printf("exiting...\n");
       cutilExit(argc, argv);
       exit(0);
    }

    cutilSafeCall(cudaGetDevice(&devID));
    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devID));

    char *precisionChoice;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL) {
        useDoublePrecision = 0;
    } else {
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }
    printf(useDoublePrecision ? "Using double precision...\n" : "Using single precision...\n");
    const int OPT_N = deviceEmulation() ? 1 : OPT_N_MAX;

    TOptionData optionData[OPT_N_MAX];
    float
        callValueBS[OPT_N_MAX],
        callValueGPU[OPT_N_MAX],
        callValueCPU[OPT_N_MAX];

    double
        sumDelta, sumRef, gpuTime, errorVal;

    unsigned int hTimer;
    int i;

    cutilCheckError( cutCreateTimer(&hTimer) );

    int version = deviceProp.major * 10 + deviceProp.minor;
    if(useDoublePrecision && version < 13){
        printf("Double precision is not supported.\n");
        return 0;
    }

    printf("Generating input data...\n");
        //Generate options set
        srand(123);
        for(i = 0; i < OPT_N; i++){
            optionData[i].S = randData(5.0f, 30.0f);
            optionData[i].X = randData(1.0f, 100.0f);
            optionData[i].T = randData(0.25f, 10.0f);
            optionData[i].R = 0.06f;
            optionData[i].V = 0.10f;
            BlackScholesCall(callValueBS[i], optionData[i]);
        }

    printf("Running GPU binomial tree...\n");
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError( cutResetTimer(hTimer) );
        cutilCheckError( cutStartTimer(hTimer) );

        if(useDoublePrecision)
            binomialOptions_SM13(callValueGPU, optionData, OPT_N);
        else
            binomialOptions_SM10(callValueGPU, optionData, OPT_N);

        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError( cutStopTimer(hTimer) );
        gpuTime = cutGetTimerValue(hTimer);
    printf("Options count            : %i     \n", OPT_N);
    printf("Time steps               : %i     \n", NUM_STEPS);
    printf("binomialOptionsGPU() time: %f msec\n", gpuTime);
    printf("Options per second       : %f     \n", OPT_N / (gpuTime * 0.001));

    printf("Running CPU binomial tree...\n");
        for(i = 0; i < OPT_N; i++)
            binomialOptionsCPU(callValueCPU[i], optionData[i]);

    printf("Comparing the results...\n");
    sumDelta = 0;
    sumRef   = 0;
    printf("GPU binomial vs. Black-Scholes\n");
    for(i = 0; i < OPT_N; i++){
        sumDelta += fabs(callValueBS[i] - callValueGPU[i]);
        sumRef += fabs(callValueBS[i]);
    }
    if(sumRef >1E-5)
        printf("L1 norm: %E\n", sumDelta / sumRef);
    else
        printf("Avg. diff: %E\n", sumDelta / (double)OPT_N);

    printf("CPU binomial vs. Black-Scholes\n");
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++){
        sumDelta += fabs(callValueBS[i]- callValueCPU[i]);
        sumRef += fabs(callValueBS[i]);
    }
    if(sumRef >1E-5)
        printf("L1 norm: %E\n", sumDelta / sumRef);
    else
        printf("Avg. diff: %E\n", sumDelta / (double)OPT_N);

    printf("CPU binomial vs. GPU binomial\n");
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++){
        sumDelta += fabs(callValueGPU[i] - callValueCPU[i]);
        sumRef += callValueCPU[i];
    }
    if(sumRef > 1E-5)
        printf("L1 norm: %E\n", errorVal = sumDelta / sumRef);
    else
        printf("Avg. diff: %E\n", errorVal = sumDelta / (double)OPT_N);

    printf("Shutting down...\n");

	printf("\n[binomialOptions] - Test Summary:\n");
    printf((errorVal < 5e-4) ? "PASSED\n" : "FAILED\n");

    cutilCheckError( cutDeleteTimer(hTimer) );

    cudaThreadExit();

    cutilExit(argc, argv);
}
Esempio n. 30
0
T benchmarkReduceMax(int  n, 
                  int  numThreads,
                  int  numBlocks,
                  int  maxThreads,
                  int  maxBlocks,
                  int  whichKernel, 
                  int  testIterations,
                  bool cpuFinalReduction,
                  int  cpuFinalThreshold,
                  unsigned int timer,
                  T* h_odata,
                  T* d_idata, 
                  T* d_odata)
{
    T gpu_result = 0;
    bool needReadBack = true;

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;

        cutilDeviceSynchronize();
        cutilCheckError( cutStartTimer( timer));

        // execute the kernel
        maxreduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        // check if kernel execution generated an error
        cutilCheckMsg("Kernel execution failed");

        if (cpuFinalReduction)
        {
            // sum partial sums from each block on CPU        
            // copy result from device to host
            cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost) );

            for(int i=0; i<numBlocks; i++) 
            {
                gpu_result += h_odata[i];
            }

            needReadBack = false;
        }
        else
        {
            // sum partial block sums on GPU
            int s=numBlocks;
            int kernel = whichKernel;
            while(s > cpuFinalThreshold) 
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
                
                maxreduce<T>(s, threads, blocks, kernel, d_odata, d_odata);
                
                if (kernel < 3)
                    s = (s + threads - 1) / threads;
                else
                    s = (s + (threads*2-1)) / (threads*2);
            }
            
            if (s > 1)
            {
                // copy result from device to host
                cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost) );

                for(int i=0; i < s; i++) 
                {
                    gpu_result += h_odata[i];
                }

                needReadBack = false;
            }
        }

        cutilDeviceSynchronize();
        cutilCheckError( cutStopTimer(timer) );      
    }

    if (needReadBack)
    {
        // copy final sum from device to host
        cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) );
    }

    return gpu_result;
}