Example #1
0
void copy_image(PPM_IMG img_in)
{
    StopWatchInterface *timer=NULL;

    PPM_IMG host_img;
    PPM_IMG device_img;

    int size = img_in.w * img_in.h * sizeof(unsigned char);

    host_img.w = img_in.w;
    host_img.h = img_in.h;
    host_img.img_r = (unsigned char *)malloc(size);
    host_img.img_g = (unsigned char *)malloc(size);
    host_img.img_b = (unsigned char *)malloc(size);

    device_img.w = img_in.w;
    device_img.h = img_in.h;
    cudaMalloc((void **)&(device_img.img_r), size);
    cudaMalloc((void **)&(device_img.img_g), size);
    cudaMalloc((void **)&(device_img.img_b), size);

    launchEmptyKernel();    // lauch an empty kernel
    printf("Starting copy image...\n");

    // CPU to GPU
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    cudaMemcpy(device_img.img_r, img_in.img_r, size, cudaMemcpyHostToDevice);
    cudaMemcpy(device_img.img_g, img_in.img_g, size, cudaMemcpyHostToDevice);
    cudaMemcpy(device_img.img_b, img_in.img_b, size, cudaMemcpyHostToDevice);
    sdkStopTimer(&timer);
    printf("Time of copy image from CPU to GPU: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // GPU to CPU
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    cudaMemcpy(host_img.img_r, device_img.img_r, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_img.img_g, device_img.img_g, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_img.img_b, device_img.img_b, size, cudaMemcpyDeviceToHost);
    sdkStopTimer(&timer);
    printf("Time of copy image from GPU to CPU: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    cudaFree(device_img.img_r);
    cudaFree(device_img.img_g);
    cudaFree(device_img.img_b);

    free(host_img.img_r);
    free(host_img.img_g);
    free(host_img.img_b);
}
void
benchmark(int iterations)
{
    // allocate memory for result
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    checkCudaErrors(cudaMalloc((void **) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStartTimer(&timer);

    // execute the kernel
    for (int i = 0; i < iterations; i++)
    {
        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width*height*iterations / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);

    checkCudaErrors(cudaFree(d_result));
}
void siTest(T *d_ptclA, T *d_ptclA_new, T *d_wghtA, unsigned int size, int stateDim)
{
    int blocks, threads;
    float elapsedTimeInMs = 0.0f;
    threads = BLOCK_SIZE;
    blocks = (size + threads - 1) / threads;
#ifdef NVS
    while (blocks > GRID_LIMIT){
        blocks >>= 1;
        threads <<= 1;
    }
#endif
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

    for (int i = 0 ; i < TEST_ITERATIONS ; i ++){
        cudaDeviceSynchronize();
        sdkStartTimer(&timer);

        SI<T>(blocks, threads, d_ptclA, d_ptclA_new, d_wghtA, size, stateDim);

        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timer);
    }
    elapsedTimeInMs = sdkGetAverageTimerValue(&timer);
    printf("%f\t", elapsedTimeInMs);
    printf("size=%u, stateDim=%d, blocks=%d, threads=%d\n",size, stateDim, blocks, threads);

    sdkDeleteTimer(&timer);
}
Example #4
0
void runBenchmark(int iterations, char *exec_path)
{
    printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations);
    cudaDeviceSynchronize();
    sdkStartTimer(&timer);

    for (int i = 0; i < iterations; ++i)
    {
        psystem->update(timestep);
    }

    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    float fAvgSeconds = ((float)1.0e-3 * (float)sdkGetTimerValue(&timer)/(float)iterations);

    printf("particles, Throughput = %.4f KParticles/s, Time = %.5f s, Size = %u particles, NumDevsUsed = %u, Workgroup = %u\n",
           (1.0e-3 * numParticles)/fAvgSeconds, fAvgSeconds, numParticles, 1, 0);

    if (g_refFile)
    {
        printf("\nChecking result...\n\n");
        float *hPos = (float *)malloc(sizeof(float)*4*psystem->getNumParticles());
        copyArrayFromDevice(hPos, psystem->getCudaPosVBO(),
                            0, sizeof(float)*4*psystem->getNumParticles());

        sdkDumpBin((void *)hPos, sizeof(float)*4*psystem->getNumParticles(), "particles.bin");

        if (!sdkCompareBin2BinFloat("particles.bin", g_refFile, sizeof(float)*4*psystem->getNumParticles(),
                                    MAX_EPSILON_ERROR, THRESHOLD, exec_path))
        {
            g_TotalErrors++;
        }
    }
}
void runBenchmark(int iterations)
{
    printf("[%s] (Benchmark Mode)\n", sSDKsample);

    sdkCreateTimer(&timer);

    uchar4 *d_output;
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes,
                    cuda_pbo_resource));

    sdkStartTimer(&timer);

    for (int i = 0; i < iterations; ++i)
    {
        render(imageWidth, imageHeight, tx, ty, scale, cx, cy,
               blockSize, gridSize, g_FilterMode, d_output);
    }

    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    float time = sdkGetTimerValue(&timer) / (float) iterations;

    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    printf("time: %0.3f ms, %f Mpixels/sec\n", time, (width*height / (time * 0.001f)) / 1e6);
}
Example #6
0
void runAutoTest(const char *ref_file, char *exec_path)
{
    checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(GLubyte)*4));

    // render the volumeData
    render_kernel(gridSize, blockSize, d_output, width, height, w);

    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("render_kernel failed");

    void *h_output = malloc(width*height*sizeof(GLubyte)*4);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*sizeof(GLubyte)*4, cudaMemcpyDeviceToHost));
    sdkDumpBin(h_output, width*height*sizeof(GLubyte)*4, "simpleTexture3D.bin");

    bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), width*height,
                                              MAX_EPSILON_ERROR, THRESHOLD, exec_path);

    checkCudaErrors(cudaFree(d_output));
    free(h_output);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    sdkStopTimer(&timer);
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #7
0
void display(void)
{
	pthread_mutex_lock(&display_mutex);

    if (!ref_file)
    {
        sdkStartTimer(&timer);
        simulateFluids();
    }

    // render points from vertex buffer
    glClear(GL_COLOR_BUFFER_BIT);
    glClearColor(1.f/256*172, 1.f/256*101, 1.f/256*4, 0.f);
    glColor4f(1.f, 1.f, 1.f, 0.5f);
    glPointSize(1);
    glEnable(GL_POINT_SMOOTH);
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDisable(GL_DEPTH_TEST);
    glDisable(GL_CULL_FACE);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
#ifdef OPTIMUS
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);
#endif
    glVertexPointer(2, GL_FLOAT, 0, NULL);
    glDrawArrays(GL_POINTS, 0, DS);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDisableClientState(GL_VERTEX_ARRAY);
    glDisableClientState(GL_TEXTURE_COORD_ARRAY);
    glDisable(GL_TEXTURE_2D);

    if (ref_file)
    {
        return;
    }

    // Finish timing before swap buffers to avoid refresh sync
    sdkStopTimer(&timer);
    glutSwapBuffers();

    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "Caffe Macchiato / Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps);
        glutSetWindowTitle(fps);
        fpsCount = 0;
        fpsLimit = (int)MAX(ifps, 1.f);
        sdkResetTimer(&timer);
    }

    glutPostRedisplay();

	pthread_mutex_unlock(&display_mutex);
}
void displayFunc(void)
{
    sdkStartTimer(&timer);
    TColor *d_dst = NULL;
    size_t num_bytes;

    if (frameCounter++ == 0)
    {
        sdkResetTimer(&timer);
    }

    // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, gl_PBO));
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    getLastCudaError("cudaGraphicsMapResources failed");
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource));
    getLastCudaError("cudaGraphicsResourceGetMappedPointer failed");

    checkCudaErrors(CUDA_Bind2TextureArray());

    runImageFilters(d_dst);

    checkCudaErrors(CUDA_UnbindTexture());
    // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO));
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    // Common display code path
    {
        glClear(GL_COLOR_BUFFER_BIT);

        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0));
        glBegin(GL_TRIANGLES);
        glTexCoord2f(0, 0);
        glVertex2f(-1, -1);
        glTexCoord2f(2, 0);
        glVertex2f(+3, -1);
        glTexCoord2f(0, 2);
        glVertex2f(-1, +3);
        glEnd();
        glFinish();
    }

    if (frameCounter == frameN)
    {
        frameCounter = 0;

        if (g_FPS)
        {
            printf("FPS: %3.1f\n", frameN / (sdkGetTimerValue(&timer) * 0.001));
            g_FPS = false;
        }
    }

    glutSwapBuffers();
    glutReportErrors();

    sdkStopTimer(&timer);

    computeFPS();
}
Example #9
0
void EyeDescriptor::mainHough(cv::Mat& dst) {
	StopWatchInterface *timer = NULL;
	sdkCreateTimer(&timer);
	sdkStartTimer(&timer);
	
	//pixels which should be considered
	std::vector<std::pair<int, int>> edgesIdx;
	for (int y = 0; y < height; y++)
		for (int x = 0; x < width; x++)
			if (dst.at<uchar>(y, x) == 255)
			{
				edgesIdx.push_back( std::pair<int, int>(x, y) );
			}
	
	int NPixelsEdges = (int) edgesIdx.size();
	for (int ipixel = 0; ipixel < NPixelsEdges; ++ipixel)
	{
		int x = edgesIdx[ipixel].first;
		int y = edgesIdx[ipixel].second;
		
		//gradient angle?
		int q_angle = localGradient_angles[x + y * width];
		
		//we check for each radius
		//	from rmin to rmax
		for (double r = rmin; r < rmax; r += rdelta)
		{
			int eps = 40;
			if (std::abs(r) < eps)
				continue;
			
			int ri = int(((r - rmin) / (rmax - rmin))*rstepnumb);
			if (ri == rstepnumb)
				ri = rstepnumb - 1; //small chance for drawing rmax and getting out of bounds
			
			int x0 = int(x - r*ci[q_angle] + 0.5);
			int y0 = int(y - r*si[q_angle] + 0.5);
			
			if (!(x0>=0 && x0 < width && y0>=0 && y0 < height))
				continue;
			
			int tmp = ++accummulator[x0 + y0*width + ri*height*width];
			
			if (tmp > houghmaxval){
				houghmaxval = tmp;
				x_maxval = x0;
				y_maxval = y0;
				r_maxval = int(std::abs(r)+0.5);
			}
		}
	}
	
	sdkStopTimer(&timer);
	float execution_time = sdkGetTimerValue(&timer);
	std::cout << "Main loop, TIME: " << execution_time << "[ms]" << std::endl;
}
Example #10
0
// display results using OpenGL
void display()
{
    sdkStartTimer(&timer);

    // execute filter, writing results to pbo
    unsigned int *dResult;

    //DEPRECATED: checkCudaErrors( cudaGLMapBufferObject((void**)&d_result, pbo) );
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dResult, &num_bytes, cuda_pbo_resource));
    bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);

    // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(pbo));
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    // Common display code path
    {
        glClear(GL_COLOR_BUFFER_BIT);

        // load texture from pbo
        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
        glBindTexture(GL_TEXTURE_2D, texid);
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

        // fragment program is required to display floating point texture
        glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader);
        glEnable(GL_FRAGMENT_PROGRAM_ARB);
        glDisable(GL_DEPTH_TEST);

        glBegin(GL_QUADS);
        {
            glTexCoord2f(0, 0);
            glVertex2f(0, 0);
            glTexCoord2f(1, 0);
            glVertex2f(1, 0);
            glTexCoord2f(1, 1);
            glVertex2f(1, 1);
            glTexCoord2f(0, 1);
            glVertex2f(0, 1);
        }
        glEnd();
        glBindTexture(GL_TEXTURE_TYPE, 0);
        glDisable(GL_FRAGMENT_PROGRAM_ARB);
    }

    glutSwapBuffers();
    glutReportErrors();

    sdkStopTimer(&timer);

    computeFPS();
}
Example #11
0
void run_cpu_color_test(PPM_IMG img_in)
{
    StopWatchInterface *timer=NULL;
    printf("Starting CPU processing...\n");

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_yuv_cpu = rgb2yuv(img_in); //Start RGB 2 YUV
    sdkStopTimer(&timer);
    printf("RGB to YUV conversion time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_rgb_cpu = yuv2rgb(img_obuf_yuv_cpu); //Start YUV 2 RGB
    sdkStopTimer(&timer);
    printf("YUV to RGB conversion time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);    

    write_yuv(img_obuf_yuv_cpu, "out_yuv.yuv");
    write_ppm(img_obuf_rgb_cpu, "out_rgb.ppm");
}
Example #12
0
void display(void)
{

    if (!ref_file)
    {
        sdkStartTimer(&timer);
        simulateFluids();
    }

    // render points from vertex buffer
    glClear(GL_COLOR_BUFFER_BIT);
    glColor4f(0,1,0,0.5f);
    glPointSize(1);
    glEnable(GL_POINT_SMOOTH);
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDisable(GL_DEPTH_TEST);
    glDisable(GL_CULL_FACE);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glVertexPointer(2, GL_FLOAT, 0, NULL);
    glDrawArrays(GL_POINTS, 0, DS);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDisableClientState(GL_VERTEX_ARRAY);
    glDisableClientState(GL_TEXTURE_COORD_ARRAY);
    glDisable(GL_TEXTURE_2D);

    if (ref_file)
    {
        return;
    }

    // Finish timing before swap buffers to avoid refresh sync
    sdkStopTimer(&timer);
    glutSwapBuffers();

    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "Cuda/GL Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps);
        glutSetWindowTitle(fps);
        fpsCount = 0;
        fpsLimit = (int)MAX(ifps, 1.f);
        sdkResetTimer(&timer);
    }

    glutPostRedisplay();
}
Example #13
0
void run_gpu_color_test(PPM_IMG img_in)
{
    StopWatchInterface *timer=NULL;
    launchEmptyKernel();    // lauch an empty kernel
    printf("Starting GPU processing...\n");

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_yuv_gpu = rgb2yuvGPU(img_in); //Start RGB 2 YUV
    sdkStopTimer(&timer);
    printf("RGB to YUV conversion time(GPU): %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_rgb_gpu = yuv2rgbGPU(img_obuf_yuv_gpu); //Start YUV 2 RGB
    sdkStopTimer(&timer);
    printf("YUV to RGB conversion time(GPU): %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);    

    write_ppm(img_obuf_rgb_gpu, "out_rgb.ppm");
    write_yuv(img_obuf_yuv_gpu, "out_yuv.yuv");
}
// This is the normal display path
void display(void)
{
    sdkStartTimer(&timer);

    // Sobel operation
    Pixel *data = NULL;

    // map PBO to get CUDA device pointer
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&data, &num_bytes,
                                                         cuda_pbo_resource));
    //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

    sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale);
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    glClear(GL_COLOR_BUFFER_BIT);

    glBindTexture(GL_TEXTURE_2D, texid);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight,
                    GL_LUMINANCE, GL_UNSIGNED_BYTE, OFFSET(0));
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

    glDisable(GL_DEPTH_TEST);
    glEnable(GL_TEXTURE_2D);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);

    glBegin(GL_QUADS);
    glVertex2f(0, 0);
    glTexCoord2f(0, 0);
    glVertex2f(0, 1);
    glTexCoord2f(1, 0);
    glVertex2f(1, 1);
    glTexCoord2f(1, 1);
    glVertex2f(1, 0);
    glTexCoord2f(0, 1);
    glEnd();
    glBindTexture(GL_TEXTURE_2D, 0);
    glutSwapBuffers();

    sdkStopTimer(&timer);

    computeFPS();
}
Example #15
0
////////////////////////////////////////////////////////////////////////////////
//! Display callback
////////////////////////////////////////////////////////////////////////////////
void display()
{
    sdkStartTimer(&timer);

    // run CUDA kernel to generate vertex positions
    runCuda(&cuda_vbo_resource);

	//簡易ライトセット
	glEnable(GL_LIGHTING);
	glEnable(GL_LIGHT0);
	glLightfv(GL_LIGHT0, GL_POSITION, gkLightPos);
	glLightfv(GL_LIGHT0, GL_DIFFUSE, gkLightDiff);
//	glLightfv(GL_LIGHT0, GL_AMBIENT, gkLightAmb);
	glEnable(GL_LIGHT1);
	glLightfv(GL_LIGHT1, GL_POSITION, gkLightPos2);
	glLightfv(GL_LIGHT1, GL_DIFFUSE, gkLightDiff2);

	//Zバッファ有効
	glEnable(GL_DEPTH_TEST);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

    // set view matrix
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    glTranslatef(0.0, 0.0, translate_z);
    glRotatef(rotate_x, 1.0, 0.0, 0.0);
    glRotatef(rotate_y, 0.0, 1.0, 0.0);

	// Earth
//	glMaterialfv(GL_FRONT, GL_DIFFUSE, gkMaterial);
	glutSolidSphere(50.0 * h_axis_radius, 20, 20);
	glDisable(GL_LIGHTING);

    // render from the vbo
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glVertexPointer(4, GL_FLOAT, 0, 0);

    glEnableClientState(GL_VERTEX_ARRAY);
    glColor3f(1.0, 1.0, 1.0);
    glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
    glDisableClientState(GL_VERTEX_ARRAY);

    glutSwapBuffers();

    g_fAnim += 0.01f;

    sdkStopTimer(&timer);
    computeFPS();
}
bool InfiniTAMApp::ProcessFrame(void)
{
	if (!mImageSource->hasMoreImages()) return false;
	mImageSource->getImages(inputRGBImage, inputRawDepthImage);

	if (mImuSource != NULL) {
		if (!mImuSource->hasMoreMeasurements()) return false;
		else mImuSource->getMeasurement(inputIMUMeasurement);
	}

	sdkResetTimer(&timer_instant);
	sdkStartTimer(&timer_instant); sdkStartTimer(&timer_average);

	//actual processing on the mainEngine
	if (mImuSource != NULL) mMainEngine->ProcessFrame(inputRGBImage, inputRawDepthImage, inputIMUMeasurement);
	else mMainEngine->ProcessFrame(inputRGBImage, inputRawDepthImage);

	ITMSafeCall(cudaDeviceSynchronize());
	sdkStopTimer(&timer_instant); sdkStopTimer(&timer_average);

	__android_log_print(ANDROID_LOG_VERBOSE, "InfiniTAM", "Process Frame finished: %f %f", sdkGetTimerValue(&timer_instant), sdkGetAverageTimerValue(&timer_average));

	return true;
}
Example #17
0
        void _runBenchmark(int iterations)
        {
            // once without timing to prime the device
            if (!useCpu)
            {
                m_nbody->update(activeParams.m_timestep);
            }

            if (useCpu)
            {
                sdkCreateTimer(&timer);
                sdkStartTimer(&timer);
            }
            else
            {
                checkCudaErrors(cudaEventRecord(startEvent, 0));
            }

            for (int i = 0; i < iterations; ++i)
            {
                m_nbody->update(activeParams.m_timestep);
            }

            float milliseconds = 0;

            if (useCpu)
            {
                sdkStopTimer(&timer);
                milliseconds = sdkGetTimerValue(&timer);
                sdkStartTimer(&timer);
            }
            else
            {
                checkCudaErrors(cudaEventRecord(stopEvent, 0));
                checkCudaErrors(cudaEventSynchronize(stopEvent));
                checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));
            }

            double interactionsPerSecond = 0;
            double gflops = 0;
            computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations);

            printf("%d bodies, total time for %d iterations: %.3f ms, mean %f\n",
                   numBodies, iterations, milliseconds, milliseconds/iterations);
            printf("= %.3f billion interactions per second\n", interactionsPerSecond);
            printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops,
                   (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction);
        }
Example #18
0
void runBenchmark(int iterations, char *exec_path)
{
    printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations);
    cudaDeviceSynchronize();
    sdkStartTimer(&timer);

    for (int i = 0; i < iterations; ++i)
    {
        psystem->update(timestep);
    }

    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    float fAvgSeconds = ((float)1.0e-3 * (float)sdkGetTimerValue(&timer)/(float)iterations);

    printf("particles, Throughput = %.4f KParticles/s, Time = %.5f s, Size = %u particles, NumDevsUsed = %u, Workgroup = %u\n",
           (1.0e-3 * numParticles)/fAvgSeconds, fAvgSeconds, numParticles, 1, 0);
}
bool
runSingleTest(const char *ref_file, const char *exec_path)
{
    // allocate memory for result
    int nTotalErrors = 0;
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    checkCudaErrors(cudaMalloc((void **) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStartTimer(&timer);

    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("Kernel execution failed");
    sdkStopTimer(&timer);

    unsigned char *h_result = (unsigned char *)malloc(width*height*4);
    checkCudaErrors(cudaMemcpy(h_result, d_result, width*height*4, cudaMemcpyDeviceToHost));

    char dump_file[1024];
    sprintf(dump_file, "lena_%02d.ppm", (int)sigma);
    sdkSavePPM4ub(dump_file, h_result, width, height);

    if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false))
    {
        nTotalErrors++;
    }

    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width*height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);

    checkCudaErrors(cudaFree(d_result));
    free(h_result);

    printf("Summary: %d errors!\n", nTotalErrors);

    printf(nTotalErrors == 0 ? "Test passed\n": "Test failed!\n");
    return (nTotalErrors == 0);
}
Example #20
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
////////////////////////////////////////////////////////////////////////////////
int runBenchmark(int argc, char **argv)
{
    printf("[runBenchmark]: [%s]\n", sSDKsample);

    loadImageData(argc, argv);
    initCuda();

    unsigned int *dResult;
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));
    sdkStartTimer(&kernel_timer);

    // warm-up
    bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);
    checkCudaErrors(cudaDeviceSynchronize());

    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    printf("\nRunning BilateralFilterGPU for %d cycles...\n\n", iCycles);

    for (int i = 0; i < iCycles; i++)
    {
        dProcessingTime += bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);
    }

    // check if kernel execution generated an error and sync host
    getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&kernel_timer);

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    printf("bilateralFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u\n",
           (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1);
    printf("\n");

    return 0;
}
// display results using OpenGL
void display()
{
    sdkStartTimer(&timer);

    // execute filter, writing results to pbo
    unsigned int *d_result;
    checkCudaErrors(cudaGLMapBufferObject((void **)&d_result, pbo));
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    checkCudaErrors(cudaGLUnmapBufferObject(pbo));

    // load texture from pbo
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glBindTexture(GL_TEXTURE_2D, texid);
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    // display results
    glClear(GL_COLOR_BUFFER_BIT);

    glEnable(GL_TEXTURE_2D);
    glDisable(GL_DEPTH_TEST);

    glBegin(GL_QUADS);
    glTexCoord2f(0, 1);
    glVertex2f(0, 0);
    glTexCoord2f(1, 1);
    glVertex2f(1, 0);
    glTexCoord2f(1, 0);
    glVertex2f(1, 1);
    glTexCoord2f(0, 0);
    glVertex2f(0, 1);
    glEnd();

    glDisable(GL_TEXTURE_2D);
    glutSwapBuffers();

    sdkStopTimer(&timer);

    computeFPS();
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
////////////////////////////////////////////////////////////////////////////////
int runBenchmark()
{
    printf("[runBenchmark]: [%s]\n", sSDKsample);

    initCuda(true);

    unsigned int *d_result;
    checkCudaErrors(cudaMalloc((void **)&d_result, width*height*sizeof(unsigned int)));

    // warm-up
    boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads, kernel_timer);
    checkCudaErrors(cudaDeviceSynchronize());

    sdkStartTimer(&kernel_timer);
    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    printf("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles);

    for (int i = 0; i < iCycles; i++)
    {
        dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads, kernel_timer);
    }

    // check if kernel execution generated an error and sync host
    getLastCudaError("Error: boxFilterRGBA Kernel execution FAILED");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&kernel_timer);

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    printf("boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n",
           (1.0e-6 * width * height)/dProcessingTime, dProcessingTime,
           (width * height), 1, nthreads);
    printf("\n");

    return 0;
}
Example #23
0
// display results using OpenGL (called by GLUT)
void display()
{
    sdkStartTimer(&timer);

    render();

    // display results
    glClear(GL_COLOR_BUFFER_BIT);

    // draw image from PBO
    glDisable(GL_DEPTH_TEST);
    glRasterPos2i(0, 0);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    glutSwapBuffers();
    glutReportErrors();

    sdkStopTimer(&timer);
    computeFPS();
}
Example #24
0
File: main.cpp Project: hoopoe/cmag
void display()
{
    //cutilCheckError(cutStartTimer(timer));
    sdkStartTimer(&timer);
	if(IsFirstTime){
		IsFirstTime = false;
		psystem->update(); 
		if (renderer) 
			renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles());
	}
	if (!bPause){
		psystem->update(); 
		if (renderer) 
			renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles());
	}

	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);  
	glMatrixMode(GL_MODELVIEW);
	glLoadIdentity();
	for (int c = 0; c < 3; ++c)	{
		camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia;
		camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia;
	}
	glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]);
	glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0);
	glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0);

	glGetFloatv(GL_MODELVIEW_MATRIX, modelView);

	glColor3f(0.0, 0.0, 0.0);
	//glutWireCube(2.0);

	if (renderer) renderer->display();
    //cutilCheckError(cutStopTimer(timer));
    sdkStopTimer(&timer);
	glutSwapBuffers();
	glutReportErrors();
	computeFPS();
}
Example #25
0
static CUT_THREADPROC solverThread(TOptionPlan *plan)
{
    //Init GPU
    checkCudaErrors(cudaSetDevice(plan->device));

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device));

    //Start the timer
    sdkStartTimer(&hTimer[plan->device]);

    // Allocate intermediate memory for MC integrator and initialize
    // RNG states
    initMonteCarloGPU(plan);

    // Main commputation
    MonteCarloGPU(plan);

    checkCudaErrors(cudaDeviceSynchronize());

    //Stop the timer
    sdkStopTimer(&hTimer[plan->device]);

    //Shut down this GPU
    closeMonteCarloGPU(plan);

    cudaStreamSynchronize(0);

    printf("solverThread() finished - GPU Device %d: %s\n", plan->device, deviceProp.name);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    CUT_THREADEND;
}
Example #26
0
void runBenchmark(int iterations, char *exec_path)
{
    int file_count=0, iterationsPerFrame = (int)(1.0/(30.0*timestep));

    printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations);
    //abb58: 1. what are you trying to sync???
    cudaDeviceSynchronize();
    sdkStartTimer(&timer);

    for (int i = 0; i < iterations; ++i) {
        psystem->update(timestep);

        if (i % iterationsPerFrame == 0) {
            psystem->writeParticles(fpout, 0, numParticles, file_count);
            //psystem->dumpParticles(0, numParticles, file_count);
            file_count++;
        }
    }

    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    float fAvgSeconds = ((float)1.0e-3 * (float)sdkGetTimerValue(&timer)/(float)iterations);
}
// display results using OpenGL (called by GLUT)
void display()
{
    sdkStartTimer(&timer);

    // map PBO to get CUDA device pointer
    uchar4 *d_output;
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes,
                    cuda_pbo_resource));
    render(imageWidth, imageHeight, tx, ty, scale, cx, cy,
           blockSize, gridSize, g_FilterMode, d_output);

    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    // Common display path
    {
        // display results
        glClear(GL_COLOR_BUFFER_BIT);

#if USE_BUFFER_TEX
        // display using buffer texture
        glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex);
        glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, fprog);
        glEnable(GL_FRAGMENT_PROGRAM_ARB);
        glProgramLocalParameterI4iNV(GL_FRAGMENT_PROGRAM_ARB, 0, width, 0, 0, 0);
#else
        // download image from PBO to OpenGL texture
        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
        glBindTexture(GL_TEXTURE_TYPE, displayTex);
        glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
        glTexSubImage2D(GL_TEXTURE_TYPE,
                        0, 0, 0, width, height, GL_BGRA, GL_UNSIGNED_BYTE, 0);
        glEnable(GL_TEXTURE_TYPE);
#endif

        // draw textured quad
        glDisable(GL_DEPTH_TEST);
        glBegin(GL_QUADS);
        glTexCoord2f(0.0f          , (GLfloat)height);
        glVertex2f(0.0f, 0.0f);
        glTexCoord2f((GLfloat)width, (GLfloat)height);
        glVertex2f(1.0f, 0.0f);
        glTexCoord2f((GLfloat)width, 0.0f);
        glVertex2f(1.0f, 1.0f);
        glTexCoord2f(0.0f          , 0.0f);
        glVertex2f(0.0f, 1.0f);
        glEnd();
        glDisable(GL_TEXTURE_TYPE);
        glDisable(GL_FRAGMENT_PROGRAM_ARB);

        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

        if (drawCurves)
        {
            // draw spline curves
            glPushMatrix();
            glScalef(0.25, 0.25, 1.0);

            glTranslatef(0.0, 2.0, 0.0);
            glColor3f(1.0, 0.0, 0.0);
            plotCurve(bspline_w3);

            glTranslatef(1.0, 0.0, 0.0);
            glColor3f(0.0, 1.0, 0.0);
            plotCurve(bspline_w2);

            glTranslatef(1.0, 0.0, 0.0);
            glColor3f(0.0, 0.0, 1.0);
            plotCurve(bspline_w1);

            glTranslatef(1.0, 0.0, 0.0);
            glColor3f(1.0, 0.0, 1.0);
            plotCurve(bspline_w0);

            glPopMatrix();
            glColor3f(1.0, 1.0, 1.0);
        }
    }

    glutSwapBuffers();
    glutReportErrors();

    sdkStopTimer(&timer);

    computeFPS();
}
Example #28
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    StopWatchInterface *hTimer = NULL;

    const uint   N = 4 * 1048576;
    const uint DIR = 1;
    const uint numValues = 65536;

    printf("%s Starting...\n\n", argv[0]);

    int dev = findCudaDevice(argc, (const char **) argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_SrcKey = (uint *)malloc(N * sizeof(uint));
    h_SrcVal = (uint *)malloc(N * sizeof(uint));
    h_DstKey = (uint *)malloc(N * sizeof(uint));
    h_DstVal = (uint *)malloc(N * sizeof(uint));

    srand(2009);

    for (uint i = 0; i < N; i++)
    {
        h_SrcKey[i] = rand() % numValues;
    }

    fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
    checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing GPU merge sort...\n");
    initMergeSort();

    printf("Running GPU merge sort...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    mergeSort(
        d_DstKey,
        d_DstVal,
        d_BufKey,
        d_BufVal,
        d_SrcKey,
        d_SrcVal,
        N,
        DIR
    );
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

    printf("Inspecting the results...\n");
    uint keysFlag = validateSortedKeys(
                        h_DstKey,
                        h_SrcKey,
                        1,
                        N,
                        numValues,
                        DIR
                    );

    uint valuesFlag = validateSortedValues(
                          h_DstKey,
                          h_DstVal,
                          h_SrcKey,
                          1,
                          N
                      );

    printf("Shutting down...\n");
    closeMergeSort();
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_SrcVal));
    checkCudaErrors(cudaFree(d_SrcKey));
    checkCudaErrors(cudaFree(d_BufVal));
    checkCudaErrors(cudaFree(d_BufKey));
    checkCudaErrors(cudaFree(d_DstVal));
    checkCudaErrors(cudaFree(d_DstKey));
    free(h_DstVal);
    free(h_DstKey);
    free(h_SrcVal);
    free(h_SrcKey);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}
TEST(RMDCuTests, seedMatrixInit)
{
  const boost::filesystem::path dataset_path("../test_data");
  const boost::filesystem::path sequence_file_path("../test_data/first_200_frames_traj_over_table_input_sequence.txt");

  rmd::PinholeCamera cam(481.2f, -480.0f, 319.5f, 239.5f);

  rmd::test::Dataset dataset(dataset_path.string(), sequence_file_path.string(), cam);
  if (!dataset.readDataSequence())
    FAIL() << "could not read dataset";

  const size_t ref_ind = 1;
  const size_t curr_ind = 20;

  const auto ref_entry = dataset(ref_ind);
  cv::Mat ref_img;
  dataset.readImage(ref_img, ref_entry);
  cv::Mat ref_img_flt;
  ref_img.convertTo(ref_img_flt, CV_32F, 1.0f/255.0f);

  cv::Mat ref_depthmap;
  dataset.readDepthmap(ref_depthmap, ref_entry, ref_img.cols, ref_img.rows);

  rmd::SE3<float> T_world_ref;
  dataset.readCameraPose(T_world_ref, ref_entry);

  const auto curr_entry = dataset(curr_ind);
  cv::Mat curr_img;
  dataset.readImage(curr_img, curr_entry);
  cv::Mat curr_img_flt;
  curr_img.convertTo(curr_img_flt, CV_32F, 1.0f/255.0f);

  rmd::SE3<float> T_world_curr;
  dataset.readCameraPose(T_world_curr, curr_entry);

  const float min_scene_depth = 0.4f;
  const float max_scene_depth = 1.8f;

  rmd::SeedMatrix seeds(ref_img.cols, ref_img.rows, cam);

  StopWatchInterface * timer = NULL;
  sdkCreateTimer(&timer);
  sdkResetTimer(&timer);
  sdkStartTimer(&timer);

  seeds.setReferenceImage(
        reinterpret_cast<float*>(ref_img_flt.data),
        T_world_ref.inv(),
        min_scene_depth,
        max_scene_depth);

  sdkStopTimer(&timer);
  double t = sdkGetAverageTimerValue(&timer) / 1000.0;
  printf("setReference image CUDA execution time: %f seconds.\n", t);

  cv::Mat initial_depthmap(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadDepthmap(reinterpret_cast<float*>(initial_depthmap.data));

  cv::Mat initial_sigma_sq(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadSigmaSq(reinterpret_cast<float*>(initial_sigma_sq.data));

  cv::Mat initial_a(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadA(reinterpret_cast<float*>(initial_a.data));

  cv::Mat initial_b(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadB(reinterpret_cast<float*>(initial_b.data));

  const float avg_scene_depth = (min_scene_depth+max_scene_depth)/2.0f;
  const float max_scene_sigma_sq = (max_scene_depth - min_scene_depth) * (max_scene_depth - min_scene_depth) / 36.0f;
  for(size_t r=0; r<ref_img.rows; ++r)
  {
    for(size_t c=0; c<ref_img.cols; ++c)
    {
      ASSERT_FLOAT_EQ(avg_scene_depth, initial_depthmap.at<float>(r, c));
      ASSERT_FLOAT_EQ(max_scene_sigma_sq, initial_sigma_sq.at<float>(r, c));
      ASSERT_FLOAT_EQ(10.0f, initial_a.at<float>(r, c));
      ASSERT_FLOAT_EQ(10.0f, initial_b.at<float>(r, c));
    }
  }

  // Test initialization of NCC template statistics

  // CUDA computation
  cv::Mat cu_sum_templ(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadSumTempl(reinterpret_cast<float*>(cu_sum_templ.data));
  cv::Mat cu_const_templ_denom(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadConstTemplDenom(reinterpret_cast<float*>(cu_const_templ_denom.data));

  // Host computation
  cv::Mat ocv_sum_templ(ref_img.rows, ref_img.cols, CV_32FC1);
  cv::Mat ocv_const_templ_denom(ref_img.rows, ref_img.cols, CV_32FC1);

  const int side = seeds.getPatchSide();
  for(size_t y=side; y<ref_img.rows-side/2; ++y)
  {
    for(size_t x=side; x<ref_img.cols-side/2; ++x)
    {
      double sum_templ    = 0.0f;
      double sum_templ_sq = 0.0f;
      for(int patch_y=0; patch_y<side; ++patch_y)
      {
        for(int patch_x=0; patch_x<side; ++patch_x)
        {
          const double templ = (double) ref_img_flt.at<float>( y-side/2+patch_y, x-side/2+patch_x );
          sum_templ += templ;
          sum_templ_sq += templ*templ;
        }
      }
      ocv_sum_templ.at<float>(y, x) = (float) sum_templ;
      ocv_const_templ_denom.at<float>(y, x) = (float) ( ((double)(side*side))*sum_templ_sq - sum_templ*sum_templ );
    }
  }
  for(size_t r=side; r<ref_img.rows-side/2; ++r)
  {
    for(size_t c=side; c<ref_img.cols-side/2; ++c)
    {
      ASSERT_NEAR(ocv_sum_templ.at<float>(r, c), cu_sum_templ.at<float>(r, c), 0.00001f);
      ASSERT_NEAR(ocv_const_templ_denom.at<float>(r, c), cu_const_templ_denom.at<float>(r, c), 0.001f);
    }
  }

}
TEST(RMDCuTests, seedMatrixCheck)
{
  const boost::filesystem::path dataset_path("../test_data");
  const boost::filesystem::path sequence_file_path("../test_data/first_200_frames_traj_over_table_input_sequence.txt");

  rmd::PinholeCamera cam(481.2f, -480.0f, 319.5f, 239.5f);

  rmd::test::Dataset dataset(dataset_path.string(), sequence_file_path.string(), cam);
  if (!dataset.readDataSequence())
    FAIL() << "could not read dataset";

  const size_t ref_ind = 1;
  const size_t curr_ind = 20;

  const auto ref_entry = dataset(ref_ind);
  cv::Mat ref_img;
  dataset.readImage(ref_img, ref_entry);
  cv::Mat ref_img_flt;
  ref_img.convertTo(ref_img_flt, CV_32F, 1.0f/255.0f);

  cv::Mat ref_depthmap;
  dataset.readDepthmap(ref_depthmap, ref_entry, ref_img.cols, ref_img.rows);

  rmd::SE3<float> T_world_ref;
  dataset.readCameraPose(T_world_ref, ref_entry);

  const auto curr_entry = dataset(curr_ind);
  cv::Mat curr_img;
  dataset.readImage(curr_img, curr_entry);
  cv::Mat curr_img_flt;
  curr_img.convertTo(curr_img_flt, CV_32F, 1.0f/255.0f);

  rmd::SE3<float> T_world_curr;
  dataset.readCameraPose(T_world_curr, curr_entry);

  const float min_scene_depth = 0.4f;
  const float max_scene_depth = 1.8f;

  rmd::SeedMatrix seeds(ref_img.cols, ref_img.rows, cam);

  seeds.setReferenceImage(
        reinterpret_cast<float*>(ref_img_flt.data),
        T_world_ref.inv(),
        min_scene_depth,
        max_scene_depth);

  StopWatchInterface * timer = NULL;
  sdkCreateTimer(&timer);
  sdkResetTimer(&timer);
  sdkStartTimer(&timer);

  seeds.update(
        reinterpret_cast<float*>(ref_img_flt.data),
        T_world_curr.inv());

  sdkStopTimer(&timer);
  double t = sdkGetAverageTimerValue(&timer) / 1000.0;
  printf("update CUDA execution time: %f seconds.\n", t);

  cv::Mat cu_convergence(ref_img.rows, ref_img.cols, CV_32SC1);
  seeds.downloadConvergence(reinterpret_cast<int*>(cu_convergence.data));

  const int side = seeds.getPatchSide();
  for(size_t r=0; r<ref_img.rows; ++r)
  {
    for(size_t c=0; c<ref_img.cols; ++c)
    {
      if(r>ref_img.rows-side-1
         || r<side
         || c>ref_img.cols-side-1
         || c<side)
      {
        ASSERT_EQ(rmd::ConvergenceStates::BORDER, cu_convergence.at<int>(r, c)) << "(r, c) = (" << r << ", " << c <<")";
      }
      else
      {
        const int result = cu_convergence.at<int>(r, c);
        const bool success = (result == rmd::ConvergenceStates::UPDATE      ||
                              result == rmd::ConvergenceStates::DIVERGED    ||
                              result == rmd::ConvergenceStates::CONVERGED   ||
                              result == rmd::ConvergenceStates::NOT_VISIBLE ||
                              result == rmd::ConvergenceStates::NO_MATCH    );
        ASSERT_EQ(true, success) << "(r, c) = (" << r << ", " << c <<")";
      }
    }
  }

}