Exemple #1
0
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
		char meshInf[256];
		float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
		std::string f;
		
		if (psystem->getFriction())
			sprintf(fps, "CUDA Particles (%d particles): %3.1f fps with friction", (int)psystem->getNumParticles(), ifps );
		else
			sprintf(fps, "CUDA Particles (%d particles): %3.1f fps without friction", (int)psystem->getNumParticles(), ifps);
		if (displayMode == ParticleRenderer::MESH_MODE)
		{
			sprintf(meshInf, " Face:%d Vertex:%d ", psystem->getNumFaces(), psystem->getNumVertexes());
			strcat(fps, meshInf);
		}

        glutSetWindowTitle(fps);
        fpsCount = 0;

        fpsLimit = (int)MAX(ifps, 1.f);
        sdkResetTimer(&timer);
    }
}
void siTest(T *d_ptclA, T *d_ptclA_new, T *d_wghtA, unsigned int size, int stateDim)
{
    int blocks, threads;
    float elapsedTimeInMs = 0.0f;
    threads = BLOCK_SIZE;
    blocks = (size + threads - 1) / threads;
#ifdef NVS
    while (blocks > GRID_LIMIT){
        blocks >>= 1;
        threads <<= 1;
    }
#endif
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

    for (int i = 0 ; i < TEST_ITERATIONS ; i ++){
        cudaDeviceSynchronize();
        sdkStartTimer(&timer);

        SI<T>(blocks, threads, d_ptclA, d_ptclA_new, d_wghtA, size, stateDim);

        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timer);
    }
    elapsedTimeInMs = sdkGetAverageTimerValue(&timer);
    printf("%f\t", elapsedTimeInMs);
    printf("size=%u, stateDim=%d, blocks=%d, threads=%d\n",size, stateDim, blocks, threads);

    sdkDeleteTimer(&timer);
}
void display(void)
{
	pthread_mutex_lock(&display_mutex);

    if (!ref_file)
    {
        sdkStartTimer(&timer);
        simulateFluids();
    }

    // render points from vertex buffer
    glClear(GL_COLOR_BUFFER_BIT);
    glClearColor(1.f/256*172, 1.f/256*101, 1.f/256*4, 0.f);
    glColor4f(1.f, 1.f, 1.f, 0.5f);
    glPointSize(1);
    glEnable(GL_POINT_SMOOTH);
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDisable(GL_DEPTH_TEST);
    glDisable(GL_CULL_FACE);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
#ifdef OPTIMUS
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);
#endif
    glVertexPointer(2, GL_FLOAT, 0, NULL);
    glDrawArrays(GL_POINTS, 0, DS);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDisableClientState(GL_VERTEX_ARRAY);
    glDisableClientState(GL_TEXTURE_COORD_ARRAY);
    glDisable(GL_TEXTURE_2D);

    if (ref_file)
    {
        return;
    }

    // Finish timing before swap buffers to avoid refresh sync
    sdkStopTimer(&timer);
    glutSwapBuffers();

    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "Caffe Macchiato / Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps);
        glutSetWindowTitle(fps);
        fpsCount = 0;
        fpsLimit = (int)MAX(ifps, 1.f);
        sdkResetTimer(&timer);
    }

    glutPostRedisplay();

	pthread_mutex_unlock(&display_mutex);
}
Exemple #4
0
void display(void)
{

    if (!ref_file)
    {
        sdkStartTimer(&timer);
        simulateFluids();
    }

    // render points from vertex buffer
    glClear(GL_COLOR_BUFFER_BIT);
    glColor4f(0,1,0,0.5f);
    glPointSize(1);
    glEnable(GL_POINT_SMOOTH);
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDisable(GL_DEPTH_TEST);
    glDisable(GL_CULL_FACE);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glVertexPointer(2, GL_FLOAT, 0, NULL);
    glDrawArrays(GL_POINTS, 0, DS);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDisableClientState(GL_VERTEX_ARRAY);
    glDisableClientState(GL_TEXTURE_COORD_ARRAY);
    glDisable(GL_TEXTURE_2D);

    if (ref_file)
    {
        return;
    }

    // Finish timing before swap buffers to avoid refresh sync
    sdkStopTimer(&timer);
    glutSwapBuffers();

    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "Cuda/GL Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps);
        glutSetWindowTitle(fps);
        fpsCount = 0;
        fpsLimit = (int)MAX(ifps, 1.f);
        sdkResetTimer(&timer);
    }

    glutPostRedisplay();
}
Exemple #5
0
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&hTimer) / 1000.f);
        sprintf(fps, "<CUDA %s Set> %3.1f fps", g_isJuliaSet ? "Julia" : "Mandelbrot", ifps);
        glutSetWindowTitle(fps);
        fpsCount = 0;

        fpsLimit = MAX(1.f, (float)ifps);
        sdkResetTimer(&hTimer);
    }
}
Exemple #6
0
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit) {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer)  / 1000.f);
        sprintf(fps, "Dark Matter PBVR: %3.1f fps (Max 100Hz)", ifps);

        glutSetWindowTitle(fps);
        fpsCount = 0; 
  	    fpsLimit = (int)MAX(ifps, 1.f);

        sdkResetTimer(&timer);
    }
 }
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "CUDA Edge Detection (%s): %3.1f fps", filterMode[g_SobelDisplayMode], ifps);

        glutSetWindowTitle(fps);
        fpsCount = 0;

        sdkResetTimer(&timer);
    }
}
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "%s (sigma=%4.2f): %3.1f fps", sSDKsample, sigma, ifps);

        glutSetWindowTitle(fps);
        fpsCount = 0;

        fpsLimit = ftoi(MAX(ifps, 1.f));
        sdkResetTimer(&timer);
    }
}
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "<%s>: %3.1f fps", filterMode[g_Kernel], ifps);

        glutSetWindowTitle(fps);
        fpsCount = 0;

        //fpsLimit = (int)MAX(ifps, 1.f);
        sdkResetTimer(&timer);
    }
}
Exemple #10
0
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        fpsCount = 0;
        fpsLimit = (int)MAX(avgFPS, 1.f);

        sdkResetTimer(&timer);
    }

    char fps[256];
    sprintf(fps, "MPI Cuda GL Interop (VBO): %3.1f fps (Max 100Hz)", avgFPS);
    glutSetWindowTitle(fps);
}
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "CUDA Particles (%d particles): %3.1f fps", numParticles, ifps);

        glutSetWindowTitle(fps);
        fpsCount = 0;

        fpsLimit = (int)MAX(ifps, 1.f);
        sdkResetTimer(&timer);
    }
}
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "CUDA 3D Volume Filtering: %3.1f fps", ifps);

        glutSetWindowTitle(fps);
        fpsCount = 0;

        fpsLimit = MAX(1.f, ifps);
        sdkResetTimer(&timer);
    }
}
Exemple #13
0
void computeFPS()
{
	frameCount++;
	fpsCount++;

	if (fpsCount == fpsLimit) {
		char fps[256];
        //float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f);
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);

		sprintf(fps, "Dam Break (%d particles): %3.1f fps; elapsed Time: %f",
			psystem->getNumParticles(), ifps, psystem->getElapsedTime()); 
		glutSetWindowTitle(fps);
		fpsCount = 0;         

        //cutilCheckError(cutResetTimer(timer));
        sdkResetTimer(&timer);
	}
}
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit-1)
    {
        g_Verify = true;
    }

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
        sprintf(fps, "%s %s <%s>: %3.1f fps", "", sSDKsample, sFilterMode[g_FilterMode], ifps);

        glutSetWindowTitle(fps);
        fpsCount = 0;

        sdkResetTimer(&timer);
    }
}
// Calculate the Frames per second and print in the title bar
void computeFPS()
{
    frameCount++;
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        avgFPS = 1.0f / (sdkGetAverageTimerValue(&timer) / 1000.0f);
        fpsCount = 0;
        fpsLimit = (int)MAX(avgFPS, 1.0f);
        sdkResetTimer(&timer);
    }

    char fps[256];
    sprintf(fps, "CUDA Rolling Box Filter <Animation=%s> (radius=%d, passes=%d): %3.1f fps",
            (!g_bInteractive ? "ON" : "OFF"), filter_radius, iterations, avgFPS);
    glutSetWindowTitle(fps);

    if (!g_bInteractive)
    {
        varySigma();
    }
}
void computeFPS()
{
    fpsCount++;

    if (fpsCount == fpsLimit)
    {
        char fps[256];
        float ifps = 1.0f / (sdkGetAverageTimerValue(&timer) / 1000.0f);
        sprintf(fps, "CUDA Bilateral Filter: %3.f fps (radius=%d, iter=%d, euclidean=%.2f, gaussian=%.2f)",
                ifps, filter_radius, iterations, (double)euclidean_delta, (double)gaussian_delta);

        glutSetWindowTitle(fps);
        fpsCount = 0;
        fpsLimit = (int)MAX(ifps, 1.0f);

        sdkResetTimer(&timer);
    }

    if (!g_bInteractive)
    {
        varyEuclidean();
    }
}
Exemple #17
0
void computeFPS(HWND hWnd, bool bUseInterop)
{
    sdkStopTimer(&frame_timer);

    if (g_bRunning)
    {
        g_fpsCount++;

        if (!g_pFrameQueue->isEndOfDecode())
        {
            g_FrameCount++;
        }
    }

    char sFPS[256];
    std::string sDecodeStatus;

    if (g_bDeviceLost)
    {
        sDecodeStatus = "DeviceLost!\0";
        sprintf(sFPS, "%s [%s] - [%s %d]",
                sAppName, sDecodeStatus.c_str(),
                (g_bIsProgressive ? "Frame" : "Field"),
                g_DecodeFrameCount);

        if (bUseInterop && (!g_bQAReadback))
        {
            SetWindowText(hWnd, sFPS);
            UpdateWindow(hWnd);
        }

        sdkResetTimer(&frame_timer);
        g_fpsCount = 0;
        return;
    }

    if (g_pFrameQueue->isEndOfDecode())
    {
        sDecodeStatus = "STOP (End of File)\0";

        // we only want to record this once
        if (total_time == 0.0f)
        {
            total_time = sdkGetTimerValue(&global_timer);
        }

        sdkStopTimer(&global_timer);

        if (g_bAutoQuit)
        {
            g_bRunning = false;
            g_bDone    = true;
        }
    }
    else
    {
        if (!g_bRunning)
        {
            sDecodeStatus = "PAUSE\0";
            sprintf(sFPS, "%s [%s] - [%s %d] - Video Display %s / Vsync %s",
                    sAppName, sDecodeStatus.c_str(),
                    (g_bIsProgressive ? "Frame" : "Field"), g_DecodeFrameCount,
                    g_bUseDisplay ? "ON" : "OFF",
                    g_bUseVsync   ? "ON" : "OFF");

            if (bUseInterop && (!g_bQAReadback))
            {
                SetWindowText(hWnd, sFPS);
                UpdateWindow(hWnd);
            }
        }
        else
        {
            if (g_bFrameStep)
            {
                sDecodeStatus = "STEP\0";
            }
            else
            {
                sDecodeStatus = "PLAY\0";
            }
        }

        if (g_fpsCount == g_fpsLimit)
        {
            float ifps = 1.f / (sdkGetAverageTimerValue(&frame_timer) / 1000.f);

            sprintf(sFPS, "[%s] [%s] - [%3.1f fps, %s %d] - Video Display %s / Vsync %s",
                    sAppName, sDecodeStatus.c_str(), ifps,
                    (g_bIsProgressive ? "Frame" : "Field"), g_DecodeFrameCount,
                    g_bUseDisplay ? "ON" : "OFF",
                    g_bUseVsync   ? "ON" : "OFF");

            if (bUseInterop && (!g_bQAReadback))
            {
                SetWindowText(hWnd, sFPS);
                UpdateWindow(hWnd);
            }

            printf("[%s] - [%s: %04d, %04.1f fps, time: %04.2f (ms) ]\n",
                   sSDKname, (g_bIsProgressive ? "Frame" : "Field"), g_FrameCount, ifps, 1000.f/ifps);

            sdkResetTimer(&frame_timer);
            g_fpsCount = 0;
        }
    }

    sdkStartTimer(&frame_timer);
}
bool InfiniTAMApp::ProcessFrame(void)
{
	if (!mImageSource->hasMoreImages()) return false;
	mImageSource->getImages(inputRGBImage, inputRawDepthImage);

	if (mImuSource != NULL) {
		if (!mImuSource->hasMoreMeasurements()) return false;
		else mImuSource->getMeasurement(inputIMUMeasurement);
	}

	sdkResetTimer(&timer_instant);
	sdkStartTimer(&timer_instant); sdkStartTimer(&timer_average);

	//actual processing on the mainEngine
	if (mImuSource != NULL) mMainEngine->ProcessFrame(inputRGBImage, inputRawDepthImage, inputIMUMeasurement);
	else mMainEngine->ProcessFrame(inputRGBImage, inputRawDepthImage);

	ITMSafeCall(cudaDeviceSynchronize());
	sdkStopTimer(&timer_instant); sdkStopTimer(&timer_average);

	__android_log_print(ANDROID_LOG_VERBOSE, "InfiniTAM", "Process Frame finished: %f %f", sdkGetTimerValue(&timer_instant), sdkGetAverageTimerValue(&timer_average));

	return true;
}
TEST(RMDCuTests, seedMatrixInit)
{
  const boost::filesystem::path dataset_path("../test_data");
  const boost::filesystem::path sequence_file_path("../test_data/first_200_frames_traj_over_table_input_sequence.txt");

  rmd::PinholeCamera cam(481.2f, -480.0f, 319.5f, 239.5f);

  rmd::test::Dataset dataset(dataset_path.string(), sequence_file_path.string(), cam);
  if (!dataset.readDataSequence())
    FAIL() << "could not read dataset";

  const size_t ref_ind = 1;
  const size_t curr_ind = 20;

  const auto ref_entry = dataset(ref_ind);
  cv::Mat ref_img;
  dataset.readImage(ref_img, ref_entry);
  cv::Mat ref_img_flt;
  ref_img.convertTo(ref_img_flt, CV_32F, 1.0f/255.0f);

  cv::Mat ref_depthmap;
  dataset.readDepthmap(ref_depthmap, ref_entry, ref_img.cols, ref_img.rows);

  rmd::SE3<float> T_world_ref;
  dataset.readCameraPose(T_world_ref, ref_entry);

  const auto curr_entry = dataset(curr_ind);
  cv::Mat curr_img;
  dataset.readImage(curr_img, curr_entry);
  cv::Mat curr_img_flt;
  curr_img.convertTo(curr_img_flt, CV_32F, 1.0f/255.0f);

  rmd::SE3<float> T_world_curr;
  dataset.readCameraPose(T_world_curr, curr_entry);

  const float min_scene_depth = 0.4f;
  const float max_scene_depth = 1.8f;

  rmd::SeedMatrix seeds(ref_img.cols, ref_img.rows, cam);

  StopWatchInterface * timer = NULL;
  sdkCreateTimer(&timer);
  sdkResetTimer(&timer);
  sdkStartTimer(&timer);

  seeds.setReferenceImage(
        reinterpret_cast<float*>(ref_img_flt.data),
        T_world_ref.inv(),
        min_scene_depth,
        max_scene_depth);

  sdkStopTimer(&timer);
  double t = sdkGetAverageTimerValue(&timer) / 1000.0;
  printf("setReference image CUDA execution time: %f seconds.\n", t);

  cv::Mat initial_depthmap(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadDepthmap(reinterpret_cast<float*>(initial_depthmap.data));

  cv::Mat initial_sigma_sq(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadSigmaSq(reinterpret_cast<float*>(initial_sigma_sq.data));

  cv::Mat initial_a(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadA(reinterpret_cast<float*>(initial_a.data));

  cv::Mat initial_b(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadB(reinterpret_cast<float*>(initial_b.data));

  const float avg_scene_depth = (min_scene_depth+max_scene_depth)/2.0f;
  const float max_scene_sigma_sq = (max_scene_depth - min_scene_depth) * (max_scene_depth - min_scene_depth) / 36.0f;
  for(size_t r=0; r<ref_img.rows; ++r)
  {
    for(size_t c=0; c<ref_img.cols; ++c)
    {
      ASSERT_FLOAT_EQ(avg_scene_depth, initial_depthmap.at<float>(r, c));
      ASSERT_FLOAT_EQ(max_scene_sigma_sq, initial_sigma_sq.at<float>(r, c));
      ASSERT_FLOAT_EQ(10.0f, initial_a.at<float>(r, c));
      ASSERT_FLOAT_EQ(10.0f, initial_b.at<float>(r, c));
    }
  }

  // Test initialization of NCC template statistics

  // CUDA computation
  cv::Mat cu_sum_templ(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadSumTempl(reinterpret_cast<float*>(cu_sum_templ.data));
  cv::Mat cu_const_templ_denom(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadConstTemplDenom(reinterpret_cast<float*>(cu_const_templ_denom.data));

  // Host computation
  cv::Mat ocv_sum_templ(ref_img.rows, ref_img.cols, CV_32FC1);
  cv::Mat ocv_const_templ_denom(ref_img.rows, ref_img.cols, CV_32FC1);

  const int side = seeds.getPatchSide();
  for(size_t y=side; y<ref_img.rows-side/2; ++y)
  {
    for(size_t x=side; x<ref_img.cols-side/2; ++x)
    {
      double sum_templ    = 0.0f;
      double sum_templ_sq = 0.0f;
      for(int patch_y=0; patch_y<side; ++patch_y)
      {
        for(int patch_x=0; patch_x<side; ++patch_x)
        {
          const double templ = (double) ref_img_flt.at<float>( y-side/2+patch_y, x-side/2+patch_x );
          sum_templ += templ;
          sum_templ_sq += templ*templ;
        }
      }
      ocv_sum_templ.at<float>(y, x) = (float) sum_templ;
      ocv_const_templ_denom.at<float>(y, x) = (float) ( ((double)(side*side))*sum_templ_sq - sum_templ*sum_templ );
    }
  }
  for(size_t r=side; r<ref_img.rows-side/2; ++r)
  {
    for(size_t c=side; c<ref_img.cols-side/2; ++c)
    {
      ASSERT_NEAR(ocv_sum_templ.at<float>(r, c), cu_sum_templ.at<float>(r, c), 0.00001f);
      ASSERT_NEAR(ocv_const_templ_denom.at<float>(r, c), cu_const_templ_denom.at<float>(r, c), 0.001f);
    }
  }

}
TEST(RMDCuTests, seedMatrixCheck)
{
  const boost::filesystem::path dataset_path("../test_data");
  const boost::filesystem::path sequence_file_path("../test_data/first_200_frames_traj_over_table_input_sequence.txt");

  rmd::PinholeCamera cam(481.2f, -480.0f, 319.5f, 239.5f);

  rmd::test::Dataset dataset(dataset_path.string(), sequence_file_path.string(), cam);
  if (!dataset.readDataSequence())
    FAIL() << "could not read dataset";

  const size_t ref_ind = 1;
  const size_t curr_ind = 20;

  const auto ref_entry = dataset(ref_ind);
  cv::Mat ref_img;
  dataset.readImage(ref_img, ref_entry);
  cv::Mat ref_img_flt;
  ref_img.convertTo(ref_img_flt, CV_32F, 1.0f/255.0f);

  cv::Mat ref_depthmap;
  dataset.readDepthmap(ref_depthmap, ref_entry, ref_img.cols, ref_img.rows);

  rmd::SE3<float> T_world_ref;
  dataset.readCameraPose(T_world_ref, ref_entry);

  const auto curr_entry = dataset(curr_ind);
  cv::Mat curr_img;
  dataset.readImage(curr_img, curr_entry);
  cv::Mat curr_img_flt;
  curr_img.convertTo(curr_img_flt, CV_32F, 1.0f/255.0f);

  rmd::SE3<float> T_world_curr;
  dataset.readCameraPose(T_world_curr, curr_entry);

  const float min_scene_depth = 0.4f;
  const float max_scene_depth = 1.8f;

  rmd::SeedMatrix seeds(ref_img.cols, ref_img.rows, cam);

  seeds.setReferenceImage(
        reinterpret_cast<float*>(ref_img_flt.data),
        T_world_ref.inv(),
        min_scene_depth,
        max_scene_depth);

  StopWatchInterface * timer = NULL;
  sdkCreateTimer(&timer);
  sdkResetTimer(&timer);
  sdkStartTimer(&timer);

  seeds.update(
        reinterpret_cast<float*>(ref_img_flt.data),
        T_world_curr.inv());

  sdkStopTimer(&timer);
  double t = sdkGetAverageTimerValue(&timer) / 1000.0;
  printf("update CUDA execution time: %f seconds.\n", t);

  cv::Mat cu_convergence(ref_img.rows, ref_img.cols, CV_32SC1);
  seeds.downloadConvergence(reinterpret_cast<int*>(cu_convergence.data));

  const int side = seeds.getPatchSide();
  for(size_t r=0; r<ref_img.rows; ++r)
  {
    for(size_t c=0; c<ref_img.cols; ++c)
    {
      if(r>ref_img.rows-side-1
         || r<side
         || c>ref_img.cols-side-1
         || c<side)
      {
        ASSERT_EQ(rmd::ConvergenceStates::BORDER, cu_convergence.at<int>(r, c)) << "(r, c) = (" << r << ", " << c <<")";
      }
      else
      {
        const int result = cu_convergence.at<int>(r, c);
        const bool success = (result == rmd::ConvergenceStates::UPDATE      ||
                              result == rmd::ConvergenceStates::DIVERGED    ||
                              result == rmd::ConvergenceStates::CONVERGED   ||
                              result == rmd::ConvergenceStates::NOT_VISIBLE ||
                              result == rmd::ConvergenceStates::NO_MATCH    );
        ASSERT_EQ(true, success) << "(r, c) = (" << r << ", " << c <<")";
      }
    }
  }

}
bool
runTest(int argc, char **argv, ReduceType datatype)
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    if (checkCmdLineFlag(argc, (const char **) argv, "n"))
    {
        size = getCmdLineArgumentInt(argc, (const char **) argv, "n");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
    {
        maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "kernel"))
    {
        whichKernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel");
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
    {
        maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks");
    }

    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);

    cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");

    if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
    {
        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
    }

    bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo");

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);

        T *h_idata = (T *) malloc(bytes);

        for (int i=0; i<size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
            {
                h_idata[i] = (T)(rand() & 0xFF);
            }
            else
            {
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
            }
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

        if (numBlocks == 1)
        {
            cpuFinalThreshold = 1;
        }

        // allocate mem for the result on host side
        T *h_odata = (T *) malloc(numBlocks*sizeof(T));

        printf("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T *d_idata = NULL;
        T *d_odata = NULL;

        checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
        checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T)));

        // copy data directly to device memory
        checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice));

        // warm-up
        reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        int testIterations = 100;

        StopWatchInterface *timer = 0;
        sdkCreateTimer(&timer);

        T gpu_result = 0;

        gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction,
                                        cpuFinalThreshold, timer,
                                        h_odata, d_idata, d_odata);

        double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
        printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = reduceCPU<T>(h_idata, size);

        int precision = 0;
        double threshold = 0;
        double diff = 0;

        if (datatype == REDUCE_INT)
        {
            printf("\nGPU result = %d\n", (int)gpu_result);
            printf("CPU result = %d\n\n", (int)cpu_result);
        }
        else
        {
            if (datatype == REDUCE_FLOAT)
            {
                precision = 8;
                threshold = 1e-8 * size;
            }
            else
            {
                precision = 12;
                threshold = 1e-12 * size;
            }

            printf("\nGPU result = %.*f\n", precision, (double)gpu_result);
            printf("CPU result = %.*f\n\n", precision, (double)cpu_result);

            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        sdkDeleteTimer(&timer);
        free(h_idata);
        free(h_odata);

        checkCudaErrors(cudaFree(d_idata));
        checkCudaErrors(cudaFree(d_odata));

        if (datatype == REDUCE_INT)
        {
            return (gpu_result == cpu_result);
        }
        else
        {
            return (diff < threshold);
        }
    }

    return true;
}
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);

    T *h_idata = (T *) malloc(bytes);

    for (int i = 0; i < maxN; i++)
    {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
        {
            h_idata[i] = (T)(rand() & 0xFF);
        }
        else
        {
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }
    }

    int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T *h_odata = (T *) malloc(maxNumBlocks*sizeof(T));

    // allocate device memory and data
    T *d_idata = NULL;
    T *d_odata = NULL;

    checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
    checkCudaErrors(cudaMalloc((void **) &d_odata, maxNumBlocks*sizeof(T)));

    // copy data directly to device memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice));

    // warm-up
    for (int kernel = 0; kernel < 7; kernel++)
    {
        reduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata);
    }

    int testIterations = 100;

    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);

    // print headers
    printf("Time in milliseconds for various numbers of elements for each kernel\n\n\n");
    printf("Kernel");

    for (int i = minN; i <= maxN; i *= 2)
    {
        printf(", %d", i);
    }

    for (int kernel = 0; kernel < 7; kernel++)
    {
        printf("\n%d", kernel);

        for (int i = minN; i <= maxN; i *= 2)
        {
            sdkResetTimer(&timer);
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);

            float reduceTime;

            if (numBlocks <= MAX_BLOCK_DIM_SIZE)
            {
                benchmarkReduce(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
                                testIterations, false, 1, timer, h_odata, d_idata, d_odata);
                reduceTime =  sdkGetAverageTimerValue(&timer);
            }
            else
            {
                reduceTime = -1.0;
            }

            printf(", %.5f", reduceTime);
        }
    }

    // cleanup
    sdkDeleteTimer(&timer);
    free(h_idata);
    free(h_odata);

    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
}
TEST(RMDCuTests, deviceImageSobelTexTest)
{
  rmd::test::Dataset dataset;
  if(!dataset.loadPathFromEnv())
  {
    FAIL() << "could not retrieve dataset path from the environment variable '"
           << rmd::test::Dataset::getDataPathEnvVar();
  }
  cv::Mat img;
  if(!dataset.readImage(img, "scene_000.png"))
  {
    FAIL() << "could not could not load test image from dataset";
  }

  cv::Mat img_flt;
  img.convertTo(img_flt, CV_32F, 1./255.);

  // Compare results of the Scharr operator to compute image gradient
  const size_t w = img_flt.cols;
  const size_t h = img_flt.rows;

  // Opencv gradient computation
  cv::Mat ocv_grad_x(h, w, CV_32FC1);
  cv::Mat ocv_grad_y(h, w, CV_32FC1);
  double t = (double)cv::getTickCount();
  cv::Sobel(img_flt, ocv_grad_x, CV_32F, 1, 0, CV_SCHARR);
  cv::Sobel(img_flt, ocv_grad_y, CV_32F, 0, 1, CV_SCHARR);
  t = ((double)cv::getTickCount() - t)/cv::getTickFrequency();
  printf("Opencv execution time: %f seconds.\n", t);

  // CUDA gradient computation

  // upload data to device memory
  rmd::DeviceImage<float> in_img(w, h);
  in_img.setDevData(reinterpret_cast<float*>(img_flt.data));

  // compute gradient on device
  rmd::DeviceImage<float2> out_grad(w, h);

  StopWatchInterface * timer = NULL;
  sdkCreateTimer(&timer);
  sdkResetTimer(&timer);
  sdkStartTimer(&timer);
  rmd::sobelTex(in_img, out_grad);
  sdkStopTimer(&timer);
  t = sdkGetAverageTimerValue(&timer) / 1000.0;
  printf("CUDA execution time: %f seconds.\n", t);

  // download result to host memory
  float2 * cu_grad = new float2[w*h];
  out_grad.getDevData(cu_grad);

  for(size_t y=1; y<h-1; ++y)
  {
    for(size_t x=1; x<w-1; ++x)
    {
      ASSERT_NEAR(ocv_grad_x.at<float>(y, x), cu_grad[y*w+x].x, 0.00001f);
      ASSERT_NEAR(ocv_grad_y.at<float>(y, x), cu_grad[y*w+x].y, 0.00001f);
    }
  }
  delete cu_grad;
}