void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; char meshInf[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); std::string f; if (psystem->getFriction()) sprintf(fps, "CUDA Particles (%d particles): %3.1f fps with friction", (int)psystem->getNumParticles(), ifps ); else sprintf(fps, "CUDA Particles (%d particles): %3.1f fps without friction", (int)psystem->getNumParticles(), ifps); if (displayMode == ParticleRenderer::MESH_MODE) { sprintf(meshInf, " Face:%d Vertex:%d ", psystem->getNumFaces(), psystem->getNumVertexes()); strcat(fps, meshInf); } glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (int)MAX(ifps, 1.f); sdkResetTimer(&timer); } }
void siTest(T *d_ptclA, T *d_ptclA_new, T *d_wghtA, unsigned int size, int stateDim) { int blocks, threads; float elapsedTimeInMs = 0.0f; threads = BLOCK_SIZE; blocks = (size + threads - 1) / threads; #ifdef NVS while (blocks > GRID_LIMIT){ blocks >>= 1; threads <<= 1; } #endif StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); for (int i = 0 ; i < TEST_ITERATIONS ; i ++){ cudaDeviceSynchronize(); sdkStartTimer(&timer); SI<T>(blocks, threads, d_ptclA, d_ptclA_new, d_wghtA, size, stateDim); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&timer); } elapsedTimeInMs = sdkGetAverageTimerValue(&timer); printf("%f\t", elapsedTimeInMs); printf("size=%u, stateDim=%d, blocks=%d, threads=%d\n",size, stateDim, blocks, threads); sdkDeleteTimer(&timer); }
void display(void) { pthread_mutex_lock(&display_mutex); if (!ref_file) { sdkStartTimer(&timer); simulateFluids(); } // render points from vertex buffer glClear(GL_COLOR_BUFFER_BIT); glClearColor(1.f/256*172, 1.f/256*101, 1.f/256*4, 0.f); glColor4f(1.f, 1.f, 1.f, 0.5f); glPointSize(1); glEnable(GL_POINT_SMOOTH); glEnable(GL_BLEND); glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); glEnableClientState(GL_VERTEX_ARRAY); glDisable(GL_DEPTH_TEST); glDisable(GL_CULL_FACE); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); #ifdef OPTIMUS glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); #endif glVertexPointer(2, GL_FLOAT, 0, NULL); glDrawArrays(GL_POINTS, 0, DS); glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); glDisableClientState(GL_VERTEX_ARRAY); glDisableClientState(GL_TEXTURE_COORD_ARRAY); glDisable(GL_TEXTURE_2D); if (ref_file) { return; } // Finish timing before swap buffers to avoid refresh sync sdkStopTimer(&timer); glutSwapBuffers(); fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "Caffe Macchiato / Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (int)MAX(ifps, 1.f); sdkResetTimer(&timer); } glutPostRedisplay(); pthread_mutex_unlock(&display_mutex); }
void display(void) { if (!ref_file) { sdkStartTimer(&timer); simulateFluids(); } // render points from vertex buffer glClear(GL_COLOR_BUFFER_BIT); glColor4f(0,1,0,0.5f); glPointSize(1); glEnable(GL_POINT_SMOOTH); glEnable(GL_BLEND); glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); glEnableClientState(GL_VERTEX_ARRAY); glDisable(GL_DEPTH_TEST); glDisable(GL_CULL_FACE); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glVertexPointer(2, GL_FLOAT, 0, NULL); glDrawArrays(GL_POINTS, 0, DS); glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); glDisableClientState(GL_VERTEX_ARRAY); glDisableClientState(GL_TEXTURE_COORD_ARRAY); glDisable(GL_TEXTURE_2D); if (ref_file) { return; } // Finish timing before swap buffers to avoid refresh sync sdkStopTimer(&timer); glutSwapBuffers(); fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "Cuda/GL Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (int)MAX(ifps, 1.f); sdkResetTimer(&timer); } glutPostRedisplay(); }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&hTimer) / 1000.f); sprintf(fps, "<CUDA %s Set> %3.1f fps", g_isJuliaSet ? "Julia" : "Mandelbrot", ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = MAX(1.f, (float)ifps); sdkResetTimer(&hTimer); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "Dark Matter PBVR: %3.1f fps (Max 100Hz)", ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (int)MAX(ifps, 1.f); sdkResetTimer(&timer); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "CUDA Edge Detection (%s): %3.1f fps", filterMode[g_SobelDisplayMode], ifps); glutSetWindowTitle(fps); fpsCount = 0; sdkResetTimer(&timer); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "%s (sigma=%4.2f): %3.1f fps", sSDKsample, sigma, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = ftoi(MAX(ifps, 1.f)); sdkResetTimer(&timer); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "<%s>: %3.1f fps", filterMode[g_Kernel], ifps); glutSetWindowTitle(fps); fpsCount = 0; //fpsLimit = (int)MAX(ifps, 1.f); sdkResetTimer(&timer); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); fpsCount = 0; fpsLimit = (int)MAX(avgFPS, 1.f); sdkResetTimer(&timer); } char fps[256]; sprintf(fps, "MPI Cuda GL Interop (VBO): %3.1f fps (Max 100Hz)", avgFPS); glutSetWindowTitle(fps); }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "CUDA Particles (%d particles): %3.1f fps", numParticles, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (int)MAX(ifps, 1.f); sdkResetTimer(&timer); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "CUDA 3D Volume Filtering: %3.1f fps", ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = MAX(1.f, ifps); sdkResetTimer(&timer); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; //float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "Dam Break (%d particles): %3.1f fps; elapsed Time: %f", psystem->getNumParticles(), ifps, psystem->getElapsedTime()); glutSetWindowTitle(fps); fpsCount = 0; //cutilCheckError(cutResetTimer(timer)); sdkResetTimer(&timer); } }
void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit-1) { g_Verify = true; } if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "%s %s <%s>: %3.1f fps", "", sSDKsample, sFilterMode[g_FilterMode], ifps); glutSetWindowTitle(fps); fpsCount = 0; sdkResetTimer(&timer); } }
// Calculate the Frames per second and print in the title bar void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit) { avgFPS = 1.0f / (sdkGetAverageTimerValue(&timer) / 1000.0f); fpsCount = 0; fpsLimit = (int)MAX(avgFPS, 1.0f); sdkResetTimer(&timer); } char fps[256]; sprintf(fps, "CUDA Rolling Box Filter <Animation=%s> (radius=%d, passes=%d): %3.1f fps", (!g_bInteractive ? "ON" : "OFF"), filter_radius, iterations, avgFPS); glutSetWindowTitle(fps); if (!g_bInteractive) { varySigma(); } }
void computeFPS() { fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.0f / (sdkGetAverageTimerValue(&timer) / 1000.0f); sprintf(fps, "CUDA Bilateral Filter: %3.f fps (radius=%d, iter=%d, euclidean=%.2f, gaussian=%.2f)", ifps, filter_radius, iterations, (double)euclidean_delta, (double)gaussian_delta); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (int)MAX(ifps, 1.0f); sdkResetTimer(&timer); } if (!g_bInteractive) { varyEuclidean(); } }
void computeFPS(HWND hWnd, bool bUseInterop) { sdkStopTimer(&frame_timer); if (g_bRunning) { g_fpsCount++; if (!g_pFrameQueue->isEndOfDecode()) { g_FrameCount++; } } char sFPS[256]; std::string sDecodeStatus; if (g_bDeviceLost) { sDecodeStatus = "DeviceLost!\0"; sprintf(sFPS, "%s [%s] - [%s %d]", sAppName, sDecodeStatus.c_str(), (g_bIsProgressive ? "Frame" : "Field"), g_DecodeFrameCount); if (bUseInterop && (!g_bQAReadback)) { SetWindowText(hWnd, sFPS); UpdateWindow(hWnd); } sdkResetTimer(&frame_timer); g_fpsCount = 0; return; } if (g_pFrameQueue->isEndOfDecode()) { sDecodeStatus = "STOP (End of File)\0"; // we only want to record this once if (total_time == 0.0f) { total_time = sdkGetTimerValue(&global_timer); } sdkStopTimer(&global_timer); if (g_bAutoQuit) { g_bRunning = false; g_bDone = true; } } else { if (!g_bRunning) { sDecodeStatus = "PAUSE\0"; sprintf(sFPS, "%s [%s] - [%s %d] - Video Display %s / Vsync %s", sAppName, sDecodeStatus.c_str(), (g_bIsProgressive ? "Frame" : "Field"), g_DecodeFrameCount, g_bUseDisplay ? "ON" : "OFF", g_bUseVsync ? "ON" : "OFF"); if (bUseInterop && (!g_bQAReadback)) { SetWindowText(hWnd, sFPS); UpdateWindow(hWnd); } } else { if (g_bFrameStep) { sDecodeStatus = "STEP\0"; } else { sDecodeStatus = "PLAY\0"; } } if (g_fpsCount == g_fpsLimit) { float ifps = 1.f / (sdkGetAverageTimerValue(&frame_timer) / 1000.f); sprintf(sFPS, "[%s] [%s] - [%3.1f fps, %s %d] - Video Display %s / Vsync %s", sAppName, sDecodeStatus.c_str(), ifps, (g_bIsProgressive ? "Frame" : "Field"), g_DecodeFrameCount, g_bUseDisplay ? "ON" : "OFF", g_bUseVsync ? "ON" : "OFF"); if (bUseInterop && (!g_bQAReadback)) { SetWindowText(hWnd, sFPS); UpdateWindow(hWnd); } printf("[%s] - [%s: %04d, %04.1f fps, time: %04.2f (ms) ]\n", sSDKname, (g_bIsProgressive ? "Frame" : "Field"), g_FrameCount, ifps, 1000.f/ifps); sdkResetTimer(&frame_timer); g_fpsCount = 0; } } sdkStartTimer(&frame_timer); }
bool InfiniTAMApp::ProcessFrame(void) { if (!mImageSource->hasMoreImages()) return false; mImageSource->getImages(inputRGBImage, inputRawDepthImage); if (mImuSource != NULL) { if (!mImuSource->hasMoreMeasurements()) return false; else mImuSource->getMeasurement(inputIMUMeasurement); } sdkResetTimer(&timer_instant); sdkStartTimer(&timer_instant); sdkStartTimer(&timer_average); //actual processing on the mainEngine if (mImuSource != NULL) mMainEngine->ProcessFrame(inputRGBImage, inputRawDepthImage, inputIMUMeasurement); else mMainEngine->ProcessFrame(inputRGBImage, inputRawDepthImage); ITMSafeCall(cudaDeviceSynchronize()); sdkStopTimer(&timer_instant); sdkStopTimer(&timer_average); __android_log_print(ANDROID_LOG_VERBOSE, "InfiniTAM", "Process Frame finished: %f %f", sdkGetTimerValue(&timer_instant), sdkGetAverageTimerValue(&timer_average)); return true; }
TEST(RMDCuTests, seedMatrixInit) { const boost::filesystem::path dataset_path("../test_data"); const boost::filesystem::path sequence_file_path("../test_data/first_200_frames_traj_over_table_input_sequence.txt"); rmd::PinholeCamera cam(481.2f, -480.0f, 319.5f, 239.5f); rmd::test::Dataset dataset(dataset_path.string(), sequence_file_path.string(), cam); if (!dataset.readDataSequence()) FAIL() << "could not read dataset"; const size_t ref_ind = 1; const size_t curr_ind = 20; const auto ref_entry = dataset(ref_ind); cv::Mat ref_img; dataset.readImage(ref_img, ref_entry); cv::Mat ref_img_flt; ref_img.convertTo(ref_img_flt, CV_32F, 1.0f/255.0f); cv::Mat ref_depthmap; dataset.readDepthmap(ref_depthmap, ref_entry, ref_img.cols, ref_img.rows); rmd::SE3<float> T_world_ref; dataset.readCameraPose(T_world_ref, ref_entry); const auto curr_entry = dataset(curr_ind); cv::Mat curr_img; dataset.readImage(curr_img, curr_entry); cv::Mat curr_img_flt; curr_img.convertTo(curr_img_flt, CV_32F, 1.0f/255.0f); rmd::SE3<float> T_world_curr; dataset.readCameraPose(T_world_curr, curr_entry); const float min_scene_depth = 0.4f; const float max_scene_depth = 1.8f; rmd::SeedMatrix seeds(ref_img.cols, ref_img.rows, cam); StopWatchInterface * timer = NULL; sdkCreateTimer(&timer); sdkResetTimer(&timer); sdkStartTimer(&timer); seeds.setReferenceImage( reinterpret_cast<float*>(ref_img_flt.data), T_world_ref.inv(), min_scene_depth, max_scene_depth); sdkStopTimer(&timer); double t = sdkGetAverageTimerValue(&timer) / 1000.0; printf("setReference image CUDA execution time: %f seconds.\n", t); cv::Mat initial_depthmap(ref_img.rows, ref_img.cols, CV_32FC1); seeds.downloadDepthmap(reinterpret_cast<float*>(initial_depthmap.data)); cv::Mat initial_sigma_sq(ref_img.rows, ref_img.cols, CV_32FC1); seeds.downloadSigmaSq(reinterpret_cast<float*>(initial_sigma_sq.data)); cv::Mat initial_a(ref_img.rows, ref_img.cols, CV_32FC1); seeds.downloadA(reinterpret_cast<float*>(initial_a.data)); cv::Mat initial_b(ref_img.rows, ref_img.cols, CV_32FC1); seeds.downloadB(reinterpret_cast<float*>(initial_b.data)); const float avg_scene_depth = (min_scene_depth+max_scene_depth)/2.0f; const float max_scene_sigma_sq = (max_scene_depth - min_scene_depth) * (max_scene_depth - min_scene_depth) / 36.0f; for(size_t r=0; r<ref_img.rows; ++r) { for(size_t c=0; c<ref_img.cols; ++c) { ASSERT_FLOAT_EQ(avg_scene_depth, initial_depthmap.at<float>(r, c)); ASSERT_FLOAT_EQ(max_scene_sigma_sq, initial_sigma_sq.at<float>(r, c)); ASSERT_FLOAT_EQ(10.0f, initial_a.at<float>(r, c)); ASSERT_FLOAT_EQ(10.0f, initial_b.at<float>(r, c)); } } // Test initialization of NCC template statistics // CUDA computation cv::Mat cu_sum_templ(ref_img.rows, ref_img.cols, CV_32FC1); seeds.downloadSumTempl(reinterpret_cast<float*>(cu_sum_templ.data)); cv::Mat cu_const_templ_denom(ref_img.rows, ref_img.cols, CV_32FC1); seeds.downloadConstTemplDenom(reinterpret_cast<float*>(cu_const_templ_denom.data)); // Host computation cv::Mat ocv_sum_templ(ref_img.rows, ref_img.cols, CV_32FC1); cv::Mat ocv_const_templ_denom(ref_img.rows, ref_img.cols, CV_32FC1); const int side = seeds.getPatchSide(); for(size_t y=side; y<ref_img.rows-side/2; ++y) { for(size_t x=side; x<ref_img.cols-side/2; ++x) { double sum_templ = 0.0f; double sum_templ_sq = 0.0f; for(int patch_y=0; patch_y<side; ++patch_y) { for(int patch_x=0; patch_x<side; ++patch_x) { const double templ = (double) ref_img_flt.at<float>( y-side/2+patch_y, x-side/2+patch_x ); sum_templ += templ; sum_templ_sq += templ*templ; } } ocv_sum_templ.at<float>(y, x) = (float) sum_templ; ocv_const_templ_denom.at<float>(y, x) = (float) ( ((double)(side*side))*sum_templ_sq - sum_templ*sum_templ ); } } for(size_t r=side; r<ref_img.rows-side/2; ++r) { for(size_t c=side; c<ref_img.cols-side/2; ++c) { ASSERT_NEAR(ocv_sum_templ.at<float>(r, c), cu_sum_templ.at<float>(r, c), 0.00001f); ASSERT_NEAR(ocv_const_templ_denom.at<float>(r, c), cu_const_templ_denom.at<float>(r, c), 0.001f); } } }
TEST(RMDCuTests, seedMatrixCheck) { const boost::filesystem::path dataset_path("../test_data"); const boost::filesystem::path sequence_file_path("../test_data/first_200_frames_traj_over_table_input_sequence.txt"); rmd::PinholeCamera cam(481.2f, -480.0f, 319.5f, 239.5f); rmd::test::Dataset dataset(dataset_path.string(), sequence_file_path.string(), cam); if (!dataset.readDataSequence()) FAIL() << "could not read dataset"; const size_t ref_ind = 1; const size_t curr_ind = 20; const auto ref_entry = dataset(ref_ind); cv::Mat ref_img; dataset.readImage(ref_img, ref_entry); cv::Mat ref_img_flt; ref_img.convertTo(ref_img_flt, CV_32F, 1.0f/255.0f); cv::Mat ref_depthmap; dataset.readDepthmap(ref_depthmap, ref_entry, ref_img.cols, ref_img.rows); rmd::SE3<float> T_world_ref; dataset.readCameraPose(T_world_ref, ref_entry); const auto curr_entry = dataset(curr_ind); cv::Mat curr_img; dataset.readImage(curr_img, curr_entry); cv::Mat curr_img_flt; curr_img.convertTo(curr_img_flt, CV_32F, 1.0f/255.0f); rmd::SE3<float> T_world_curr; dataset.readCameraPose(T_world_curr, curr_entry); const float min_scene_depth = 0.4f; const float max_scene_depth = 1.8f; rmd::SeedMatrix seeds(ref_img.cols, ref_img.rows, cam); seeds.setReferenceImage( reinterpret_cast<float*>(ref_img_flt.data), T_world_ref.inv(), min_scene_depth, max_scene_depth); StopWatchInterface * timer = NULL; sdkCreateTimer(&timer); sdkResetTimer(&timer); sdkStartTimer(&timer); seeds.update( reinterpret_cast<float*>(ref_img_flt.data), T_world_curr.inv()); sdkStopTimer(&timer); double t = sdkGetAverageTimerValue(&timer) / 1000.0; printf("update CUDA execution time: %f seconds.\n", t); cv::Mat cu_convergence(ref_img.rows, ref_img.cols, CV_32SC1); seeds.downloadConvergence(reinterpret_cast<int*>(cu_convergence.data)); const int side = seeds.getPatchSide(); for(size_t r=0; r<ref_img.rows; ++r) { for(size_t c=0; c<ref_img.cols; ++c) { if(r>ref_img.rows-side-1 || r<side || c>ref_img.cols-side-1 || c<side) { ASSERT_EQ(rmd::ConvergenceStates::BORDER, cu_convergence.at<int>(r, c)) << "(r, c) = (" << r << ", " << c <<")"; } else { const int result = cu_convergence.at<int>(r, c); const bool success = (result == rmd::ConvergenceStates::UPDATE || result == rmd::ConvergenceStates::DIVERGED || result == rmd::ConvergenceStates::CONVERGED || result == rmd::ConvergenceStates::NOT_VISIBLE || result == rmd::ConvergenceStates::NO_MATCH ); ASSERT_EQ(true, success) << "(r, c) = (" << r << ", " << c <<")"; } } } }
bool runTest(int argc, char **argv, ReduceType datatype) { int size = 1<<24; // number of elements to reduce int maxThreads = 256; // number of threads per block int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; if (checkCmdLineFlag(argc, (const char **) argv, "n")) { size = getCmdLineArgumentInt(argc, (const char **) argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "threads")) { maxThreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **) argv, "kernel")) { whichKernel = getCmdLineArgumentInt(argc, (const char **) argv, "kernel"); } if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks")) { maxBlocks = getCmdLineArgumentInt(argc, (const char **) argv, "maxblocks"); } printf("%d elements\n", size); printf("%d threads (max)\n", maxThreads); cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal"); if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh")) { cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh"); } bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo"); if (runShmoo) { shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype); } else { // create random input data on CPU unsigned int bytes = size * sizeof(T); T *h_idata = (T *) malloc(bytes); for (int i=0; i<size; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) { h_idata[i] = (T)(rand() & 0xFF); } else { h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } } int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) { cpuFinalThreshold = 1; } // allocate mem for the result on host side T *h_odata = (T *) malloc(numBlocks*sizeof(T)); printf("%d blocks\n\n", numBlocks); // allocate device memory and data T *d_idata = NULL; T *d_odata = NULL; checkCudaErrors(cudaMalloc((void **) &d_idata, bytes)); checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T))); // copy data directly to device memory checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice)); // warm-up reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata); int testIterations = 100; StopWatchInterface *timer = 0; sdkCreateTimer(&timer); T gpu_result = 0; gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, testIterations, cpuFinalReduction, cpuFinalThreshold, timer, h_odata, d_idata, d_odata); double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3; printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); // compute reference solution T cpu_result = reduceCPU<T>(h_idata, size); int precision = 0; double threshold = 0; double diff = 0; if (datatype == REDUCE_INT) { printf("\nGPU result = %d\n", (int)gpu_result); printf("CPU result = %d\n\n", (int)cpu_result); } else { if (datatype == REDUCE_FLOAT) { precision = 8; threshold = 1e-8 * size; } else { precision = 12; threshold = 1e-12 * size; } printf("\nGPU result = %.*f\n", precision, (double)gpu_result); printf("CPU result = %.*f\n\n", precision, (double)cpu_result); diff = fabs((double)gpu_result - (double)cpu_result); } // cleanup sdkDeleteTimer(&timer); free(h_idata); free(h_odata); checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(d_odata)); if (datatype == REDUCE_INT) { return (gpu_result == cpu_result); } else { return (diff < threshold); } } return true; }
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype) { // create random input data on CPU unsigned int bytes = maxN * sizeof(T); T *h_idata = (T *) malloc(bytes); for (int i = 0; i < maxN; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) { h_idata[i] = (T)(rand() & 0xFF); } else { h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } } int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE); // allocate mem for the result on host side T *h_odata = (T *) malloc(maxNumBlocks*sizeof(T)); // allocate device memory and data T *d_idata = NULL; T *d_odata = NULL; checkCudaErrors(cudaMalloc((void **) &d_idata, bytes)); checkCudaErrors(cudaMalloc((void **) &d_odata, maxNumBlocks*sizeof(T))); // copy data directly to device memory checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice)); // warm-up for (int kernel = 0; kernel < 7; kernel++) { reduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata); } int testIterations = 100; StopWatchInterface *timer = 0; sdkCreateTimer(&timer); // print headers printf("Time in milliseconds for various numbers of elements for each kernel\n\n\n"); printf("Kernel"); for (int i = minN; i <= maxN; i *= 2) { printf(", %d", i); } for (int kernel = 0; kernel < 7; kernel++) { printf("\n%d", kernel); for (int i = minN; i <= maxN; i *= 2) { sdkResetTimer(&timer); int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads); float reduceTime; if (numBlocks <= MAX_BLOCK_DIM_SIZE) { benchmarkReduce(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, testIterations, false, 1, timer, h_odata, d_idata, d_odata); reduceTime = sdkGetAverageTimerValue(&timer); } else { reduceTime = -1.0; } printf(", %.5f", reduceTime); } } // cleanup sdkDeleteTimer(&timer); free(h_idata); free(h_odata); checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(d_odata)); }
TEST(RMDCuTests, deviceImageSobelTexTest) { rmd::test::Dataset dataset; if(!dataset.loadPathFromEnv()) { FAIL() << "could not retrieve dataset path from the environment variable '" << rmd::test::Dataset::getDataPathEnvVar(); } cv::Mat img; if(!dataset.readImage(img, "scene_000.png")) { FAIL() << "could not could not load test image from dataset"; } cv::Mat img_flt; img.convertTo(img_flt, CV_32F, 1./255.); // Compare results of the Scharr operator to compute image gradient const size_t w = img_flt.cols; const size_t h = img_flt.rows; // Opencv gradient computation cv::Mat ocv_grad_x(h, w, CV_32FC1); cv::Mat ocv_grad_y(h, w, CV_32FC1); double t = (double)cv::getTickCount(); cv::Sobel(img_flt, ocv_grad_x, CV_32F, 1, 0, CV_SCHARR); cv::Sobel(img_flt, ocv_grad_y, CV_32F, 0, 1, CV_SCHARR); t = ((double)cv::getTickCount() - t)/cv::getTickFrequency(); printf("Opencv execution time: %f seconds.\n", t); // CUDA gradient computation // upload data to device memory rmd::DeviceImage<float> in_img(w, h); in_img.setDevData(reinterpret_cast<float*>(img_flt.data)); // compute gradient on device rmd::DeviceImage<float2> out_grad(w, h); StopWatchInterface * timer = NULL; sdkCreateTimer(&timer); sdkResetTimer(&timer); sdkStartTimer(&timer); rmd::sobelTex(in_img, out_grad); sdkStopTimer(&timer); t = sdkGetAverageTimerValue(&timer) / 1000.0; printf("CUDA execution time: %f seconds.\n", t); // download result to host memory float2 * cu_grad = new float2[w*h]; out_grad.getDevData(cu_grad); for(size_t y=1; y<h-1; ++y) { for(size_t x=1; x<w-1; ++x) { ASSERT_NEAR(ocv_grad_x.at<float>(y, x), cu_grad[y*w+x].x, 0.00001f); ASSERT_NEAR(ocv_grad_y.at<float>(y, x), cu_grad[y*w+x].y, 0.00001f); } } delete cu_grad; }