/**
 * Times the transfer of the three colour planes of an image from host to
 * device and back again, printing both durations in milliseconds.
 *
 * Temporary host and device buffers of w*h bytes per plane are allocated for
 * the round trip and released before returning. If a host allocation or any
 * CUDA call fails, a diagnostic is written to stderr, the remaining phases are
 * skipped, and everything that was allocated is still released.
 *
 * @param img_in  Source image; img_r/img_g/img_b are read as w*h bytes each.
 */
void copy_image(PPM_IMG img_in)
{
    StopWatchInterface *timer = NULL;
    PPM_IMG host_img;
    PPM_IMG device_img;

    // Do the byte count in size_t so large images cannot overflow an int.
    const size_t size = (size_t)img_in.w * (size_t)img_in.h * sizeof(unsigned char);

    host_img.w = img_in.w;
    host_img.h = img_in.h;
    host_img.img_r = (unsigned char *)malloc(size);
    host_img.img_g = (unsigned char *)malloc(size);
    host_img.img_b = (unsigned char *)malloc(size);

    device_img.w = img_in.w;
    device_img.h = img_in.h;
    // Start from NULL so the unconditional cudaFree calls below are safe.
    device_img.img_r = NULL;
    device_img.img_g = NULL;
    device_img.img_b = NULL;

    // malloc(0) may legitimately return NULL, so only treat NULL as a failure
    // when a non-empty buffer was requested.
    bool ok = (size == 0) ||
              (host_img.img_r != NULL && host_img.img_g != NULL && host_img.img_b != NULL);

    if (!ok)
    {
        fprintf(stderr, "copy_image: host allocation of %lu bytes per plane failed\n",
                (unsigned long)size);
    }

    cudaError_t err = cudaSuccess;

    if (ok)
    {
        err = cudaMalloc((void **)&(device_img.img_r), size);

        if (err == cudaSuccess) err = cudaMalloc((void **)&(device_img.img_g), size);

        if (err == cudaSuccess) err = cudaMalloc((void **)&(device_img.img_b), size);

        ok = (err == cudaSuccess);
    }

    if (ok)
    {
        launchEmptyKernel();  // launch an empty kernel before the timed copies

        printf("Starting copy image...\n");

        // CPU to GPU
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);
        err = cudaMemcpy(device_img.img_r, img_in.img_r, size, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) err = cudaMemcpy(device_img.img_g, img_in.img_g, size, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) err = cudaMemcpy(device_img.img_b, img_in.img_b, size, cudaMemcpyHostToDevice);

        sdkStopTimer(&timer);

        if (err == cudaSuccess)
        {
            printf("Time of copy image from CPU to GPU: %f (ms)\n", sdkGetTimerValue(&timer));
        }

        sdkDeleteTimer(&timer);
        ok = (err == cudaSuccess);
    }

    if (ok)
    {
        // GPU to CPU
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);
        err = cudaMemcpy(host_img.img_r, device_img.img_r, size, cudaMemcpyDeviceToHost);

        if (err == cudaSuccess) err = cudaMemcpy(host_img.img_g, device_img.img_g, size, cudaMemcpyDeviceToHost);

        if (err == cudaSuccess) err = cudaMemcpy(host_img.img_b, device_img.img_b, size, cudaMemcpyDeviceToHost);

        sdkStopTimer(&timer);

        if (err == cudaSuccess)
        {
            printf("Time of copy image from GPU to CPU: %f (ms)\n", sdkGetTimerValue(&timer));
        }

        sdkDeleteTimer(&timer);
    }

    if (err != cudaSuccess)
    {
        fprintf(stderr, "copy_image: CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(device_img.img_r);
    cudaFree(device_img.img_g);
    cudaFree(device_img.img_b);

    free(host_img.img_r);
    free(host_img.img_g);
    free(host_img.img_b);
}
void benchmark(int iterations) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); checkCudaErrors(cudaMalloc((void **) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&timer); // execute the kernel for (int i = 0; i < iterations; i++) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&timer); // check if kernel execution generated an error getLastCudaError("Kernel execution failed"); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("%.2f Mpixels/sec\n", (width*height*iterations / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); checkCudaErrors(cudaFree(d_result)); }
void siTest(T *d_ptclA, T *d_ptclA_new, T *d_wghtA, unsigned int size, int stateDim) { int blocks, threads; float elapsedTimeInMs = 0.0f; threads = BLOCK_SIZE; blocks = (size + threads - 1) / threads; #ifdef NVS while (blocks > GRID_LIMIT){ blocks >>= 1; threads <<= 1; } #endif StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); for (int i = 0 ; i < TEST_ITERATIONS ; i ++){ cudaDeviceSynchronize(); sdkStartTimer(&timer); SI<T>(blocks, threads, d_ptclA, d_ptclA_new, d_wghtA, size, stateDim); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&timer); } elapsedTimeInMs = sdkGetAverageTimerValue(&timer); printf("%f\t", elapsedTimeInMs); printf("size=%u, stateDim=%d, blocks=%d, threads=%d\n",size, stateDim, blocks, threads); sdkDeleteTimer(&timer); }
void runBenchmark(int iterations, char *exec_path) { printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations); cudaDeviceSynchronize(); sdkStartTimer(&timer); for (int i = 0; i < iterations; ++i) { psystem->update(timestep); } cudaDeviceSynchronize(); sdkStopTimer(&timer); float fAvgSeconds = ((float)1.0e-3 * (float)sdkGetTimerValue(&timer)/(float)iterations); printf("particles, Throughput = %.4f KParticles/s, Time = %.5f s, Size = %u particles, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-3 * numParticles)/fAvgSeconds, fAvgSeconds, numParticles, 1, 0); if (g_refFile) { printf("\nChecking result...\n\n"); float *hPos = (float *)malloc(sizeof(float)*4*psystem->getNumParticles()); copyArrayFromDevice(hPos, psystem->getCudaPosVBO(), 0, sizeof(float)*4*psystem->getNumParticles()); sdkDumpBin((void *)hPos, sizeof(float)*4*psystem->getNumParticles(), "particles.bin"); if (!sdkCompareBin2BinFloat("particles.bin", g_refFile, sizeof(float)*4*psystem->getNumParticles(), MAX_EPSILON_ERROR, THRESHOLD, exec_path)) { g_TotalErrors++; } } }
void runBenchmark(int iterations) { printf("[%s] (Benchmark Mode)\n", sSDKsample); sdkCreateTimer(&timer); uchar4 *d_output; checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); sdkStartTimer(&timer); for (int i = 0; i < iterations; ++i) { render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, g_FilterMode, d_output); } cudaDeviceSynchronize(); sdkStopTimer(&timer); float time = sdkGetTimerValue(&timer) / (float) iterations; checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); printf("time: %0.3f ms, %f Mpixels/sec\n", time, (width*height / (time * 0.001f)) / 1e6); }
void runAutoTest(const char *ref_file, char *exec_path) { checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(GLubyte)*4)); // render the volumeData render_kernel(gridSize, blockSize, d_output, width, height, w); checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("render_kernel failed"); void *h_output = malloc(width*height*sizeof(GLubyte)*4); checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*sizeof(GLubyte)*4, cudaMemcpyDeviceToHost)); sdkDumpBin(h_output, width*height*sizeof(GLubyte)*4, "simpleTexture3D.bin"); bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), width*height, MAX_EPSILON_ERROR, THRESHOLD, exec_path); checkCudaErrors(cudaFree(d_output)); free(h_output); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); sdkStopTimer(&timer); sdkDeleteTimer(&timer); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); }
void display(void) { pthread_mutex_lock(&display_mutex); if (!ref_file) { sdkStartTimer(&timer); simulateFluids(); } // render points from vertex buffer glClear(GL_COLOR_BUFFER_BIT); glClearColor(1.f/256*172, 1.f/256*101, 1.f/256*4, 0.f); glColor4f(1.f, 1.f, 1.f, 0.5f); glPointSize(1); glEnable(GL_POINT_SMOOTH); glEnable(GL_BLEND); glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); glEnableClientState(GL_VERTEX_ARRAY); glDisable(GL_DEPTH_TEST); glDisable(GL_CULL_FACE); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); #ifdef OPTIMUS glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); #endif glVertexPointer(2, GL_FLOAT, 0, NULL); glDrawArrays(GL_POINTS, 0, DS); glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); glDisableClientState(GL_VERTEX_ARRAY); glDisableClientState(GL_TEXTURE_COORD_ARRAY); glDisable(GL_TEXTURE_2D); if (ref_file) { return; } // Finish timing before swap buffers to avoid refresh sync sdkStopTimer(&timer); glutSwapBuffers(); fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "Caffe Macchiato / Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (int)MAX(ifps, 1.f); sdkResetTimer(&timer); } glutPostRedisplay(); pthread_mutex_unlock(&display_mutex); }
void displayFunc(void) { sdkStartTimer(&timer); TColor *d_dst = NULL; size_t num_bytes; if (frameCounter++ == 0) { sdkResetTimer(&timer); } // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); getLastCudaError("cudaGraphicsMapResources failed"); checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource)); getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); checkCudaErrors(CUDA_Bind2TextureArray()); runImageFilters(d_dst); checkCudaErrors(CUDA_UnbindTexture()); // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); // Common display code path { glClear(GL_COLOR_BUFFER_BIT); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0)); glBegin(GL_TRIANGLES); glTexCoord2f(0, 0); glVertex2f(-1, -1); glTexCoord2f(2, 0); glVertex2f(+3, -1); glTexCoord2f(0, 2); glVertex2f(-1, +3); glEnd(); glFinish(); } if (frameCounter == frameN) { frameCounter = 0; if (g_FPS) { printf("FPS: %3.1f\n", frameN / (sdkGetTimerValue(&timer) * 0.001)); g_FPS = false; } } glutSwapBuffers(); glutReportErrors(); sdkStopTimer(&timer); computeFPS(); }
/**
 * Main Hough voting loop.
 *
 * Every pixel of `dst` equal to 255 votes, for each radius from rmin up to
 * (but excluding) rmax in steps of rdelta, for the centre obtained by moving
 * back along that pixel's gradient direction. Votes go into `accummulator`;
 * the best cell seen so far is tracked in houghmaxval / x_maxval / y_maxval /
 * r_maxval. The elapsed time of the loop is printed to std::cout.
 *
 * @param dst  Edge image; read as uchar over the width x height area.
 */
void EyeDescriptor::mainHough(cv::Mat& dst)
{
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // pixels which should be considered
    std::vector<std::pair<int, int>> edgesIdx;

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            if (dst.at<uchar>(y, x) == 255)
            {
                edgesIdx.emplace_back(x, y);
            }
        }
    }

    // Radii with magnitude below this value are not voted for.
    const int eps = 40;

    for (const std::pair<int, int> &edge : edgesIdx)
    {
        const int x = edge.first;
        const int y = edge.second;

        // Index into the ci/si tables for this pixel.
        const int q_angle = localGradient_angles[x + y * width];
        const auto dirX = ci[q_angle];
        const auto dirY = si[q_angle];

        // we check for each radius from rmin to rmax
        for (double r = rmin; r < rmax; r += rdelta)
        {
            if (std::abs(r) < eps)
            {
                continue;
            }

            int ri = int(((r - rmin) / (rmax - rmin)) * rstepnumb);

            // small chance for drawing rmax and getting out of bounds
            if (ri == rstepnumb)
            {
                ri = rstepnumb - 1;
            }

            const int x0 = int(x - r * dirX + 0.5);
            const int y0 = int(y - r * dirY + 0.5);

            if (!(x0 >= 0 && x0 < width && y0 >= 0 && y0 < height))
            {
                continue;
            }

            const int votes = ++accummulator[x0 + y0 * width + ri * height * width];

            if (votes > houghmaxval)
            {
                houghmaxval = votes;
                x_maxval = x0;
                y_maxval = y0;
                r_maxval = int(std::abs(r) + 0.5);
            }
        }
    }

    sdkStopTimer(&timer);
    float execution_time = sdkGetTimerValue(&timer);
    // Release the stopwatch created above; it is local to this call.
    sdkDeleteTimer(&timer);

    std::cout << "Main loop, TIME: " << execution_time << "[ms]" << std::endl;
}
// display results using OpenGL void display() { sdkStartTimer(&timer); // execute filter, writing results to pbo unsigned int *dResult; //DEPRECATED: checkCudaErrors( cudaGLMapBufferObject((void**)&d_result, pbo) ); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dResult, &num_bytes, cuda_pbo_resource)); bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(pbo)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); // Common display code path { glClear(GL_COLOR_BUFFER_BIT); // load texture from pbo glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_2D, texid); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); // fragment program is required to display floating point texture glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader); glEnable(GL_FRAGMENT_PROGRAM_ARB); glDisable(GL_DEPTH_TEST); glBegin(GL_QUADS); { glTexCoord2f(0, 0); glVertex2f(0, 0); glTexCoord2f(1, 0); glVertex2f(1, 0); glTexCoord2f(1, 1); glVertex2f(1, 1); glTexCoord2f(0, 1); glVertex2f(0, 1); } glEnd(); glBindTexture(GL_TEXTURE_TYPE, 0); glDisable(GL_FRAGMENT_PROGRAM_ARB); } glutSwapBuffers(); glutReportErrors(); sdkStopTimer(&timer); computeFPS(); }
/**
 * Times the CPU RGB->YUV and YUV->RGB conversions of `img_in`, prints both
 * durations in milliseconds and writes the two results to "out_yuv.yuv" and
 * "out_rgb.ppm".
 *
 * The converted images are stored in img_obuf_yuv_cpu and img_obuf_rgb_cpu.
 *
 * @param img_in  Input RGB image.
 */
void run_cpu_color_test(PPM_IMG img_in)
{
    StopWatchInterface *timer = NULL;
    float elapsedMs;

    printf("Starting CPU processing...\n");

    // Stage 1: RGB -> YUV, measured with its own stopwatch.
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_yuv_cpu = rgb2yuv(img_in);
    sdkStopTimer(&timer);
    elapsedMs = sdkGetTimerValue(&timer);
    printf("RGB to YUV conversion time: %f (ms)\n", elapsedMs);
    sdkDeleteTimer(&timer);

    // Stage 2: YUV -> RGB, measured with a fresh stopwatch.
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_rgb_cpu = yuv2rgb(img_obuf_yuv_cpu);
    sdkStopTimer(&timer);
    elapsedMs = sdkGetTimerValue(&timer);
    printf("YUV to RGB conversion time: %f (ms)\n", elapsedMs);
    sdkDeleteTimer(&timer);

    write_yuv(img_obuf_yuv_cpu, "out_yuv.yuv");
    write_ppm(img_obuf_rgb_cpu, "out_rgb.ppm");
}
void display(void) { if (!ref_file) { sdkStartTimer(&timer); simulateFluids(); } // render points from vertex buffer glClear(GL_COLOR_BUFFER_BIT); glColor4f(0,1,0,0.5f); glPointSize(1); glEnable(GL_POINT_SMOOTH); glEnable(GL_BLEND); glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); glEnableClientState(GL_VERTEX_ARRAY); glDisable(GL_DEPTH_TEST); glDisable(GL_CULL_FACE); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glVertexPointer(2, GL_FLOAT, 0, NULL); glDrawArrays(GL_POINTS, 0, DS); glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); glDisableClientState(GL_VERTEX_ARRAY); glDisableClientState(GL_TEXTURE_COORD_ARRAY); glDisable(GL_TEXTURE_2D); if (ref_file) { return; } // Finish timing before swap buffers to avoid refresh sync sdkStopTimer(&timer); glutSwapBuffers(); fpsCount++; if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "Cuda/GL Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (int)MAX(ifps, 1.f); sdkResetTimer(&timer); } glutPostRedisplay(); }
/**
 * Times the GPU RGB->YUV and YUV->RGB conversions of `img_in`, prints both
 * durations in milliseconds and writes the two results to "out_rgb.ppm" and
 * "out_yuv.yuv".
 *
 * An empty kernel is launched before any timing starts. The converted images
 * are stored in img_obuf_yuv_gpu and img_obuf_rgb_gpu.
 *
 * @param img_in  Input RGB image.
 */
void run_gpu_color_test(PPM_IMG img_in)
{
    StopWatchInterface *timer = NULL;
    float elapsedMs;

    launchEmptyKernel();  // launch an empty kernel
    printf("Starting GPU processing...\n");

    // Stage 1: RGB -> YUV, measured with its own stopwatch.
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_yuv_gpu = rgb2yuvGPU(img_in);
    sdkStopTimer(&timer);
    elapsedMs = sdkGetTimerValue(&timer);
    printf("RGB to YUV conversion time(GPU): %f (ms)\n", elapsedMs);
    sdkDeleteTimer(&timer);

    // Stage 2: YUV -> RGB, measured with a fresh stopwatch.
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_rgb_gpu = yuv2rgbGPU(img_obuf_yuv_gpu);
    sdkStopTimer(&timer);
    elapsedMs = sdkGetTimerValue(&timer);
    printf("YUV to RGB conversion time(GPU): %f (ms)\n", elapsedMs);
    sdkDeleteTimer(&timer);

    write_ppm(img_obuf_rgb_gpu, "out_rgb.ppm");
    write_yuv(img_obuf_yuv_gpu, "out_yuv.yuv");
}
// This is the normal display path void display(void) { sdkStartTimer(&timer); // Sobel operation Pixel *data = NULL; // map PBO to get CUDA device pointer checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&data, &num_bytes, cuda_pbo_resource)); //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); glClear(GL_COLOR_BUFFER_BIT); glBindTexture(GL_TEXTURE_2D, texid); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, GL_UNSIGNED_BYTE, OFFSET(0)); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glDisable(GL_DEPTH_TEST); glEnable(GL_TEXTURE_2D); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); glBegin(GL_QUADS); glVertex2f(0, 0); glTexCoord2f(0, 0); glVertex2f(0, 1); glTexCoord2f(1, 0); glVertex2f(1, 1); glTexCoord2f(1, 1); glVertex2f(1, 0); glTexCoord2f(0, 1); glEnd(); glBindTexture(GL_TEXTURE_2D, 0); glutSwapBuffers(); sdkStopTimer(&timer); computeFPS(); }
//////////////////////////////////////////////////////////////////////////////// //! Display callback //////////////////////////////////////////////////////////////////////////////// void display() { sdkStartTimer(&timer); // run CUDA kernel to generate vertex positions runCuda(&cuda_vbo_resource); //簡易ライトセット glEnable(GL_LIGHTING); glEnable(GL_LIGHT0); glLightfv(GL_LIGHT0, GL_POSITION, gkLightPos); glLightfv(GL_LIGHT0, GL_DIFFUSE, gkLightDiff); // glLightfv(GL_LIGHT0, GL_AMBIENT, gkLightAmb); glEnable(GL_LIGHT1); glLightfv(GL_LIGHT1, GL_POSITION, gkLightPos2); glLightfv(GL_LIGHT1, GL_DIFFUSE, gkLightDiff2); //Zバッファ有効 glEnable(GL_DEPTH_TEST); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // set view matrix glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glTranslatef(0.0, 0.0, translate_z); glRotatef(rotate_x, 1.0, 0.0, 0.0); glRotatef(rotate_y, 0.0, 1.0, 0.0); // Earth // glMaterialfv(GL_FRONT, GL_DIFFUSE, gkMaterial); glutSolidSphere(50.0 * h_axis_radius, 20, 20); glDisable(GL_LIGHTING); // render from the vbo glBindBuffer(GL_ARRAY_BUFFER, vbo); glVertexPointer(4, GL_FLOAT, 0, 0); glEnableClientState(GL_VERTEX_ARRAY); glColor3f(1.0, 1.0, 1.0); glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); glDisableClientState(GL_VERTEX_ARRAY); glutSwapBuffers(); g_fAnim += 0.01f; sdkStopTimer(&timer); computeFPS(); }
bool InfiniTAMApp::ProcessFrame(void) { if (!mImageSource->hasMoreImages()) return false; mImageSource->getImages(inputRGBImage, inputRawDepthImage); if (mImuSource != NULL) { if (!mImuSource->hasMoreMeasurements()) return false; else mImuSource->getMeasurement(inputIMUMeasurement); } sdkResetTimer(&timer_instant); sdkStartTimer(&timer_instant); sdkStartTimer(&timer_average); //actual processing on the mainEngine if (mImuSource != NULL) mMainEngine->ProcessFrame(inputRGBImage, inputRawDepthImage, inputIMUMeasurement); else mMainEngine->ProcessFrame(inputRGBImage, inputRawDepthImage); ITMSafeCall(cudaDeviceSynchronize()); sdkStopTimer(&timer_instant); sdkStopTimer(&timer_average); __android_log_print(ANDROID_LOG_VERBOSE, "InfiniTAM", "Process Frame finished: %f %f", sdkGetTimerValue(&timer_instant), sdkGetAverageTimerValue(&timer_average)); return true; }
void _runBenchmark(int iterations) { // once without timing to prime the device if (!useCpu) { m_nbody->update(activeParams.m_timestep); } if (useCpu) { sdkCreateTimer(&timer); sdkStartTimer(&timer); } else { checkCudaErrors(cudaEventRecord(startEvent, 0)); } for (int i = 0; i < iterations; ++i) { m_nbody->update(activeParams.m_timestep); } float milliseconds = 0; if (useCpu) { sdkStopTimer(&timer); milliseconds = sdkGetTimerValue(&timer); sdkStartTimer(&timer); } else { checkCudaErrors(cudaEventRecord(stopEvent, 0)); checkCudaErrors(cudaEventSynchronize(stopEvent)); checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); printf("%d bodies, total time for %d iterations: %.3f ms, mean %f\n", numBodies, iterations, milliseconds, milliseconds/iterations); printf("= %.3f billion interactions per second\n", interactionsPerSecond); printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction); }
/**
 * Runs the particle simulation for `iterations` steps and prints the
 * throughput and mean time per step.
 *
 * @param iterations  Number of psystem->update() calls to time.
 * @param exec_path   Not used by this function.
 */
void runBenchmark(int iterations, char *exec_path)
{
    printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations);

    // Drain outstanding device work so it is not charged to the timed loop.
    cudaDeviceSynchronize();
    sdkStartTimer(&timer);

    int step = 0;

    while (step < iterations)
    {
        psystem->update(timestep);
        ++step;
    }

    // Wait for the last update to finish before stopping the clock.
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);

    // Timer value is in milliseconds; convert to seconds per iteration.
    const float totalMs = (float)sdkGetTimerValue(&timer);
    float fAvgSeconds = ((float)1.0e-3 * totalMs / (float)iterations);

    printf("particles, Throughput = %.4f KParticles/s, Time = %.5f s, Size = %u particles, NumDevsUsed = %u, Workgroup = %u\n",
           (1.0e-3 * numParticles)/fAvgSeconds, fAvgSeconds, numParticles, 1, 0);
}
bool runSingleTest(const char *ref_file, const char *exec_path) { // allocate memory for result int nTotalErrors = 0; unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); checkCudaErrors(cudaMalloc((void **) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&timer); gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("Kernel execution failed"); sdkStopTimer(&timer); unsigned char *h_result = (unsigned char *)malloc(width*height*4); checkCudaErrors(cudaMemcpy(h_result, d_result, width*height*4, cudaMemcpyDeviceToHost)); char dump_file[1024]; sprintf(dump_file, "lena_%02d.ppm", (int)sigma); sdkSavePPM4ub(dump_file, h_result, width, height); if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false)) { nTotalErrors++; } printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("%.2f Mpixels/sec\n", (width*height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); checkCudaErrors(cudaFree(d_result)); free(h_result); printf("Summary: %d errors!\n", nTotalErrors); printf(nTotalErrors == 0 ? "Test passed\n": "Test failed!\n"); return (nTotalErrors == 0); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple benchmark test for CUDA //////////////////////////////////////////////////////////////////////////////// int runBenchmark(int argc, char **argv) { printf("[runBenchmark]: [%s]\n", sSDKsample); loadImageData(argc, argv); initCuda(); unsigned int *dResult; size_t pitch; checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height)); sdkStartTimer(&kernel_timer); // warm-up bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); checkCudaErrors(cudaDeviceSynchronize()); // Start round-trip timer and process iCycles loops on the GPU iterations = 1; // standard 1-pass filtering const int iCycles = 150; double dProcessingTime = 0.0; printf("\nRunning BilateralFilterGPU for %d cycles...\n\n", iCycles); for (int i = 0; i < iCycles; i++) { dProcessingTime += bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); } // check if kernel execution generated an error and sync host getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&kernel_timer); // Get average computation time dProcessingTime /= (double)iCycles; // log testname, throughput, timing and config info to sample and master logs printf("bilateralFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u\n", (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1); printf("\n"); return 0; }
// display results using OpenGL void display() { sdkStartTimer(&timer); // execute filter, writing results to pbo unsigned int *d_result; checkCudaErrors(cudaGLMapBufferObject((void **)&d_result, pbo)); gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaGLUnmapBufferObject(pbo)); // load texture from pbo glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_2D, texid); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); // display results glClear(GL_COLOR_BUFFER_BIT); glEnable(GL_TEXTURE_2D); glDisable(GL_DEPTH_TEST); glBegin(GL_QUADS); glTexCoord2f(0, 1); glVertex2f(0, 0); glTexCoord2f(1, 1); glVertex2f(1, 0); glTexCoord2f(1, 0); glVertex2f(1, 1); glTexCoord2f(0, 0); glVertex2f(0, 1); glEnd(); glDisable(GL_TEXTURE_2D); glutSwapBuffers(); sdkStopTimer(&timer); computeFPS(); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple benchmark test for CUDA //////////////////////////////////////////////////////////////////////////////// int runBenchmark() { printf("[runBenchmark]: [%s]\n", sSDKsample); initCuda(true); unsigned int *d_result; checkCudaErrors(cudaMalloc((void **)&d_result, width*height*sizeof(unsigned int))); // warm-up boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads, kernel_timer); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&kernel_timer); // Start round-trip timer and process iCycles loops on the GPU iterations = 1; // standard 1-pass filtering const int iCycles = 150; double dProcessingTime = 0.0; printf("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles); for (int i = 0; i < iCycles; i++) { dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads, kernel_timer); } // check if kernel execution generated an error and sync host getLastCudaError("Error: boxFilterRGBA Kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&kernel_timer); // Get average computation time dProcessingTime /= (double)iCycles; // log testname, throughput, timing and config info to sample and master logs printf("boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1, nthreads); printf("\n"); return 0; }
// display results using OpenGL (called by GLUT) void display() { sdkStartTimer(&timer); render(); // display results glClear(GL_COLOR_BUFFER_BIT); // draw image from PBO glDisable(GL_DEPTH_TEST); glRasterPos2i(0, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glutSwapBuffers(); glutReportErrors(); sdkStopTimer(&timer); computeFPS(); }
void display() { //cutilCheckError(cutStartTimer(timer)); sdkStartTimer(&timer); if(IsFirstTime){ IsFirstTime = false; psystem->update(); if (renderer) renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); } if (!bPause){ psystem->update(); if (renderer) renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); } glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); for (int c = 0; c < 3; ++c) { camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; } glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); glGetFloatv(GL_MODELVIEW_MATRIX, modelView); glColor3f(0.0, 0.0, 0.0); //glutWireCube(2.0); if (renderer) renderer->display(); //cutilCheckError(cutStopTimer(timer)); sdkStopTimer(&timer); glutSwapBuffers(); glutReportErrors(); computeFPS(); }
/**
 * Per-device worker: selects the plan's device, runs the Monte Carlo
 * initialisation and computation under that device's timer, shuts the GPU
 * state down, prints a completion message and resets the device.
 *
 * @param plan  Work description; plan->device selects both the CUDA device
 *              and the hTimer slot used for timing.
 */
static CUT_THREADPROC solverThread(TOptionPlan *plan)
{
    // Init GPU
    checkCudaErrors(cudaSetDevice(plan->device));

    cudaDeviceProp props;
    checkCudaErrors(cudaGetDeviceProperties(&props, plan->device));

    // Timed region: setup plus main computation.
    sdkStartTimer(&hTimer[plan->device]);

    // Allocate intermediate memory for MC integrator and initialize
    // RNG states
    initMonteCarloGPU(plan);

    // Main computation
    MonteCarloGPU(plan);

    checkCudaErrors(cudaDeviceSynchronize());

    sdkStopTimer(&hTimer[plan->device]);

    // Shut down this GPU
    closeMonteCarloGPU(plan);

    cudaStreamSynchronize(0);

    printf("solverThread() finished - GPU Device %d: %s\n", plan->device, props.name);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    CUT_THREADEND;
}
void runBenchmark(int iterations, char *exec_path) { int file_count=0, iterationsPerFrame = (int)(1.0/(30.0*timestep)); printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations); //abb58: 1. what are you trying to sync??? cudaDeviceSynchronize(); sdkStartTimer(&timer); for (int i = 0; i < iterations; ++i) { psystem->update(timestep); if (i % iterationsPerFrame == 0) { psystem->writeParticles(fpout, 0, numParticles, file_count); //psystem->dumpParticles(0, numParticles, file_count); file_count++; } } cudaDeviceSynchronize(); sdkStopTimer(&timer); float fAvgSeconds = ((float)1.0e-3 * (float)sdkGetTimerValue(&timer)/(float)iterations); }
// display results using OpenGL (called by GLUT) void display() { sdkStartTimer(&timer); // map PBO to get CUDA device pointer uchar4 *d_output; checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, g_FilterMode, d_output); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); // Common display path { // display results glClear(GL_COLOR_BUFFER_BIT); #if USE_BUFFER_TEX // display using buffer texture glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex); glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, fprog); glEnable(GL_FRAGMENT_PROGRAM_ARB); glProgramLocalParameterI4iNV(GL_FRAGMENT_PROGRAM_ARB, 0, width, 0, 0, 0); #else // download image from PBO to OpenGL texture glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_TYPE, displayTex); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glTexSubImage2D(GL_TEXTURE_TYPE, 0, 0, 0, width, height, GL_BGRA, GL_UNSIGNED_BYTE, 0); glEnable(GL_TEXTURE_TYPE); #endif // draw textured quad glDisable(GL_DEPTH_TEST); glBegin(GL_QUADS); glTexCoord2f(0.0f , (GLfloat)height); glVertex2f(0.0f, 0.0f); glTexCoord2f((GLfloat)width, (GLfloat)height); glVertex2f(1.0f, 0.0f); glTexCoord2f((GLfloat)width, 0.0f); glVertex2f(1.0f, 1.0f); glTexCoord2f(0.0f , 0.0f); glVertex2f(0.0f, 1.0f); glEnd(); glDisable(GL_TEXTURE_TYPE); glDisable(GL_FRAGMENT_PROGRAM_ARB); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); if (drawCurves) { // draw spline curves glPushMatrix(); glScalef(0.25, 0.25, 1.0); glTranslatef(0.0, 2.0, 0.0); glColor3f(1.0, 0.0, 0.0); plotCurve(bspline_w3); glTranslatef(1.0, 0.0, 0.0); glColor3f(0.0, 1.0, 0.0); plotCurve(bspline_w2); glTranslatef(1.0, 0.0, 0.0); glColor3f(0.0, 0.0, 1.0); plotCurve(bspline_w1); glTranslatef(1.0, 0.0, 0.0); glColor3f(1.0, 0.0, 1.0); plotCurve(bspline_w0); glPopMatrix(); 
glColor3f(1.0, 1.0, 1.0); } } glutSwapBuffers(); glutReportErrors(); sdkStopTimer(&timer); computeFPS(); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; StopWatchInterface *hTimer = NULL; const uint N = 4 * 1048576; const uint DIR = 1; const uint numValues = 65536; printf("%s Starting...\n\n", argv[0]); int dev = findCudaDevice(argc, (const char **) argv); if (dev == -1) { return EXIT_FAILURE; } printf("Allocating and initializing host arrays...\n\n"); sdkCreateTimer(&hTimer); h_SrcKey = (uint *)malloc(N * sizeof(uint)); h_SrcVal = (uint *)malloc(N * sizeof(uint)); h_DstKey = (uint *)malloc(N * sizeof(uint)); h_DstVal = (uint *)malloc(N * sizeof(uint)); srand(2009); for (uint i = 0; i < N; i++) { h_SrcKey[i] = rand() % numValues; } fillValues(h_SrcVal, N); printf("Allocating and initializing CUDA arrays...\n\n"); checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint))); checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice)); printf("Initializing GPU merge sort...\n"); initMergeSort(); printf("Running GPU merge sort...\n"); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); mergeSort( d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR ); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); printf("Time: %f ms\n", sdkGetTimerValue(&hTimer)); printf("Reading back GPU 
merge sort results...\n"); checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf("Inspecting the results...\n"); uint keysFlag = validateSortedKeys( h_DstKey, h_SrcKey, 1, N, numValues, DIR ); uint valuesFlag = validateSortedValues( h_DstKey, h_DstVal, h_SrcKey, 1, N ); printf("Shutting down...\n"); closeMergeSort(); sdkDeleteTimer(&hTimer); checkCudaErrors(cudaFree(d_SrcVal)); checkCudaErrors(cudaFree(d_SrcKey)); checkCudaErrors(cudaFree(d_BufVal)); checkCudaErrors(cudaFree(d_BufKey)); checkCudaErrors(cudaFree(d_DstVal)); checkCudaErrors(cudaFree(d_DstKey)); free(h_DstVal); free(h_DstKey); free(h_SrcVal); free(h_SrcKey); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE); }
// Checks the state of a rmd::SeedMatrix right after setReferenceImage():
//  1. every pixel's depth, sigma^2, a and b hold the expected initial values;
//  2. the per-pixel NCC template statistics (sum and constant denominator)
//     downloaded from the seed matrix match a host-side recomputation.
TEST(RMDCuTests, seedMatrixInit)
{
  const boost::filesystem::path dataset_path("../test_data");
  const boost::filesystem::path sequence_file_path("../test_data/first_200_frames_traj_over_table_input_sequence.txt");

  rmd::PinholeCamera cam(481.2f, -480.0f, 319.5f, 239.5f);

  rmd::test::Dataset dataset(dataset_path.string(), sequence_file_path.string(), cam);
  if (!dataset.readDataSequence())
    FAIL() << "could not read dataset";

  const size_t ref_ind = 1;
  const size_t curr_ind = 20;

  // Reference frame: image (converted to float, scaled by 1/255), depth map
  // and camera pose.
  const auto ref_entry = dataset(ref_ind);
  cv::Mat ref_img;
  dataset.readImage(ref_img, ref_entry);
  cv::Mat ref_img_flt;
  ref_img.convertTo(ref_img_flt, CV_32F, 1.0f/255.0f);

  cv::Mat ref_depthmap;
  dataset.readDepthmap(ref_depthmap, ref_entry, ref_img.cols, ref_img.rows);

  rmd::SE3<float> T_world_ref;
  dataset.readCameraPose(T_world_ref, ref_entry);

  // Current frame: loaded here, but none of the checks below reference it.
  const auto curr_entry = dataset(curr_ind);
  cv::Mat curr_img;
  dataset.readImage(curr_img, curr_entry);
  cv::Mat curr_img_flt;
  curr_img.convertTo(curr_img_flt, CV_32F, 1.0f/255.0f);

  rmd::SE3<float> T_world_curr;
  dataset.readCameraPose(T_world_curr, curr_entry);

  const float min_scene_depth = 0.4f;
  const float max_scene_depth = 1.8f;

  rmd::SeedMatrix seeds(ref_img.cols, ref_img.rows, cam);

  // Time the setReferenceImage() call.
  // NOTE(review): if setReferenceImage() returns before its device work has
  // finished, this measures only the host-side portion -- confirm.
  StopWatchInterface * timer = NULL;
  sdkCreateTimer(&timer);
  sdkResetTimer(&timer);
  sdkStartTimer(&timer);

  seeds.setReferenceImage(
        reinterpret_cast<float*>(ref_img_flt.data),
        T_world_ref.inv(),
        min_scene_depth,
        max_scene_depth);

  sdkStopTimer(&timer);
  const double t = sdkGetAverageTimerValue(&timer) / 1000.0;
  // Release the stopwatch before any ASSERT_* can return from the test early.
  sdkDeleteTimer(&timer);
  printf("setReference image CUDA execution time: %f seconds.\n", t);

  // Download the initial per-pixel seed state.
  cv::Mat initial_depthmap(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadDepthmap(reinterpret_cast<float*>(initial_depthmap.data));

  cv::Mat initial_sigma_sq(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadSigmaSq(reinterpret_cast<float*>(initial_sigma_sq.data));

  cv::Mat initial_a(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadA(reinterpret_cast<float*>(initial_a.data));

  cv::Mat initial_b(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadB(reinterpret_cast<float*>(initial_b.data));

  // Expected initial values: depth is the middle of the scene depth range,
  // sigma^2 is (depth range)^2 / 36, and a and b are both 10.
  const float avg_scene_depth = (min_scene_depth+max_scene_depth)/2.0f;
  const float max_scene_sigma_sq = (max_scene_depth - min_scene_depth) * (max_scene_depth - min_scene_depth) / 36.0f;

  // Signed indices: cv::Mat dimensions and cv::Mat::at() indices are int, so
  // this avoids signed/unsigned comparisons in the loop bounds.
  for(int r=0; r<ref_img.rows; ++r)
  {
    for(int c=0; c<ref_img.cols; ++c)
    {
      ASSERT_FLOAT_EQ(avg_scene_depth, initial_depthmap.at<float>(r, c));
      ASSERT_FLOAT_EQ(max_scene_sigma_sq, initial_sigma_sq.at<float>(r, c));
      ASSERT_FLOAT_EQ(10.0f, initial_a.at<float>(r, c));
      ASSERT_FLOAT_EQ(10.0f, initial_b.at<float>(r, c));
    }
  }

  // Test initialization of NCC template statistics

  // CUDA computation
  cv::Mat cu_sum_templ(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadSumTempl(reinterpret_cast<float*>(cu_sum_templ.data));
  cv::Mat cu_const_templ_denom(ref_img.rows, ref_img.cols, CV_32FC1);
  seeds.downloadConstTemplDenom(reinterpret_cast<float*>(cu_const_templ_denom.data));

  // Host computation. Only the interior region visited by the loops below is
  // written; the same region is the only one compared afterwards.
  cv::Mat ocv_sum_templ(ref_img.rows, ref_img.cols, CV_32FC1);
  cv::Mat ocv_const_templ_denom(ref_img.rows, ref_img.cols, CV_32FC1);

  const int side = seeds.getPatchSide();
  const int half_side = side/2;
  for(int y=side; y<ref_img.rows-half_side; ++y)
  {
    for(int x=side; x<ref_img.cols-half_side; ++x)
    {
      // Accumulate in double over the side x side patch centred on (x, y).
      double sum_templ    = 0.0;
      double sum_templ_sq = 0.0;
      for(int patch_y=0; patch_y<side; ++patch_y)
      {
        for(int patch_x=0; patch_x<side; ++patch_x)
        {
          const double templ = (double) ref_img_flt.at<float>( y-half_side+patch_y, x-half_side+patch_x );
          sum_templ += templ;
          sum_templ_sq += templ*templ;
        }
      }
      ocv_sum_templ.at<float>(y, x) = (float) sum_templ;
      // n * sum(t^2) - (sum(t))^2 with n = side * side
      ocv_const_templ_denom.at<float>(y, x) = (float) ( ((double)(side*side))*sum_templ_sq - sum_templ*sum_templ );
    }
  }

  for(int r=side; r<ref_img.rows-half_side; ++r)
  {
    for(int c=side; c<ref_img.cols-half_side; ++c)
    {
      ASSERT_NEAR(ocv_sum_templ.at<float>(r, c), cu_sum_templ.at<float>(r, c), 0.00001f);
      ASSERT_NEAR(ocv_const_templ_denom.at<float>(r, c), cu_const_templ_denom.at<float>(r, c), 0.001f);
    }
  }
}
// Runs one rmd::SeedMatrix::update() with the current frame after the
// reference image has been set, then checks the downloaded convergence map:
// pixels closer than the patch side to any image edge must be BORDER, and
// every other pixel must hold one of the remaining known convergence states.
TEST(RMDCuTests, seedMatrixCheck)
{
  const boost::filesystem::path dataset_path("../test_data");
  const boost::filesystem::path sequence_file_path("../test_data/first_200_frames_traj_over_table_input_sequence.txt");

  rmd::PinholeCamera cam(481.2f, -480.0f, 319.5f, 239.5f);

  rmd::test::Dataset dataset(dataset_path.string(), sequence_file_path.string(), cam);
  if (!dataset.readDataSequence())
    FAIL() << "could not read dataset";

  const size_t ref_ind = 1;
  const size_t curr_ind = 20;

  // Reference frame: image (converted to float, scaled by 1/255), depth map
  // and camera pose. The depth map is loaded but not used by the checks below.
  const auto ref_entry = dataset(ref_ind);
  cv::Mat ref_img;
  dataset.readImage(ref_img, ref_entry);
  cv::Mat ref_img_flt;
  ref_img.convertTo(ref_img_flt, CV_32F, 1.0f/255.0f);

  cv::Mat ref_depthmap;
  dataset.readDepthmap(ref_depthmap, ref_entry, ref_img.cols, ref_img.rows);

  rmd::SE3<float> T_world_ref;
  dataset.readCameraPose(T_world_ref, ref_entry);

  // Current frame: image (same float conversion) and camera pose.
  const auto curr_entry = dataset(curr_ind);
  cv::Mat curr_img;
  dataset.readImage(curr_img, curr_entry);
  cv::Mat curr_img_flt;
  curr_img.convertTo(curr_img_flt, CV_32F, 1.0f/255.0f);

  rmd::SE3<float> T_world_curr;
  dataset.readCameraPose(T_world_curr, curr_entry);

  const float min_scene_depth = 0.4f;
  const float max_scene_depth = 1.8f;

  rmd::SeedMatrix seeds(ref_img.cols, ref_img.rows, cam);

  seeds.setReferenceImage(
        reinterpret_cast<float*>(ref_img_flt.data),
        T_world_ref.inv(),
        min_scene_depth,
        max_scene_depth);

  // Time the update() call.
  // NOTE(review): if update() returns before its device work has finished,
  // this measures only the host-side portion -- confirm.
  StopWatchInterface * timer = NULL;
  sdkCreateTimer(&timer);
  sdkResetTimer(&timer);
  sdkStartTimer(&timer);

  // The update must be fed the current image together with the current pose;
  // previously the reference image was paired with T_world_curr and the
  // loaded current image was never used.
  seeds.update(
        reinterpret_cast<float*>(curr_img_flt.data),
        T_world_curr.inv());

  sdkStopTimer(&timer);
  const double t = sdkGetAverageTimerValue(&timer) / 1000.0;
  // Release the stopwatch before any ASSERT_* can return from the test early.
  sdkDeleteTimer(&timer);
  printf("update CUDA execution time: %f seconds.\n", t);

  cv::Mat cu_convergence(ref_img.rows, ref_img.cols, CV_32SC1);
  seeds.downloadConvergence(reinterpret_cast<int*>(cu_convergence.data));

  const int side = seeds.getPatchSide();
  // Signed indices: cv::Mat dimensions are int, so the border test below
  // cannot wrap around through an unsigned conversion.
  for(int r=0; r<ref_img.rows; ++r)
  {
    for(int c=0; c<ref_img.cols; ++c)
    {
      const bool on_border =
          r>ref_img.rows-side-1 || r<side ||
          c>ref_img.cols-side-1 || c<side;
      if(on_border)
      {
        ASSERT_EQ(rmd::ConvergenceStates::BORDER, cu_convergence.at<int>(r, c)) << "(r, c) = (" << r << ", " << c <<")";
      }
      else
      {
        // Interior pixels may be in any non-border state after one update.
        const int result = cu_convergence.at<int>(r, c);
        const bool success = (result == rmd::ConvergenceStates::UPDATE     ||
                              result == rmd::ConvergenceStates::DIVERGED   ||
                              result == rmd::ConvergenceStates::CONVERGED  ||
                              result == rmd::ConvergenceStates::NOT_VISIBLE ||
                              result == rmd::ConvergenceStates::NO_MATCH    );
        ASSERT_EQ(true, success) << "(r, c) = (" << r << ", " << c <<")";
      }
    }
  }
}