void runImageFilters(TColor *d_dst) { switch(g_Kernel){ case 0: cuda_Copy(d_dst, imageW, imageH); break; case 1: if(!g_Diag) cuda_KNN(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC); else cuda_KNNdiag(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC); break; case 2: if(!g_Diag) cuda_NLM(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); else cuda_NLMdiag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); break; case 3: if(!g_Diag) cuda_NLM2(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); else cuda_NLM2diag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); break; } cutilCheckMsg("Filtering kernel execution failed.\n"); }
void benchmark(int iterations) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); cutilSafeCall( cudaMalloc( (void**) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStartTimer( timer)); // execute the kernel for(int i=0; i<iterations; i++) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStopTimer( timer)); // check if kernel execution generated an error cutilCheckMsg("Kernel execution failed"); printf("Processing time: %f (ms)\n", cutGetTimerValue( timer)); printf("%.2f Mpixels/sec\n", (width*height*iterations / (cutGetTimerValue( timer) / 1000.0f)) / 1e6); cutilSafeCall(cudaFree(d_result)); }
/** * Selects a GPU for the CUDA execution. * @param id id of the selected GPU. */ void selectGPU(int id) { int deviceId; if (id == -1) { deviceId = cutGetMaxGflopsDeviceId(); } else { deviceId = id; } cutilSafeCall(cudaSetDevice( deviceId )); cutilCheckMsg("cudaSetDevice failed"); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple benchmark test for CUDA //////////////////////////////////////////////////////////////////////////////// void runBenchmark( int argc, char **argv ) { int devID = 0; shrLog("[runBenchmark]: [%s]\n", sSDKsample); devID = cutilChooseCudaDevice(argc, argv); loadImageData(argc, argv); initCuda(); g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setExecPath(argv[0]); unsigned int *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) ); // warm-up boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads); cutilSafeCall( cutilDeviceSynchronize() ); // Start round-trip timer and process iCycles loops on the GPU iterations = 1; // standard 1-pass filtering const int iCycles = 150; double dProcessingTime = 0.0; shrLog("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles); shrDeltaT(2); for (int i = 0; i < iCycles; i++) { dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads); } // check if kernel execution generated an error and sync host cutilCheckMsg("Error: boxFilterRGBA Kernel execution FAILED"); cutilSafeCall(cutilDeviceSynchronize()); // Get average computation time dProcessingTime /= (double)iCycles; // log testname, throughput, timing and config info to sample and master logs shrLogEx(LOGBOTH | MASTER, 0, "boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1, nthreads); shrLog("\n"); }
void runAutoTest(int argc, char **argv) { int devID = 0; shrLog("[runAutoTest]: [%s] (automated testing w/ readback)\n", sSDKsample); devID = cutilChooseCudaDevice(argc, argv); loadImageData(argc, argv); initCuda(); g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setExecPath(argv[0]); unsigned int *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) ); for(int i = 0; i < 4; i++) { shrLog("[AutoTest]: %s (radius=%d)", sSDKsample, filter_radius ); bilateralFilterRGBA(d_result, width, height, euclidean_delta, filter_radius, iterations, nthreads); // check if kernel execution generated an error cutilCheckMsg("Error: bilateralFilterRGBA Kernel execution FAILED"); cutilSafeCall( cutilDeviceSynchronize() ); cudaMemcpy(g_CheckRender->imageData(), d_result, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost); g_CheckRender->savePPM(sOriginal[i], false, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[i], sReference[i], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } gaussian_delta += 1.0f; euclidean_delta *= 1.25f; updateGaussian(gaussian_delta, filter_radius); } cutilSafeCall( cudaFree( d_result ) ); delete g_CheckRender; }
void runAutoTest(int argc, char **argv) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); cutilSafeCall( cudaMalloc( (void**) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStartTimer( timer)); while (sigma <= 22) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); // check if kernel execution generated an error cutilCheckMsg("Kernel execution failed"); cudaMemcpy(g_CheckRender->imageData(), d_result, width*height*4, cudaMemcpyDeviceToHost); g_CheckRender->savePPM(sOriginal[g_Index], false, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.50f)) { g_TotalErrors++; } g_Index++; sigma += 4; } cutilCheckError( cutStopTimer( timer)); printf("Processing time: %f (ms)\n", cutGetTimerValue( timer)); printf("%.2f Mpixels/sec\n", (width*height*g_Index / (cutGetTimerValue( timer) / 1000.0f)) / 1e6); printf("Summary: %d errors!\n", g_TotalErrors); printf("Test %s!\n", (g_TotalErrors==0) ? "PASSED" : "FAILED"); cutilSafeCall(cudaFree(d_result)); }
// render image using CUDA void render() { copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // map PBO to get CUDA device pointer uint *d_output; // map PBO to get CUDA device pointer cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); // clear image cutilSafeCall(cudaMemset(d_output, 0, width*height*4)); // call CUDA kernel, writing results to PBO render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); cutilCheckMsg("kernel failed"); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //start logs shrSetLogFileName ("volumeRender.txt"); shrLog("%s Starting...\n\n", argv[0]); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; fpsLimit = frameCheckNumber; } if (g_bQAReadback) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } /* int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { shrLog("This sample needs a card capable of OpenGL and display.\n"); shrLog("Please choose a different device with the -device=x argument.\n"); cutilExit(argc, argv); } */ } // parse arguments char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) { volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) { volumeSize.width = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) { volumeSize.height = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) { volumeSize.depth = n; } // load volume data char* path = shrFindFilePath(volumeFilename, argv[0]); if (path == 0) { shrLog("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); free(h_volume); cutilCheckError( cutCreateTimer( &timer)); shrLog("Press '=' and '-' to change density\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); uint *d_output; cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint))); cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // Start timer 0 and process n loops on the GPU int nIter = 10; for (int i = -1; i < nIter; i++) { if( i == 0 ) { cudaThreadSynchronize(); cutStartTimer(timer); } render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaThreadSynchronize(); cutStopTimer(timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0); shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); cutilCheckMsg("Error: render_kernel() execution FAILED"); cutilSafeCall( cudaThreadSynchronize() ); cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { shrLog("\nFAILED\n\n"); } else { shrLog("\nPASSED\n\n"); } cudaFree(d_output); freeCudaBuffers(); if (g_CheckRender) { delete g_CheckRender; g_CheckRender = NULL; } } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } atexit(cleanup); glutMainLoop(); } cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
CUDASolverBundling::CUDASolverBundling(unsigned int maxNumberOfImages, unsigned int maxNumResiduals) : m_maxNumberOfImages(maxNumberOfImages) , THREADS_PER_BLOCK(512) // keep consistent with the GPU { m_timer = NULL; //m_timer = new CUDATimer(); //if (GlobalBundlingState::get().s_enableDetailedTimings) m_timer = new CUDATimer(); m_bRecordConvergence = GlobalBundlingState::get().s_recordSolverConvergence; //TODO PARAMS const unsigned int submapSize = GlobalBundlingState::get().s_submapSize; m_verifyOptDistThresh = 0.02f;//GlobalAppState::get().s_verifyOptDistThresh; m_verifyOptPercentThresh = 0.05f;//GlobalAppState::get().s_verifyOptPercentThresh; const unsigned int numberOfVariables = maxNumberOfImages; m_maxCorrPerImage = math::clamp(maxNumResiduals / maxNumberOfImages, 1000u, 4000u); // State MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_deltaRot, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_deltaTrans, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_rRot, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_rTrans, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_zRot, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_zTrans, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_pRot, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_pTrans, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_Jp, sizeof(float3)*maxNumResiduals)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_Ap_XRot, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_Ap_XTrans, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_scanAlpha, sizeof(float) * 2)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_rDotzOld, sizeof(float) *numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_precondionerRot, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_precondionerTrans, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_sumResidual, sizeof(float))); unsigned int n = (maxNumResiduals + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverExtra.d_maxResidual, sizeof(float) * n)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverExtra.d_maxResidualIndex, sizeof(int) * n)); m_solverExtra.h_maxResidual = new float[n]; m_solverExtra.h_maxResidualIndex = new int[n]; MLIB_CUDA_SAFE_CALL(cudaMalloc(&d_variablesToCorrespondences, sizeof(int)*m_maxNumberOfImages*m_maxCorrPerImage)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&d_numEntriesPerRow, sizeof(int)*m_maxNumberOfImages)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_countHighResidual, sizeof(int))); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_denseJtJ, sizeof(float) * 36 * numberOfVariables * numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_denseJtr, sizeof(float) * 6 * numberOfVariables)); m_maxNumDenseImPairs = m_maxNumberOfImages * (m_maxNumberOfImages - 1) / 2; MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_denseCorrCounts, sizeof(float) * m_maxNumDenseImPairs)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_denseOverlappingImages, sizeof(uint2) * m_maxNumDenseImPairs)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_numDenseOverlappingImages, sizeof(int))); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_corrCount, sizeof(int))); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_corrCountColor, sizeof(int))); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_sumResidualColor, sizeof(float))); #ifdef USE_LIE_SPACE MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_xTransforms, sizeof(float4x4)*m_maxNumberOfImages)); MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_xTransformInverses, sizeof(float4x4)*m_maxNumberOfImages)); #else m_solverState.d_xTransforms = NULL; m_solverState.d_xTransformInverses = NULL; #endif #ifdef NEW_GUIDED_REMOVE cudaMalloc(&d_transforms, sizeof(float4x4)*m_maxNumberOfImages); #endif //solve params m_maxResidualThresh = GlobalBundlingState::get().s_optMaxResThresh; m_defaultParams.denseDistThresh = GlobalBundlingState::get().s_denseDistThresh; m_defaultParams.denseNormalThresh = GlobalBundlingState::get().s_denseNormalThresh; m_defaultParams.denseColorThresh = GlobalBundlingState::get().s_denseColorThresh; m_defaultParams.denseColorGradientMin = GlobalBundlingState::get().s_denseColorGradientMin; m_defaultParams.denseDepthMin = GlobalBundlingState::get().s_denseDepthMin; m_defaultParams.denseDepthMax = GlobalBundlingState::get().s_denseDepthMax; m_defaultParams.denseOverlapCheckSubsampleFactor = GlobalBundlingState::get().s_denseOverlapCheckSubsampleFactor; //!!!DEBUGGING MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_deltaRot, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_deltaTrans, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_rRot, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_rTrans, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_zRot, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_zTrans, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_pRot, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_pTrans, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_Jp, -1, sizeof(float3)*maxNumResiduals)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_Ap_XRot, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_Ap_XTrans, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_scanAlpha, -1, sizeof(float) * 2)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_rDotzOld, -1, sizeof(float) *numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_precondionerRot, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_precondionerTrans, -1, sizeof(float3)*numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_sumResidual, -1, sizeof(float))); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverExtra.d_maxResidual, -1, sizeof(float) * n)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverExtra.d_maxResidualIndex, -1, sizeof(int) * n)); MLIB_CUDA_SAFE_CALL(cudaMemset(d_variablesToCorrespondences, -1, sizeof(int)*m_maxNumberOfImages*m_maxCorrPerImage)); MLIB_CUDA_SAFE_CALL(cudaMemset(d_numEntriesPerRow, -1, sizeof(int)*m_maxNumberOfImages)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_countHighResidual, -1, sizeof(int))); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_denseJtJ, -1, sizeof(float) * 36 * numberOfVariables * numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_denseJtr, -1, sizeof(float) * 6 * numberOfVariables)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_denseCorrCounts, -1, sizeof(float) * m_maxNumDenseImPairs)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_denseOverlappingImages, -1, sizeof(uint2) * m_maxNumDenseImPairs)); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_numDenseOverlappingImages, -1, sizeof(int))); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_corrCount, -1, sizeof(int))); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_corrCountColor, -1, sizeof(int))); MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_sumResidualColor, -1, sizeof(float))); cutilSafeCall(cudaDeviceSynchronize()); cutilCheckMsg(__FUNCTION__); //!!!DEBUGGING }
void CMarchingCubes::ComputeIsosurface(ElemType* _pFval, ElemType _isoValue, RenderData* _pRender) { int threads = 128; dim3 grid(m_NumVoxels / threads, 1, 1); // get around maximum grid size of 65535 in each dimension if (grid.x > 65535) { grid.y = grid.x / 32768; grid.x = 32768; } uint totalVerts = 0; int size = m_GridSize.x * m_GridSize.y * m_GridSize.z * sizeof(float); ////////////////////////////////////////////////////////////////////////// int len = m_GridSize.x * m_GridSize.y * m_GridSize.z; float *pFvalTemp = new float[len]; for (int i = 0; i < len; i++) { pFvalTemp[i] = _pFval[i]; } ////////////////////////////////////////////////////////////////////////// float* pdVolumeFval; // ¶¥µãº¯ÊýÖµÎÆÀí(n¡¡Surface) cutilSafeCall(cudaMalloc((void**) &pdVolumeFval, size)); cutilSafeCall(cudaMemcpy(pdVolumeFval, pFvalTemp, size, cudaMemcpyHostToDevice) ); bindVolumeValTexture(pdVolumeFval); delete []pFvalTemp; // calculate number of vertices need per voxel launch_classifyVoxel(grid, threads, m_pdVoxelVerts, m_pdVoxelOccupied, pdVolumeFval, m_GridSize, m_NumVoxels, _isoValue); #if DEBUG_BUFFERS printf("voxelVerts:\n"); dumpBuffer(m_pdVoxelVerts, m_NumVoxels); #endif #if SKIP_EMPTY_VOXELS // scan voxel occupied array cudppScan(m_Scanplan, m_pdVoxelOccupiedScan, m_pdVoxelOccupied, m_NumVoxels); #if DEBUG_BUFFERS printf("voxelOccupiedScan:\n"); dumpBuffer(m_pdVoxelOccupiedScan, m_NumVoxels); #endif // read back values to calculate total number of non-empty voxels // since we are using an exclusive scan, the total is the last value of // the scan result plus the last value in the input array { uint lastElement, lastScanElement; cutilSafeCall(cudaMemcpy((void *) &lastElement, (void *) (m_pdVoxelOccupied + m_NumVoxels - 1), sizeof(uint), cudaMemcpyDeviceToHost)); cutilSafeCall(cudaMemcpy((void *) &lastScanElement, (void *) (m_pdVoxelOccupiedScan + m_NumVoxels - 1), sizeof(uint), cudaMemcpyDeviceToHost)); m_ActiveVoxels = lastElement + lastScanElement; } if (0 == m_ActiveVoxels) { // return if there are no full voxels totalVerts = 0; return; } // compact voxel index array launch_compactVoxels(grid, threads, m_pdCompactedVoxelArray, m_pdVoxelOccupied, m_pdVoxelOccupiedScan, m_NumVoxels); cutilCheckMsg("compactVoxels failed"); #endif // SKIP_EMPTY_VOXELS // scan voxel vertex count array cudppScan(m_Scanplan, m_pdVoxelVertsScan, m_pdVoxelVerts, m_NumVoxels); #if DEBUG_BUFFERS printf("voxelVertsScan:\n"); dumpBuffer(m_pdVoxelVertsScan, m_NumVoxels); #endif // readback total number of vertices { uint lastElement, lastScanElement; cutilSafeCall(cudaMemcpy((void *) &lastElement, (void *) (m_pdVoxelVerts + m_NumVoxels - 1), sizeof(uint), cudaMemcpyDeviceToHost)); cutilSafeCall(cudaMemcpy((void *) &lastScanElement, (void *) (m_pdVoxelVertsScan + m_NumVoxels - 1), sizeof(uint), cudaMemcpyDeviceToHost)); totalVerts = lastElement + lastScanElement; } // create VBOs GLuint posVbo, normalVbo; createVBO(&posVbo, totalVerts * sizeof(float) * 4); cutilSafeCall(cudaGLRegisterBufferObject(posVbo)); createVBO(&normalVbo, totalVerts * sizeof(float) * 4); cutilSafeCall(cudaGLRegisterBufferObject(normalVbo)); // generate triangles, writing to vertex buffers float4 *d_pos = 0, *d_normal = 0; cutilSafeCall(cudaGLMapBufferObject((void**)&d_pos, posVbo)); cutilSafeCall(cudaGLMapBufferObject((void**)&d_normal, normalVbo)); #if SKIP_EMPTY_VOXELS dim3 grid2((int) ceil(m_ActiveVoxels / (float) NTHREADS), 1, 1); #else dim3 grid2((int) ceil(m_NumVoxels / (float) NTHREADS), 1, 1); #endif while(grid2.x > 65535) { grid2.x/=2; grid2.y*=2; } launch_generateTriangles(grid2, NTHREADS, d_pos, d_normal, m_pdCompactedVoxelArray, m_pdVoxelVertsScan, m_pdVolume, pdVolumeFval, m_GridSize, _isoValue, m_ActiveVoxels, m_MaxVerts); cutilSafeCall(cudaGLUnmapBufferObject(normalVbo)); cutilSafeCall(cudaGLUnmapBufferObject(posVbo)); _pRender->posVbo = posVbo; _pRender->normalVbo = normalVbo; _pRender->totalVerts = totalVerts; cutilSafeCall(cudaFree(pdVolumeFval)); }
T benchmarkReduceMax(int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, int whichKernel, int testIterations, bool cpuFinalReduction, int cpuFinalThreshold, unsigned int timer, T* h_odata, T* d_idata, T* d_odata) { T gpu_result = 0; bool needReadBack = true; for (int i = 0; i < testIterations; ++i) { gpu_result = 0; cutilDeviceSynchronize(); cutilCheckError( cutStartTimer( timer)); // execute the kernel maxreduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata); // check if kernel execution generated an error cutilCheckMsg("Kernel execution failed"); if (cpuFinalReduction) { // sum partial sums from each block on CPU // copy result from device to host cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost) ); for(int i=0; i<numBlocks; i++) { gpu_result += h_odata[i]; } needReadBack = false; } else { // sum partial block sums on GPU int s=numBlocks; int kernel = whichKernel; while(s > cpuFinalThreshold) { int threads = 0, blocks = 0; getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); maxreduce<T>(s, threads, blocks, kernel, d_odata, d_odata); if (kernel < 3) s = (s + threads - 1) / threads; else s = (s + (threads*2-1)) / (threads*2); } if (s > 1) { // copy result from device to host cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost) ); for(int i=0; i < s; i++) { gpu_result += h_odata[i]; } needReadBack = false; } } cutilDeviceSynchronize(); cutilCheckError( cutStopTimer(timer) ); } if (needReadBack) { // copy final sum from device to host cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) ); } return gpu_result; }
//////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// void computeIsosurface() { int threads = 128; dim3 grid(numVoxels / threads, 1, 1); // get around maximum grid size of 65535 in each dimension if (grid.x > 65535) { grid.y = grid.x / 32768; grid.x = 32768; } // calculate number of vertices need per voxel launch_classifyVoxel(grid, threads, d_voxelVerts, d_voxelOccupied, d_volume, gridSize, gridSizeShift, gridSizeMask, numVoxels, voxelSize, isoValue); #if DEBUG_BUFFERS printf("voxelVerts:\n"); dumpBuffer(d_voxelVerts, numVoxels, sizeof(uint)); #endif #if SKIP_EMPTY_VOXELS // scan voxel occupied array cudppScan(scanplan, d_voxelOccupiedScan, d_voxelOccupied, numVoxels); #if DEBUG_BUFFERS printf("voxelOccupiedScan:\n"); dumpBuffer(d_voxelOccupiedScan, numVoxels, sizeof(uint)); #endif // read back values to calculate total number of non-empty voxels // since we are using an exclusive scan, the total is the last value of // the scan result plus the last value in the input array { uint lastElement, lastScanElement; cutilSafeCall(cudaMemcpy((void *) &lastElement, (void *) (d_voxelOccupied + numVoxels-1), sizeof(uint), cudaMemcpyDeviceToHost)); cutilSafeCall(cudaMemcpy((void *) &lastScanElement, (void *) (d_voxelOccupiedScan + numVoxels-1), sizeof(uint), cudaMemcpyDeviceToHost)); activeVoxels = lastElement + lastScanElement; } if (activeVoxels==0) { // return if there are no full voxels totalVerts = 0; return; } // compact voxel index array launch_compactVoxels(grid, threads, d_compVoxelArray, d_voxelOccupied, d_voxelOccupiedScan, numVoxels); cutilCheckMsg("compactVoxels failed"); #endif // SKIP_EMPTY_VOXELS // scan voxel vertex count array cudppScan(scanplan, d_voxelVertsScan, d_voxelVerts, numVoxels); #if DEBUG_BUFFERS printf("voxelVertsScan:\n"); dumpBuffer(d_voxelVertsScan, numVoxels, sizeof(uint)); #endif // readback total number of vertices { uint lastElement, lastScanElement; cutilSafeCall(cudaMemcpy((void *) &lastElement, (void *) (d_voxelVerts + numVoxels-1), sizeof(uint), cudaMemcpyDeviceToHost)); cutilSafeCall(cudaMemcpy((void *) &lastScanElement, (void *) (d_voxelVertsScan + numVoxels-1), sizeof(uint), cudaMemcpyDeviceToHost)); totalVerts = lastElement + lastScanElement; } // generate triangles, writing to vertex buffers if (!g_bQAReadback) { size_t num_bytes; // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_pos, posVbo)); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_posvbo_resource, 0)); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_pos, &num_bytes, cuda_posvbo_resource)); // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_normal, normalVbo)); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_normalvbo_resource, 0)); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_normal, &num_bytes, cuda_normalvbo_resource)); } #if SKIP_EMPTY_VOXELS dim3 grid2((int) ceil(activeVoxels / (float) NTHREADS), 1, 1); #else dim3 grid2((int) ceil(numVoxels / (float) NTHREADS), 1, 1); #endif while(grid2.x > 65535) { grid2.x/=2; grid2.y*=2; } #if SAMPLE_VOLUME launch_generateTriangles2(grid2, NTHREADS, d_pos, d_normal, d_compVoxelArray, d_voxelVertsScan, d_volume, gridSize, gridSizeShift, gridSizeMask, voxelSize, isoValue, activeVoxels, maxVerts); #else launch_generateTriangles(grid2, NTHREADS, d_pos, d_normal, d_compVoxelArray, d_voxelVertsScan, gridSize, gridSizeShift, gridSizeMask, voxelSize, isoValue, activeVoxels, maxVerts); #endif if (!g_bQAReadback) { // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(normalVbo)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_normalvbo_resource, 0)); // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(posVbo)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_posvbo_resource, 0)); } }
void displayFunc(void){ cutStartTimer(hTimer); TColor *d_dst = NULL; size_t num_bytes; if(frameCounter++ == 0) cutResetTimer(hTimer); // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); cutilCheckMsg("cudaGraphicsMapResources failed"); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &num_bytes, cuda_pbo_resource)); cutilCheckMsg("cudaGraphicsResourceGetMappedPointer failed"); cutilSafeCall( CUDA_Bind2TextureArray() ); runImageFilters(d_dst); cutilSafeCall( CUDA_UnbindTexture() ); // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(gl_PBO)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); if (g_bFBODisplay) { g_FrameBufferObject->bindRenderPath(); } // Common display code path { glClear(GL_COLOR_BUFFER_BIT); glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0) ); glBegin(GL_TRIANGLES); glTexCoord2f(0, 0); glVertex2f(-1, -1); glTexCoord2f(2, 0); glVertex2f(+3, -1); glTexCoord2f(0, 2); glVertex2f(-1, +3); glEnd(); glFinish(); } if (g_bFBODisplay) { g_FrameBufferObject->unbindRenderPath(); glBindTexture(GL_TEXTURE_2D, 0); } if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { printf("> (Frame %d) readback BackBuffer\n", frameCount); if (g_bFBODisplay) { g_CheckRender->readback( imageW, imageH, g_FrameBufferObject->getFbo() ); } else { g_CheckRender->readback( imageW, imageH ); } g_CheckRender->savePPM ( sOriginal[g_Kernel], true, NULL ); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Verify = false; } if(frameCounter == frameN){ frameCounter = 0; if(g_FPS){ printf("FPS: %3.1f\n", frameN / (cutGetTimerValue(hTimer) * 0.001) ); g_FPS = false; } } glutSwapBuffers(); cutStopTimer(hTimer); computeFPS(); }