void CMarchingCubes::ComputeIsosurface(ElemType* _pFval, ElemType _isoValue, RenderData* _pRender)
{
    int threads = 128;
    dim3 grid(m_NumVoxels / threads, 1, 1);

    // get around maximum grid size of 65535 in each dimension
    if (grid.x > 65535)
    {
        grid.y = grid.x / 32768;
        grid.x = 32768;
    }

    uint totalVerts = 0;
    int size = m_GridSize.x * m_GridSize.y * m_GridSize.z * sizeof(float);

    //////////////////////////////////////////////////////////////////////////
    int len = m_GridSize.x * m_GridSize.y * m_GridSize.z;
    float *pFvalTemp = new float[len];
    for (int i = 0; i < len; i++)
    {
        pFvalTemp[i] = _pFval[i];
    }
    //////////////////////////////////////////////////////////////////////////

    float* pdVolumeFval;    // vertex function-value texture (n Surface)
    cutilSafeCall(cudaMalloc((void**) &pdVolumeFval, size));
    cutilSafeCall(cudaMemcpy(pdVolumeFval, pFvalTemp, size, cudaMemcpyHostToDevice));
    bindVolumeValTexture(pdVolumeFval);
    delete [] pFvalTemp;

    // calculate number of vertices needed per voxel
    launch_classifyVoxel(grid, threads,
                         m_pdVoxelVerts, m_pdVoxelOccupied, pdVolumeFval,
                         m_GridSize, m_NumVoxels, _isoValue);

#if DEBUG_BUFFERS
    printf("voxelVerts:\n");
    dumpBuffer(m_pdVoxelVerts, m_NumVoxels);
#endif

#if SKIP_EMPTY_VOXELS
    // scan voxel occupied array
    cudppScan(m_Scanplan, m_pdVoxelOccupiedScan, m_pdVoxelOccupied, m_NumVoxels);

#if DEBUG_BUFFERS
    printf("voxelOccupiedScan:\n");
    dumpBuffer(m_pdVoxelOccupiedScan, m_NumVoxels);
#endif

    // read back values to calculate total number of non-empty voxels
    // since we are using an exclusive scan, the total is the last value of
    // the scan result plus the last value in the input array
    {
        uint lastElement, lastScanElement;
        cutilSafeCall(cudaMemcpy((void *) &lastElement,
                                 (void *) (m_pdVoxelOccupied + m_NumVoxels - 1),
                                 sizeof(uint), cudaMemcpyDeviceToHost));
        cutilSafeCall(cudaMemcpy((void *) &lastScanElement,
                                 (void *) (m_pdVoxelOccupiedScan + m_NumVoxels - 1),
                                 sizeof(uint), cudaMemcpyDeviceToHost));
        m_ActiveVoxels = lastElement + lastScanElement;
    }

    if (0 == m_ActiveVoxels)
    {
        // return if there are no full voxels
        totalVerts = 0;
        cutilSafeCall(cudaFree(pdVolumeFval));   // free the device volume copy before returning early
        return;
    }

    // compact voxel index array
    launch_compactVoxels(grid, threads,
                         m_pdCompactedVoxelArray, m_pdVoxelOccupied, m_pdVoxelOccupiedScan,
                         m_NumVoxels);
    cutilCheckMsg("compactVoxels failed");
#endif // SKIP_EMPTY_VOXELS

    // scan voxel vertex count array
    cudppScan(m_Scanplan, m_pdVoxelVertsScan, m_pdVoxelVerts, m_NumVoxels);

#if DEBUG_BUFFERS
    printf("voxelVertsScan:\n");
    dumpBuffer(m_pdVoxelVertsScan, m_NumVoxels);
#endif

    // readback total number of vertices
    {
        uint lastElement, lastScanElement;
        cutilSafeCall(cudaMemcpy((void *) &lastElement,
                                 (void *) (m_pdVoxelVerts + m_NumVoxels - 1),
                                 sizeof(uint), cudaMemcpyDeviceToHost));
        cutilSafeCall(cudaMemcpy((void *) &lastScanElement,
                                 (void *) (m_pdVoxelVertsScan + m_NumVoxels - 1),
                                 sizeof(uint), cudaMemcpyDeviceToHost));
        totalVerts = lastElement + lastScanElement;
    }

    // create VBOs
    GLuint posVbo, normalVbo;
    createVBO(&posVbo, totalVerts * sizeof(float) * 4);
    cutilSafeCall(cudaGLRegisterBufferObject(posVbo));

    createVBO(&normalVbo, totalVerts * sizeof(float) * 4);
    cutilSafeCall(cudaGLRegisterBufferObject(normalVbo));

    // generate triangles, writing to vertex buffers
    float4 *d_pos = 0, *d_normal = 0;
    cutilSafeCall(cudaGLMapBufferObject((void**)&d_pos, posVbo));
    cutilSafeCall(cudaGLMapBufferObject((void**)&d_normal, normalVbo));

#if SKIP_EMPTY_VOXELS
    dim3 grid2((int) ceil(m_ActiveVoxels / (float) NTHREADS), 1, 1);
#else
    dim3 grid2((int) ceil(m_NumVoxels / (float) NTHREADS), 1, 1);
#endif

    while (grid2.x > 65535)
    {
        grid2.x /= 2;
        grid2.y *= 2;
    }

    launch_generateTriangles(grid2, NTHREADS,
                             d_pos, d_normal,
                             m_pdCompactedVoxelArray, m_pdVoxelVertsScan,
                             m_pdVolume, pdVolumeFval,
                             m_GridSize, _isoValue, m_ActiveVoxels, m_MaxVerts);

    cutilSafeCall(cudaGLUnmapBufferObject(normalVbo));
    cutilSafeCall(cudaGLUnmapBufferObject(posVbo));

    _pRender->posVbo = posVbo;
    _pRender->normalVbo = normalVbo;
    _pRender->totalVerts = totalVerts;

    cutilSafeCall(cudaFree(pdVolumeFval));
}
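// Illustrative sketch (not part of the original sources): the readbacks above rely on the
// exclusive-scan identity total = scan[n-1] + input[n-1]. A minimal CPU reference, assuming
// unsigned counts, to show why the code adds the last input value to the last scan value:
#include <vector>
#include <cassert>

static unsigned totalFromExclusiveScan(const std::vector<unsigned>& counts)
{
    assert(!counts.empty());
    // exclusive prefix sum: scan[i] = counts[0] + ... + counts[i-1], with scan[0] = 0
    std::vector<unsigned> scan(counts.size(), 0);
    for (size_t i = 1; i < counts.size(); ++i)
        scan[i] = scan[i - 1] + counts[i - 1];

    // the grand total is the last scan value plus the last input value,
    // which is exactly what the two cudaMemcpy readbacks reconstruct on the host
    return scan.back() + counts.back();
}

// usage: totalFromExclusiveScan({3, 0, 6, 0, 12}) == 21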
bool CompactingHashTable::Build(const unsigned  n,
                                const unsigned *d_keys,
                                const unsigned *d_values)
{
    CUDA_CHECK_ERROR("Failed before attempting to build.");

    unsigned num_failures = 1;
    unsigned num_attempts = 0;
    unsigned max_iterations = ComputeMaxIterations(n, table_size_, num_hash_functions_);
    unsigned total_table_size = table_size_ + kStashSize;

    while (num_failures && ++num_attempts < kMaxRestartAttempts)
    {
        if (num_hash_functions_ == 2)
            constants_2_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 3)
            constants_3_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 4)
            constants_4_.Generate(n, d_keys, table_size_);
        else
            constants_5_.Generate(n, d_keys, table_size_);

        // Initialize the cuckoo hash table.
        CUDAWrapper::ClearTable(total_table_size, kKeyEmpty, d_scratch_cuckoo_keys_);

        num_failures = 0;
        cudaMemcpy(d_failures_, &num_failures, sizeof(unsigned), cudaMemcpyHostToDevice);

        unsigned *d_stash_count = NULL;
        cudaMalloc((void**)&d_stash_count, sizeof(unsigned));
        cudaMemset(d_stash_count, 0, sizeof(unsigned));

        CUDAWrapper::CallHashBuildCompacting(n, num_hash_functions_, d_keys, table_size_,
                                             constants_2_, constants_3_, constants_4_, constants_5_,
                                             stash_constants_, max_iterations,
                                             d_scratch_cuckoo_keys_, d_stash_count, d_failures_);

        CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, sizeof(unsigned),
                                  cudaMemcpyDeviceToHost));
        if (stash_count_)
        {
            char buffer[256];
            sprintf(buffer, "Stash count: %u", stash_count_);
            PrintMessage(buffer, true);
        }
        CUDA_SAFE_CALL(cudaFree(d_stash_count));

        CUDA_CHECK_ERROR("!!! Failed to cuckoo hash.\n");

        CUDA_SAFE_CALL(cudaMemcpy(&num_failures, d_failures_, sizeof(unsigned),
                                  cudaMemcpyDeviceToHost));

#ifdef COUNT_UNINSERTED
        if (num_failures > 0)
        {
            char buffer[256];
            sprintf(buffer, "Num failures: %u", num_failures);
            PrintMessage(buffer, true);
        }
#endif
    }

    if (num_attempts >= kMaxRestartAttempts)
    {
        PrintMessage("Completely failed to build.", true);
        return false;
    }
    else if (num_attempts > 1)
    {
        char buffer[256];
        sprintf(buffer, "Needed %u attempts", num_attempts);
        PrintMessage(buffer);
    }

    if (num_failures == 0)
    {
        CUDAWrapper::CallHashRemoveDuplicates(num_hash_functions_, table_size_, total_table_size,
                                              constants_2_, constants_3_, constants_4_, constants_5_,
                                              stash_constants_,
                                              d_scratch_cuckoo_keys_, d_scratch_counts_);

        // Do a prefix-sum over the values to assign each key a unique index.
        CUDPPConfiguration config;
        config.op        = CUDPP_ADD;
        config.datatype  = CUDPP_UINT;
        config.algorithm = CUDPP_SCAN;
        config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;

        CUDPPResult result = cudppPlan(theCudpp, &scanplan_, config, total_table_size, 1, 0);
        if (CUDPP_SUCCESS == result)
        {
            cudppScan(scanplan_, d_scratch_unique_ids_, d_scratch_counts_, total_table_size);
        }
        else
        {
            PrintMessage("!!! Failed to create plan.", true);
        }
        CUDA_CHECK_ERROR("!!! Scan failed.\n");

        // Determine how many unique values there are.
        CUDA_SAFE_CALL(cudaMemcpy(&unique_keys_size_, d_scratch_unique_ids_ + total_table_size - 1,
                                  sizeof(unsigned), cudaMemcpyDeviceToHost));

        // Copy the unique indices back and compact down the neighbors.
        CUDA_SAFE_CALL(cudaMalloc((void**)&d_unique_keys_, sizeof(unsigned) * unique_keys_size_));
        CUDA_SAFE_CALL(cudaMemset(d_unique_keys_, 0xff, sizeof(unsigned) * unique_keys_size_));
        CUDAWrapper::CallHashCompactDown(total_table_size, d_contents_, d_unique_keys_,
                                         d_scratch_cuckoo_keys_, d_scratch_unique_ids_);
    }

    CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
    return true;
}
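// Illustrative sketch (not part of the original sources): the inclusive scan over the
// per-slot counts gives every occupied slot a 1-based rank, so slot i can be compacted to
// destination rank[i] - 1 and the last scan value equals the number of unique keys. A
// minimal CPU analogue of the scan-plus-compact-down step, assuming 0/1 flags in `counts`:
#include <vector>

static std::vector<unsigned> compactDown(const std::vector<unsigned>& keys,
                                         const std::vector<unsigned>& counts)
{
    // inclusive prefix sum of the 0/1 occupancy flags
    std::vector<unsigned> ids(counts.size(), 0);
    unsigned running = 0;
    for (size_t i = 0; i < counts.size(); ++i)
    {
        running += counts[i];
        ids[i] = running;                  // rank of this slot if it is occupied
    }

    std::vector<unsigned> unique(running); // running is the number of unique keys
    for (size_t i = 0; i < keys.size(); ++i)
        if (counts[i])
            unique[ids[i] - 1] = keys[i];  // compact occupied slots to the front
    return unique;
}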
void IntIntSorter::executeOnGPUAsync(void * const keys,
                                     void * const vals,
                                     const int numKeys,
                                     int & numUniqueKeys,
                                     int ** keyOffsets,
                                     int ** valOffsets,
                                     int ** numVals)
{
    if (numKeys == 0)
    {
        numUniqueKeys = 0;
        *keyOffsets = *valOffsets = NULL;
        *numVals = NULL;
        return;
    }
    if (numKeys > 32 * 1048576)
    {
        executeOnCPUAsync(keys, vals, numKeys, numUniqueKeys, keyOffsets, valOffsets, numVals);
        return;
    }

    int commRank;
    MPI_Comm_rank(MPI_COMM_WORLD, &commRank);

    CUDPPConfiguration cudppConfig;
    CUDPPHandle planHandle;

    void * gpuInputKeys   = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuInputVals   = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuUniqueFlags = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuValOffsets  = cudacpp::Runtime::malloc(sizeof(int) * numKeys);

    cudacpp::Runtime::memcpyHtoD(gpuInputKeys, keys, sizeof(int) * numKeys);
    cudacpp::Runtime::memcpyHtoD(gpuInputVals, vals, sizeof(int) * numKeys);

    /* what we need to get out of here:
         1 - sorted keys and values
         2 - num unique keys
         3 - number of values for each key
         4 - value offsets
         5 - compacted keys

       to get these:
         simply sort, then
         A = find unique values
         B = reverse exclusive scan of "A"
         C = if A[i] == 1 then C[B[0] - B[i]] = i
         D = D[0] = C[0] + 1
             D[N] = #keys - C[#keys - 1]
             D[i] = C[i + 1] - C[i]
         E = forward exclusive scan of D
         F = keys[E[i]]

         1 = result of sort (only copy the values)
         2 = B[0]
         3 = D
         4 = E
         5 = F
    */

    // 1 - sort the key/value pairs
    cudppConfig.algorithm = CUDPP_SORT_RADIX;
    cudppConfig.op        = CUDPP_ADD; // ignored for sorting
    cudppConfig.datatype  = CUDPP_UINT;
    cudppConfig.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppSort(planHandle, gpuInputKeys, gpuInputVals, sizeof(int) * 8, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    // cudacpp::Runtime::memcpyDtoH(keys, gpuInputKeys, sizeof(int) * numKeys);
    cudacpp::Runtime::memcpyDtoH(vals, gpuInputVals, sizeof(int) * numKeys);

    // 2 - A = gpuUniqueFlags
    gpmrIntIntSorterMarkUnique(gpuInputKeys, gpuUniqueFlags, numKeys);

    // 2 - B = gpuValOffsets
    cudppConfig.algorithm = CUDPP_SCAN;
    cudppConfig.op        = CUDPP_ADD;
    cudppConfig.datatype  = CUDPP_INT;
    cudppConfig.options   = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_BACKWARD;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppScan(planHandle, gpuValOffsets, gpuUniqueFlags, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    cudacpp::Runtime::memcpyDtoH(&numUniqueKeys, gpuValOffsets, sizeof(int));
    ++numUniqueKeys;

    // 2 - C = gpuInputVals and
    // 3 - D = gpuValOffsets
    cudacpp::Runtime::sync();
    gpmrIntIntSorterFindOffsets(gpuInputKeys, gpuUniqueFlags, gpuValOffsets, gpuInputVals,
                                gpuValOffsets, numKeys, numUniqueKeys);
    *numVals = reinterpret_cast<int *>(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int)));
    cudacpp::Runtime::sync();
    cudacpp::Runtime::memcpyDtoH(*numVals, gpuValOffsets, sizeof(int) * numUniqueKeys);
    cudacpp::Runtime::sync();

    // 4 - E = gpuUniqueFlags
    cudppConfig.algorithm = CUDPP_SCAN;
    cudppConfig.op        = CUDPP_ADD;
    cudppConfig.datatype  = CUDPP_INT;
    cudppConfig.options   = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_FORWARD;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppScan(planHandle, gpuUniqueFlags, gpuValOffsets, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    *valOffsets = reinterpret_cast<int *>(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int)));
    cudacpp::Runtime::memcpyDtoH(*valOffsets, gpuUniqueFlags, sizeof(int) * numUniqueKeys);

    // 5 - F = gpuInputVals (compacted keys)
    gpmrIntIntSorterSetCompactedKeys(gpuInputKeys, gpuUniqueFlags, gpuInputVals, numUniqueKeys);
    cudacpp::Runtime::memcpyDtoH(keys, gpuInputVals, sizeof(int) * numUniqueKeys);

    cudacpp::Runtime::free(gpuInputKeys);
    cudacpp::Runtime::free(gpuInputVals);
    cudacpp::Runtime::free(gpuUniqueFlags);
    cudacpp::Runtime::free(gpuValOffsets);
}
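// Illustrative sketch (not part of the original sources): a host-side reference of the
// A-F recipe from the comment block above, assuming int keys already sorted, to make the
// role of the scans and the offset/compaction kernels concrete:
#include <vector>

struct SortedRuns
{
    std::vector<int> uniqueKeys;  // 5 = F: compacted keys
    std::vector<int> numVals;     // 3 = D: number of values per key
    std::vector<int> valOffsets;  // 4 = E: offset of each key's first value
};

static SortedRuns findRuns(const std::vector<int>& sortedKeys)
{
    SortedRuns out;
    const int n = static_cast<int>(sortedKeys.size());
    for (int i = 0; i < n; ++i)
    {
        if (i == 0 || sortedKeys[i] != sortedKeys[i - 1])   // A: first element of a run
        {
            out.uniqueKeys.push_back(sortedKeys[i]);        // F
            out.valOffsets.push_back(i);                    // E
            out.numVals.push_back(0);                       // D, counted below
        }
        ++out.numVals.back();
    }
    return out;   // out.uniqueKeys.size() is the number of unique keys (output 2)
}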
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void computeIsosurface()
{
    int threads = 128;
    dim3 grid(numVoxels / threads, 1, 1);

    // get around maximum grid size of 65535 in each dimension
    if (grid.x > 65535)
    {
        grid.y = grid.x / 32768;
        grid.x = 32768;
    }

    // calculate number of vertices needed per voxel
    launch_classifyVoxel(grid, threads,
                         d_voxelVerts, d_voxelOccupied, d_volume,
                         gridSize, gridSizeShift, gridSizeMask,
                         numVoxels, voxelSize, isoValue);

#if DEBUG_BUFFERS
    printf("voxelVerts:\n");
    dumpBuffer(d_voxelVerts, numVoxels, sizeof(uint));
#endif

#if SKIP_EMPTY_VOXELS
    // scan voxel occupied array
    cudppScan(scanplan, d_voxelOccupiedScan, d_voxelOccupied, numVoxels);

#if DEBUG_BUFFERS
    printf("voxelOccupiedScan:\n");
    dumpBuffer(d_voxelOccupiedScan, numVoxels, sizeof(uint));
#endif

    // read back values to calculate total number of non-empty voxels
    // since we are using an exclusive scan, the total is the last value of
    // the scan result plus the last value in the input array
    {
        uint lastElement, lastScanElement;
        cutilSafeCall(cudaMemcpy((void *) &lastElement,
                                 (void *) (d_voxelOccupied + numVoxels - 1),
                                 sizeof(uint), cudaMemcpyDeviceToHost));
        cutilSafeCall(cudaMemcpy((void *) &lastScanElement,
                                 (void *) (d_voxelOccupiedScan + numVoxels - 1),
                                 sizeof(uint), cudaMemcpyDeviceToHost));
        activeVoxels = lastElement + lastScanElement;
    }

    if (activeVoxels == 0)
    {
        // return if there are no full voxels
        totalVerts = 0;
        return;
    }

    // compact voxel index array
    launch_compactVoxels(grid, threads,
                         d_compVoxelArray, d_voxelOccupied, d_voxelOccupiedScan, numVoxels);
    cutilCheckMsg("compactVoxels failed");
#endif // SKIP_EMPTY_VOXELS

    // scan voxel vertex count array
    cudppScan(scanplan, d_voxelVertsScan, d_voxelVerts, numVoxels);

#if DEBUG_BUFFERS
    printf("voxelVertsScan:\n");
    dumpBuffer(d_voxelVertsScan, numVoxels, sizeof(uint));
#endif

    // readback total number of vertices
    {
        uint lastElement, lastScanElement;
        cutilSafeCall(cudaMemcpy((void *) &lastElement,
                                 (void *) (d_voxelVerts + numVoxels - 1),
                                 sizeof(uint), cudaMemcpyDeviceToHost));
        cutilSafeCall(cudaMemcpy((void *) &lastScanElement,
                                 (void *) (d_voxelVertsScan + numVoxels - 1),
                                 sizeof(uint), cudaMemcpyDeviceToHost));
        totalVerts = lastElement + lastScanElement;
    }

    // generate triangles, writing to vertex buffers
    if (!g_bQAReadback)
    {
        size_t num_bytes;
        // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_pos, posVbo));
        cutilSafeCall(cudaGraphicsMapResources(1, &cuda_posvbo_resource, 0));
        cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_pos, &num_bytes,
                                                           cuda_posvbo_resource));

        // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_normal, normalVbo));
        cutilSafeCall(cudaGraphicsMapResources(1, &cuda_normalvbo_resource, 0));
        cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_normal, &num_bytes,
                                                           cuda_normalvbo_resource));
    }

#if SKIP_EMPTY_VOXELS
    dim3 grid2((int) ceil(activeVoxels / (float) NTHREADS), 1, 1);
#else
    dim3 grid2((int) ceil(numVoxels / (float) NTHREADS), 1, 1);
#endif

    while (grid2.x > 65535)
    {
        grid2.x /= 2;
        grid2.y *= 2;
    }

#if SAMPLE_VOLUME
    launch_generateTriangles2(grid2, NTHREADS, d_pos, d_normal,
                              d_compVoxelArray, d_voxelVertsScan, d_volume,
                              gridSize, gridSizeShift, gridSizeMask,
                              voxelSize, isoValue, activeVoxels, maxVerts);
#else
    launch_generateTriangles(grid2, NTHREADS, d_pos, d_normal,
                             d_compVoxelArray, d_voxelVertsScan,
                             gridSize, gridSizeShift, gridSizeMask,
                             voxelSize, isoValue, activeVoxels, maxVerts);
#endif

    if (!g_bQAReadback)
    {
        // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(normalVbo));
        cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_normalvbo_resource, 0));
        // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(posVbo));
        cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_posvbo_resource, 0));
    }
}
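// Illustrative sketch (not part of the original sources): computeIsosurface() assumes a
// CUDPP handle `scanplan` created during initialization. A minimal sketch of how an
// exclusive forward ADD scan plan over numVoxels uints could be created with the CUDPP 1.x
// API used above; the helper name and the exact configuration are assumptions:
#include <cudpp.h>

static CUDPPHandle createExclusiveScanPlan(size_t numVoxels)
{
    CUDPPConfiguration config;
    config.algorithm = CUDPP_SCAN;
    config.datatype  = CUDPP_UINT;
    config.op        = CUDPP_ADD;
    config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;

    CUDPPHandle plan = 0;
    cudppPlan(&plan, config, numVoxels, 1, 0);  // released later with cudppDestroyPlan(plan)
    return plan;
}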
bool MultivalueHashTable::Build(const unsigned  n,
                                const unsigned *d_keys,
                                const unsigned *d_vals)
{
    CUDA_CHECK_ERROR("Failed before build.");

    unsigned *d_sorted_keys = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n,
                              cudaMemcpyDeviceToDevice));

    unsigned *d_sorted_vals = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n,
                              cudaMemcpyDeviceToDevice));
    CUDA_CHECK_ERROR("Failed to allocate.");

    CUDPPConfiguration sort_config;
    sort_config.algorithm = CUDPP_SORT_RADIX;
    sort_config.datatype  = CUDPP_UINT;
    sort_config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;

    CUDPPHandle sort_plan;
    CUDPPResult sort_result = cudppPlan(theCudpp, &sort_plan, sort_config, n, 1, 0);
    if (sort_result != CUDPP_SUCCESS)
    {
        // check plan creation before using the plan
        printf("Error in plan creation in MultivalueHashTable::build\n");
        cudppDestroyPlan(sort_plan);
        return false;
    }
    cudppRadixSort(sort_plan, d_sorted_keys, (void*)d_sorted_vals, n);
    cudppDestroyPlan(sort_plan);
    CUDA_CHECK_ERROR("Failed to sort");

    // Find the first key-value pair for each key.
    CUDAWrapper::CallCheckIfUnique(d_sorted_keys, n, d_scratch_is_unique_);

    // Assign a unique index from 0 to k-1 for each of the keys.
    cudppScan(scanplan_, d_scratch_offsets_, d_scratch_is_unique_, n);
    CUDA_CHECK_ERROR("Failed to scan");

    // Check how many unique keys were found.
    unsigned num_unique_keys;
    CUDA_SAFE_CALL(cudaMemcpy(&num_unique_keys, d_scratch_offsets_ + n - 1,
                              sizeof(unsigned), cudaMemcpyDeviceToHost));
    CUDA_CHECK_ERROR("Failed to get # unique keys");

    // Keep a list of the unique keys, and store info on each key's data
    // (location in the values array, how many there are).
    unsigned *d_compacted_keys   = NULL;
    uint2    *d_index_counts_tmp = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_compacted_keys,   sizeof(unsigned) * num_unique_keys));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_index_counts_tmp, sizeof(uint2)    * num_unique_keys));
    CUDAWrapper::CallCompactKeys(d_sorted_keys, d_scratch_is_unique_, d_scratch_offsets_, n,
                                 d_index_counts_tmp, d_compacted_keys);

    // Determine the counts.
    CUDAWrapper::CallCountValues(d_index_counts_tmp, n, num_unique_keys);

    // Reinitialize the cuckoo hash table using the information we discovered.
    HashTable::Initialize(num_unique_keys, target_space_usage_, num_hash_functions_);
    d_index_counts_     = d_index_counts_tmp;
    d_unique_keys_      = d_compacted_keys;
    d_sorted_values_    = d_sorted_vals;
    sorted_values_size_ = n;

    // Build the cuckoo hash table with each key assigned a unique index.
    // Re-uses the sorted key memory as an array of values from 0 to k-1.
    CUDAWrapper::CallPrepareIndices(num_unique_keys, d_sorted_keys);
    bool success = HashTable::Build(num_unique_keys, d_unique_keys_, d_sorted_keys);
    CUDA_SAFE_CALL(cudaFree(d_sorted_keys));
    return success;
}
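// Illustrative sketch (not part of the original sources): judging from the comments above,
// CallCheckIfUnique flags the first occurrence of each key in the sorted array so that the
// following scan can rank the unique keys. A minimal CUDA kernel with that behaviour; the
// kernel name and launch configuration are assumptions:
__global__ void checkIfUniqueKernel(const unsigned *d_sorted_keys,
                                    unsigned        n,
                                    unsigned       *d_is_unique)
{
    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    // flag the first key-value pair of every run of equal keys
    d_is_unique[i] = (i == 0) || (d_sorted_keys[i] != d_sorted_keys[i - 1]);
}
// launched e.g. as: checkIfUniqueKernel<<<(n + 255) / 256, 256>>>(d_sorted_keys, n, d_scratch_is_unique_);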