void CuckooHash::Init(const size_t maxSize, const unsigned short hFuncNum)
{
    _maxSize  = maxSize;
    _hFuncNum = hFuncNum;

    CUDA_CALL( cudaMalloc((void**)&_data, _maxSize * sizeof(int2)) );
    CUDA_CALL( cudaMalloc((void**)&_hashConstants, _hFuncNum * sizeof(int)) );
    CUDA_CHECK_ERROR("Init failed!\n");
}
void CuckooHash::FreeMemory()
{
    CUDA_CALL( cudaFree(_data) );
    CUDA_CALL( cudaFree(_hashConstants) );
    CUDA_CHECK_ERROR("Free memory failed!\n");

    _maxSize       = 0;
    _currentSize   = 0;
    _data          = NULL;
    _hashConstants = NULL;
}
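// The CUDA_CALL / CUDA_CHECK_ERROR macros used above are project-local
// helpers rather than CUDA API; their definitions are not part of this
// snippet. A minimal sketch of the usual pattern, assuming the project
// aborts on any CUDA error:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CALL(call)                                                   \
    do {                                                                  \
        const cudaError_t err = (call);                                   \
        if (err != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",                  \
                    __FILE__, __LINE__, cudaGetErrorString(err));         \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

#define CUDA_CHECK_ERROR(msg)                                             \
    do {                                                                  \
        const cudaError_t err = cudaGetLastError();                       \
        if (err != cudaSuccess) {                                         \
            fprintf(stderr, "%s: %s\n", (msg), cudaGetErrorString(err));  \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)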
void MaxDiff( const GPU::Classes::VolumeGPU<T>& a,
              const GPU::Classes::VolumeGPU<T>& b,
              T& maxDiff,
              dim3& maxLoc ) const
{
    /*!
      Routine to compare two volumes, and find the voxel
      with the maximum difference.
    */
    const dim3 aDims = a.GetDims();
    const dim3 bDims = b.GetDims();

    // Verify dimensions
    if( aDims != bDims )
    {
        std::cerr << __FUNCTION__ << ": Dimension mismatch" << std::endl;
        exit( EXIT_FAILURE );
    }

    // Allocate 'difference' array
    thrust::device_ptr<T> d_absDiffs;
    const unsigned int nVoxels = aDims.x * aDims.y * aDims.z;
    d_absDiffs = thrust::device_new<T>( nVoxels );

    // Run the difference kernel
    dim3 grid, threads;
    threads.x = threads.y = kAbsDiffBlockSize;
    threads.z = 1;
    grid = a.CoverBlocks( kAbsDiffBlockSize );
    grid.z = 1;

    ComputeAbsDiffs<T><<<grid,threads>>>( a, b,
                                          thrust::raw_pointer_cast( d_absDiffs ) );
    CUDA_CHECK_ERROR( "ComputeAbsDiffs kernel failed!\n" );

    // Extract the maximum and its location
    thrust::device_ptr<T> d_maxLoc;
    d_maxLoc = thrust::max_element( d_absDiffs, d_absDiffs + nVoxels );

    maxDiff = *d_maxLoc;
    maxLoc  = a.Index3D( d_maxLoc - d_absDiffs );

    // Release 'difference' array
    thrust::device_delete( d_absDiffs );
}
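// The ComputeAbsDiffs kernel itself is not shown above. The following is a
// sketch only, under two assumptions not confirmed by the source: that
// VolumeGPU<T> is device-copyable with an operator()(x,y,z) accessor, and
// that the 1D index ordering matches VolumeGPU's Index3D (x fastest, then
// y, then z). The launch covers x and y; each thread walks one (x,y)
// column along z.
template <typename T>
__global__ void ComputeAbsDiffs( const GPU::Classes::VolumeGPU<T> a,
                                 const GPU::Classes::VolumeGPU<T> b,
                                 T *d_absDiffs )
{
    const unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
    const dim3 dims = a.GetDims();

    if( (ix < dims.x) && (iy < dims.y) )
    {
        for( unsigned int iz = 0; iz < dims.z; iz++ )
        {
            const T diff = a(ix,iy,iz) - b(ix,iy,iz);
            const unsigned int i1d = ix + dims.x * ( iy + dims.y * iz );
            d_absDiffs[i1d] = ( diff < 0 ) ? -diff : diff;
        }
    }
}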
double ErrL2Norm( const GPU::Classes::VolumeGPU<T>& cmp,
                  const GPU::Classes::VolumeGPU<T>& ref ) const
{
    /*!
      Routine to compute the error in the L2 norm between two volumes.
    */
    const dim3 cmpDims = cmp.GetDims();
    const dim3 refDims = ref.GetDims();

    // Verify dimensions
    if( refDims != cmpDims )
    {
        std::cerr << __FUNCTION__ << ": Dimension mismatch" << std::endl;
        exit( EXIT_FAILURE );
    }

    // Compute number of voxels
    const unsigned int nVoxels = refDims.x * refDims.y * refDims.z;

    // Allocate thrust arrays
    thrust::device_ptr<double> d_err;
    thrust::device_ptr<double> d_reference;
    d_err       = thrust::device_new<double>( nVoxels );
    d_reference = thrust::device_new<double>( nVoxels );

    // Run the kernel
    dim3 grid, threads;
    threads.x = threads.y = kErrL2BlockSize;
    threads.z = 1;
    grid = ref.CoverBlocks( kErrL2BlockSize );
    grid.z = 1;

    ErrL2Compute<T><<<grid,threads>>>( cmp, ref,
                                       thrust::raw_pointer_cast( d_err ),
                                       thrust::raw_pointer_cast( d_reference ) );
    CUDA_CHECK_ERROR( "ErrL2Compute kernel failed!\n" );

    // Extract sums
    double totErr = thrust::reduce( d_err, d_err + nVoxels );
    double totRef = thrust::reduce( d_reference, d_reference + nVoxels );

    // Release thrust arrays
    thrust::device_delete( d_err );
    thrust::device_delete( d_reference );

    return( sqrt( totErr / totRef ) );
}
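// Companion sketch for ErrL2Compute, under the same hypothetical VolumeGPU
// interface as the ComputeAbsDiffs sketch above. Each voxel contributes
// (cmp-ref)^2 to d_err and ref^2 to d_reference, so the reductions above
// yield the relative L2 error sqrt( sum (cmp-ref)^2 / sum ref^2 ).
template <typename T>
__global__ void ErrL2Compute( const GPU::Classes::VolumeGPU<T> cmp,
                              const GPU::Classes::VolumeGPU<T> ref,
                              double *d_err,
                              double *d_reference )
{
    const unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
    const dim3 dims = ref.GetDims();

    if( (ix < dims.x) && (iy < dims.y) )
    {
        for( unsigned int iz = 0; iz < dims.z; iz++ )
        {
            const double c = cmp(ix,iy,iz);
            const double r = ref(ix,iy,iz);
            const unsigned int i1d = ix + dims.x * ( iy + dims.y * iz );
            d_err[i1d]       = ( c - r ) * ( c - r );
            d_reference[i1d] = r * r;
        }
    }
}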
// NOTE: this function depends on an enclosing template declaration that is
// not part of the snippet; something like the following is assumed, where m
// is the number of values carried per element:
template <size_t m, typename real_t, typename BinaryOp, typename Setter>
void inclusive_scan( size_t _N, real_t *d_x, real_t *d_y,
                     BinaryOp binaryOp, Setter copy )
{
    real_t *d_block_x, *d_block_y;

    const size_t N = math::pow2ceil(_N);

    // Each thread processes two elements, so the hardware limit is
    // 1024 * 2 elements per block; currently fixed well below that.
    // TODO allow tweaking of this parameter
    const size_t elemPerBlock = 64;

    // each thread processes two elements
    dim3 block_dim = elemPerBlock / 2;

    // number of blocks
    dim3 grid_dim = N / elemPerBlock + (N % elemPerBlock == 0 ? 0 : 1);

    size_t nbytes_block = grid_dim.x * sizeof(real_t) * m;
    cudaMalloc((void**) &d_block_x, nbytes_block);
    cudaMalloc((void**) &d_block_y, nbytes_block);

    // elements are divided into blocks;
    // each thread scans two elements within its block
    prescan<m><<< grid_dim, block_dim, elemPerBlock * sizeof(real_t) * m >>>
        (elemPerBlock, d_x, d_y, binaryOp, copy);

    // one block; each thread aggregates the sum of one scan block from above
    aggregate_block_sum<m><<< 1, grid_dim >>>
        (elemPerBlock, d_y, d_block_x, copy);

    // one block; each thread processes two scan-block sums, hence half as
    // many threads as there were scan blocks in the previous run
    prescan<m><<< 1, grid_dim.x / 2, grid_dim.x * sizeof(real_t) * m >>>
        (grid_dim.x, d_block_x, d_block_y, binaryOp, copy);

    // each thread processes one element of the original data,
    // so twice as many blocks as before are needed
    add_block_cumsum<m><<< grid_dim.x * 2, block_dim >>>
        (N, d_block_y, d_y, binaryOp, copy);

    cudaFree(d_block_x);
    cudaFree(d_block_y);

    CUDA_CHECK_ERROR("inclusive_scan");
}
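// Hypothetical driver for inclusive_scan, assuming the template header
// sketched above with m == 1 and real_t == float. The functor shapes for
// BinaryOp and Setter are guesses (prescan's exact contract is not shown),
// and the buffers are padded to a power of two because the scan rounds _N up.
struct Plus {
    __device__ float operator()(float a, float b) const { return a + b; }
};
struct Assign {
    __device__ void operator()(float &dst, const float &src) const { dst = src; }
};

void example_inclusive_scan(const float *h_x, float *h_y, size_t n)
{
    size_t n_pad = 1;
    while (n_pad < n) n_pad <<= 1;   // next power of two >= n

    float *d_x, *d_y;
    cudaMalloc((void**)&d_x, n_pad * sizeof(float));
    cudaMalloc((void**)&d_y, n_pad * sizeof(float));
    cudaMemset(d_x, 0, n_pad * sizeof(float));   // neutral padding for addition
    cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);

    inclusive_scan<1>(n, d_x, d_y, Plus(), Assign());

    // Only the first n prefix sums are meaningful.
    cudaMemcpy(h_y, d_y, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    cudaFree(d_y);
}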
bool CompactingHashTable::Build(const unsigned n,
                                const unsigned *d_keys,
                                const unsigned *d_values)
{
    CUDA_CHECK_ERROR("Failed before attempting to build.");

    unsigned num_failures = 1;
    unsigned num_attempts = 0;
    unsigned max_iterations = ComputeMaxIterations(n, table_size_,
                                                   num_hash_functions_);
    unsigned total_table_size = table_size_ + kStashSize;

    while (num_failures && ++num_attempts < kMaxRestartAttempts) {
        if (num_hash_functions_ == 2)
            constants_2_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 3)
            constants_3_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 4)
            constants_4_.Generate(n, d_keys, table_size_);
        else
            constants_5_.Generate(n, d_keys, table_size_);

        // Initialize the cuckoo hash table.
        CUDAWrapper::ClearTable(total_table_size, kKeyEmpty,
                                d_scratch_cuckoo_keys_);

        num_failures = 0;
        cudaMemcpy(d_failures_, &num_failures, sizeof(unsigned),
                   cudaMemcpyHostToDevice);

        unsigned *d_stash_count = NULL;
        cudaMalloc((void**)&d_stash_count, sizeof(unsigned));
        cudaMemset(d_stash_count, 0, sizeof(unsigned));

        CUDAWrapper::CallHashBuildCompacting(n, num_hash_functions_,
                                             d_keys, table_size_,
                                             constants_2_, constants_3_,
                                             constants_4_, constants_5_,
                                             stash_constants_, max_iterations,
                                             d_scratch_cuckoo_keys_,
                                             d_stash_count, d_failures_);

        CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count,
                                  sizeof(unsigned), cudaMemcpyDeviceToHost));
        if (stash_count_) {
            char buffer[256];
            sprintf(buffer, "Stash count: %u", stash_count_);
            PrintMessage(buffer, true);
        }
        CUDA_SAFE_CALL(cudaFree(d_stash_count));

        CUDA_CHECK_ERROR("!!! Failed to cuckoo hash.\n");

        CUDA_SAFE_CALL(cudaMemcpy(&num_failures, d_failures_,
                                  sizeof(unsigned), cudaMemcpyDeviceToHost));

#ifdef COUNT_UNINSERTED
        if (num_failures > 0) {
            char buffer[256];
            sprintf(buffer, "Num failures: %u", num_failures);
            PrintMessage(buffer, true);
        }
#endif
    }

    if (num_attempts >= kMaxRestartAttempts) {
        PrintMessage("Completely failed to build.", true);
        return false;
    } else if (num_attempts > 1) {
        char buffer[256];
        sprintf(buffer, "Needed %u attempts", num_attempts);
        PrintMessage(buffer);
    }

    if (num_failures == 0) {
        CUDAWrapper::CallHashRemoveDuplicates(num_hash_functions_,
                                              table_size_, total_table_size,
                                              constants_2_, constants_3_,
                                              constants_4_, constants_5_,
                                              stash_constants_,
                                              d_scratch_cuckoo_keys_,
                                              d_scratch_counts_);

        // Do a prefix-sum over the values to assign each key a unique index.
        CUDPPConfiguration config;
        config.op        = CUDPP_ADD;
        config.datatype  = CUDPP_UINT;
        config.algorithm = CUDPP_SCAN;
        config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;

        CUDPPResult result = cudppPlan(theCudpp, &scanplan_, config,
                                       total_table_size, 1, 0);
        if (CUDPP_SUCCESS == result) {
            cudppScan(scanplan_, d_scratch_unique_ids_, d_scratch_counts_,
                      total_table_size);
        } else {
            PrintMessage("!!! Failed to create plan.", true);
        }
        CUDA_CHECK_ERROR("!!! Scan failed.\n");

        // Determine how many unique values there are.
        CUDA_SAFE_CALL(cudaMemcpy(&unique_keys_size_,
                                  d_scratch_unique_ids_ + total_table_size - 1,
                                  sizeof(unsigned), cudaMemcpyDeviceToHost));

        // Copy the unique indices back and compact down the neighbors.
        CUDA_SAFE_CALL(cudaMalloc((void**)&d_unique_keys_,
                                  sizeof(unsigned) * unique_keys_size_));
        CUDA_SAFE_CALL(cudaMemset(d_unique_keys_, 0xff,
                                  sizeof(unsigned) * unique_keys_size_));
        CUDAWrapper::CallHashCompactDown(total_table_size, d_contents_,
                                         d_unique_keys_,
                                         d_scratch_cuckoo_keys_,
                                         d_scratch_unique_ids_);
    }

    CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
    return true;
}
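// Hypothetical call sequence for building a compacting table from n
// device-resident keys. Initialize's signature is assumed from the CUDPP
// hash-table API (max input size, space-usage factor, number of hash
// functions); the values argument is unused by the compacting build, which
// assigns its own contiguous ids.
CompactingHashTable table;
table.Initialize(n, 1.25f, 4);
if (!table.Build(n, d_keys, NULL)) {
    // All kMaxRestartAttempts rebuilds failed; retry with a larger table
    // (lower load factor) or more hash functions.
}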
bool MultivalueHashTable::Build(const unsigned n,
                                const unsigned *d_keys,
                                const unsigned *d_vals)
{
    CUDA_CHECK_ERROR("Failed before build.");

    unsigned *d_sorted_keys = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n,
                              cudaMemcpyDeviceToDevice));

    unsigned *d_sorted_vals = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n,
                              cudaMemcpyDeviceToDevice));
    CUDA_CHECK_ERROR("Failed to allocate.");

    // Sort the key-value pairs by key; check plan creation before sorting.
    CUDPPConfiguration sort_config;
    sort_config.algorithm = CUDPP_SORT_RADIX;
    sort_config.datatype  = CUDPP_UINT;
    sort_config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;

    CUDPPHandle sort_plan;
    CUDPPResult sort_result = cudppPlan(theCudpp, &sort_plan, sort_config,
                                        n, 1, 0);
    if (sort_result != CUDPP_SUCCESS) {
        printf("Error in plan creation in MultivalueHashTable::build\n");
        cudppDestroyPlan(sort_plan);
        return false;
    }
    cudppRadixSort(sort_plan, d_sorted_keys, (void*)d_sorted_vals, n);
    cudppDestroyPlan(sort_plan);
    CUDA_CHECK_ERROR("Failed to sort");

    // Find the first key-value pair for each key.
    CUDAWrapper::CallCheckIfUnique(d_sorted_keys, n, d_scratch_is_unique_);

    // Assign a unique index from 0 to k-1 for each of the keys.
    cudppScan(scanplan_, d_scratch_offsets_, d_scratch_is_unique_, n);
    CUDA_CHECK_ERROR("Failed to scan");

    // Check how many unique keys were found.
    unsigned num_unique_keys;
    CUDA_SAFE_CALL(cudaMemcpy(&num_unique_keys, d_scratch_offsets_ + n - 1,
                              sizeof(unsigned), cudaMemcpyDeviceToHost));
    CUDA_CHECK_ERROR("Failed to get # unique keys");

    // Keep a list of the unique keys, and store info on each key's data
    // (location in the values array, how many there are).
    unsigned *d_compacted_keys   = NULL;
    uint2    *d_index_counts_tmp = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_compacted_keys,
                              sizeof(unsigned) * num_unique_keys));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_index_counts_tmp,
                              sizeof(uint2) * num_unique_keys));
    CUDAWrapper::CallCompactKeys(d_sorted_keys, d_scratch_is_unique_,
                                 d_scratch_offsets_, n,
                                 d_index_counts_tmp, d_compacted_keys);

    // Determine the counts.
    CUDAWrapper::CallCountValues(d_index_counts_tmp, n, num_unique_keys);

    // Reinitialize the cuckoo hash table using the information we discovered.
    HashTable::Initialize(num_unique_keys, target_space_usage_,
                          num_hash_functions_);

    d_index_counts_     = d_index_counts_tmp;
    d_unique_keys_      = d_compacted_keys;
    d_sorted_values_    = d_sorted_vals;
    sorted_values_size_ = n;

    // Build the cuckoo hash table with each key assigned a unique index.
    // Re-uses the sorted key memory as an array of values from 0 to k-1.
    CUDAWrapper::CallPrepareIndices(num_unique_keys, d_sorted_keys);
    bool success = HashTable::Build(num_unique_keys, d_unique_keys_,
                                    d_sorted_keys);
    CUDA_SAFE_CALL(cudaFree(d_sorted_keys));
    return success;
}
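// Hypothetical end-to-end usage of the multivalue table (all names here are
// illustrative). After Build, a query for a key is assumed to return a
// uint2 (first index, count) pair into the sorted values array, following
// the CUDPP MultivalueHashTable::Retrieve shape; the exact retrieval
// signature is an assumption, not shown above.
MultivalueHashTable mv_table;
mv_table.Initialize(max_input_size, 1.25f, 4);
if (mv_table.Build(n, d_keys, d_vals)) {
    uint2 *d_location_counts = NULL;
    cudaMalloc((void**)&d_location_counts, sizeof(uint2) * n_queries);
    mv_table.Retrieve(n_queries, d_query_keys, d_location_counts);
    // Each result (x, y) says: the y values for the queried key start at
    // offset x in the sorted values array.
    cudaFree(d_location_counts);
}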