// Sets up the multivalue hash table: initializes the underlying cuckoo
// table, allocates the 2N-entry scratch buffers used during Build(), and
// creates the CUDPP inclusive-scan plan used to assign unique key indices.
// Returns false if the base initialization, a scratch allocation, or the
// plan creation fails.
bool MultivalueHashTable::Initialize(const unsigned max_table_entries,
                                     const float space_usage,
                                     const unsigned num_hash_functions) {
    bool success = HashTable::Initialize(max_table_entries, space_usage,
                                         num_hash_functions);

    // Remembered so Build() can re-initialize the table once the number
    // of unique keys is known.
    target_space_usage_ = space_usage;

    // + 2N 32-bit entries
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_scratch_offsets_,
                              sizeof(unsigned) * max_table_entries));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_scratch_is_unique_,
                              sizeof(unsigned) * max_table_entries));
    success &= (d_scratch_offsets_ != NULL);
    success &= (d_scratch_is_unique_ != NULL);

    // Allocate memory for the scan.
    // + Unknown memory usage
    CUDPPConfiguration config;
    config.op        = CUDPP_ADD;
    config.datatype  = CUDPP_UINT;
    config.algorithm = CUDPP_SCAN;
    config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;
    CUDPPResult result = cudppPlan(theCudpp, &scanplan_, config,
                                   max_table_entries, 1, 0);
    if (CUDPP_SUCCESS != result) {
        // Fixed: original diagnostic was missing its trailing newline.
        // NOTE(review): the scratch buffers allocated above are not freed
        // here — presumably released by Release()/the destructor; confirm.
        fprintf(stderr, "Failed to create plan.\n");
        return false;
    }
    return success;
}
// Initializes the particle renderer: creates the CUDPP radix-sort plan
// used for key-value depth sorting, compiles the sprite/blur GLSL
// programs, loads the optional sprite texture, and allocates the FBOs.
void VHParticlesRender::initParticlesRender() {
    // Create the CUDPP radix sort (float keys, key-value pairs).
    CUDPPConfiguration sortConfig;
    sortConfig.algorithm = CUDPP_SORT_RADIX;
    sortConfig.datatype  = CUDPP_FLOAT;
    sortConfig.op        = CUDPP_ADD;   // ignored by radix sort
    sortConfig.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;
    // Fixed: the plan-creation result was previously ignored; a failed
    // plan would only surface later as a crash inside cudppSort.
    CUDPPResult sortResult = cudppPlan(&m_sortHandle, sortConfig,
                                       pSys->nParts, 1, 0);
    if (sortResult != CUDPP_SUCCESS) {
        fprintf(stderr, "initParticlesRender: failed to create CUDPP sort plan.\n");
    }

    //-----shaders
    /*char currentPath[_MAX_PATH];
    getcwd(currentPath, _MAX_PATH);
    printf("Path : %s", currentPath);*/

    simpleSpriteProg = new GLSLProgram("sprite.vs", "sprite.gs", "simpleSprite.ps");
    shadowedSpriteProg = new GLSLProgram("sprite.vs", "sprite.gs", "shadowedSprite.ps");
    shadowMapSpriteProg = new GLSLProgram("sprite.vs", "sprite.gs", "ShadowMapSprite.ps");
    displayTexProg = new GLSLProgram("passThru.vs", "texture2D.ps");
    blurProg = new GLSLProgram("passThru.vs", "blur.ps");

    if (spritePath)
        loadSprite(spritePath);

    initFbos(width, height, true);
}
// Initializes marching cubes for a _width x _height x _depth volume:
// converts the input volume to float, uploads it to the device, binds it
// as a texture, allocates the lookup tables and per-voxel work buffers,
// and creates the CUDPP exclusive-scan plan used for stream compaction.
void CMarchingCubes::InitMC(int _width, int _height, int _depth, ElemType* _pVolume)
{
    // Data Array A[:, :, 1], A[:, :, 2], ... , A[:, : , n]
    //m_GridSize = make_uint3(_depth, _width, _height);
    m_GridSize = make_uint3(_width, _height, _depth);
    m_NumVoxels = m_GridSize.x * m_GridSize.y * m_GridSize.z;
    // NOTE(review): 30 verts per (x,y) column is a heuristic upper bound;
    // the original author flagged it as needing adjustment.
    m_MaxVerts = m_GridSize.x * m_GridSize.y * 30; // Num of MaxVerts need change

#ifdef _DEBUG
    printf("grids: %d * %d * %d = %d voxels\n", m_GridSize.x, m_GridSize.y, m_GridSize.z, m_NumVoxels);
#endif // _DEBUG

    // Byte size of ONE float field of the grid; the device buffer below
    // holds 3 such fields (size * 3).
    int size = m_GridSize.x * m_GridSize.y * m_GridSize.z * sizeof(float);

    //////////////////////////////////////////////////////////////////////////
    // Convert the input volume (ElemType) to float on the host before upload.
    int len = m_GridSize.x * m_GridSize.y * m_GridSize.z * 3;
    float *pVolTemp = new float[len];
    for (int i = 0; i < len; i++)
        pVolTemp[i] = _pVolume[i];
    //////////////////////////////////////////////////////////////////////////

    cutilSafeCall(cudaMalloc((void**) &m_pdVolume, size * 3));
    cutilSafeCall(cudaMemcpy(m_pdVolume, pVolTemp, size * 3, cudaMemcpyHostToDevice));
    bindVolumeTexture(m_pdVolume);  // map the coordinates to the texture directly
    delete []pVolTemp;

    // allocate textures
    allocateTextures(&m_pdEdgeTable, &m_pdTriTable, &m_pdNumVertsTable);

    // allocate device memory
    unsigned int memSize = sizeof(uint) * m_NumVoxels;
    cutilSafeCall(cudaMalloc((void**) &m_pdVoxelVerts, memSize));
    cutilSafeCall(cudaMalloc((void**) &m_pdVoxelVertsScan, memSize));
    cutilSafeCall(cudaMalloc((void**) &m_pdVoxelOccupied, memSize));
    cutilSafeCall(cudaMalloc((void**) &m_pdVoxelOccupiedScan, memSize));
    cutilSafeCall(cudaMalloc((void**) &m_pdCompactedVoxelArray, memSize));

    // initialize CUDPP scan
    CUDPPConfiguration config;
    config.algorithm = CUDPP_SCAN;
    config.datatype = CUDPP_UINT;
    config.op = CUDPP_ADD;
    config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
    // Fixed: the plan-creation result was previously ignored.
    CUDPPResult result = cudppPlan(&m_Scanplan, config, m_NumVoxels, 1, 0);
    if (CUDPP_SUCCESS != result) {
        fprintf(stderr, "InitMC: failed to create CUDPP scan plan.\n");
    }
}
// Allocates all CUDA-side resources for the SURF feature detector:
// per-octave buffers and scan plans for the 2-pass feature-location
// extraction, the feature result buffers (GPU + host mirror), and the
// CUDA-registered NMS texture.
void GPUSurfFeatureDetector::allocCudaResource()
{
    // Per-octave buffers: each octave halves the width (so octave 0
    // already works at mWidth / 2).
    // NOTE(review): cudaMalloc/cudppPlan results are not checked here —
    // consider adding error handling.
    unsigned int octaveWidth = mWidth;
    for (int octave = 0; octave < mNbOctave; ++octave)
    {
        octaveWidth /= 2;

        float* pass1Buffer = NULL;
        float* pass2Buffer = NULL;
        cudaMalloc((void**)&pass1Buffer, octaveWidth * sizeof(float));
        cudaMalloc((void**)&pass2Buffer, octaveWidth * sizeof(float));

        // Forward exclusive float-add scan, one plan per octave.
        CUDPPConfiguration scanConfig;
        scanConfig.algorithm = CUDPP_SCAN;
        scanConfig.op = CUDPP_ADD;
        scanConfig.datatype = CUDPP_FLOAT;
        scanConfig.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;

        CUDPPHandle octavePlan;
        cudppPlan(&octavePlan, scanConfig, octaveWidth, 1, 0);

        mDeviceFeatureCounterPass1.push_back(pass1Buffer);
        mDeviceFeatureCounterPass2.push_back(pass2Buffer);
        mDeviceScanPlan.push_back(octavePlan);
    }

    // Buffers for feature locations on GPU + CPU.
    cudaMalloc((void**)&mDeviceFeatureFound, mNbFeatureMax * sizeof(Feature));
    mHostFeatureFound = new Feature[mNbFeatureMax];

    // Creating Cuda Texture
    mCudaNMSTexture = mCudaRoot->getTextureManager()->createTexture(mNMSTexture);
    //mCudaFeatureCudaTexture = mCudaRoot->getTextureManager()->createTexture(mFeatureCudaTexture);
    //mCudaFeatureTexture = mCudaRoot->getTextureManager()->createTexture(mFeatureTexture);

    // Registering Texture for CUDA
    mCudaNMSTexture->registerForCudaUse();
    //mCudaFeatureCudaTexture->registerForCudaUse();
    //mCudaFeatureTexture->registerForCudaUse();

    mCudaIsAllocated = true;
}
// Builds the compacting cuckoo hash table from n device-resident keys.
// Retries with freshly generated hash-function constants (up to
// kMaxRestartAttempts) until every key is inserted, then removes
// duplicate keys, assigns each unique key a compacted index via an
// inclusive prefix sum, and compacts the unique keys into d_unique_keys_.
// Returns false only if every restart attempt failed.
// NOTE(review): d_values is unused in this function — presumably the
// values are attached elsewhere; confirm against the call sites.
bool CompactingHashTable::Build(const unsigned n, const unsigned *d_keys, const unsigned *d_values) {
    CUDA_CHECK_ERROR("Failed before attempting to build.");

    unsigned num_failures = 1;  // non-zero so the retry loop runs at least once
    unsigned num_attempts = 0;
    unsigned max_iterations = ComputeMaxIterations(n, table_size_, num_hash_functions_);
    unsigned total_table_size = table_size_ + kStashSize;  // main table + stash

    while (num_failures && ++num_attempts < kMaxRestartAttempts) {
        // Draw new hash-function constants for this attempt.
        if (num_hash_functions_ == 2)
            constants_2_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 3)
            constants_3_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 4)
            constants_4_.Generate(n, d_keys, table_size_);
        else
            constants_5_.Generate(n, d_keys, table_size_);

        // Initialize the cuckoo hash table.
        CUDAWrapper::ClearTable(total_table_size, kKeyEmpty, d_scratch_cuckoo_keys_);

        // Reset the device-side failure counter for this attempt.
        num_failures = 0;
        cudaMemcpy(d_failures_, &num_failures, sizeof(unsigned), cudaMemcpyHostToDevice);

        // Per-attempt counter of keys that spilled into the stash.
        unsigned *d_stash_count = NULL;
        cudaMalloc((void**)&d_stash_count, sizeof(unsigned));
        cudaMemset(d_stash_count, 0, sizeof(unsigned));

        CUDAWrapper::CallHashBuildCompacting(n, num_hash_functions_, d_keys, table_size_, constants_2_, constants_3_, constants_4_, constants_5_, stash_constants_, max_iterations, d_scratch_cuckoo_keys_, d_stash_count, d_failures_);

        CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, sizeof(unsigned), cudaMemcpyDeviceToHost));
        if (stash_count_) {
            char buffer[256];
            sprintf(buffer, "Stash count: %u", stash_count_);
            PrintMessage(buffer, true);
        }
        CUDA_SAFE_CALL(cudaFree(d_stash_count));
        CUDA_CHECK_ERROR("!!! Failed to cuckoo hash.\n");

        // Pull back the failure count; non-zero triggers another attempt.
        CUDA_SAFE_CALL(cudaMemcpy(&num_failures, d_failures_, sizeof(unsigned), cudaMemcpyDeviceToHost));
#ifdef COUNT_UNINSERTED
        if (num_failures > 0) {
            char buffer[256];
            sprintf(buffer, "Num failures: %u", num_failures);
            PrintMessage(buffer, true);
        }
#endif
    }

    // Loop exits either on success (num_failures == 0) or attempt exhaustion.
    if (num_attempts >= kMaxRestartAttempts) {
        PrintMessage("Completely failed to build.", true);
        return false;
    } else if (num_attempts > 1) {
        char buffer[256];
        sprintf(buffer, "Needed %u attempts", num_attempts);
        PrintMessage(buffer);
    }

    if (num_failures == 0) {
        // Mark duplicate keys so each unique key is counted exactly once.
        CUDAWrapper::CallHashRemoveDuplicates(num_hash_functions_, table_size_, total_table_size, constants_2_, constants_3_, constants_4_, constants_5_, stash_constants_, d_scratch_cuckoo_keys_, d_scratch_counts_);

        // Do a prefix-sum over the values to assign each key a unique index.
        CUDPPConfiguration config;
        config.op = CUDPP_ADD;
        config.datatype = CUDPP_UINT;
        config.algorithm = CUDPP_SCAN;
        config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;
        CUDPPResult result = cudppPlan(theCudpp, &scanplan_, config, total_table_size, 1, 0);
        if (CUDPP_SUCCESS == result) {
            cudppScan(scanplan_, d_scratch_unique_ids_, d_scratch_counts_, total_table_size);
        } else {
            PrintMessage("!!! Failed to create plan.", true);
        }
        CUDA_CHECK_ERROR("!!! Scan failed.\n");

        // Determine how many unique values there are.
        // (The last element of an inclusive scan is the total count.)
        CUDA_SAFE_CALL(cudaMemcpy(&unique_keys_size_, d_scratch_unique_ids_ + total_table_size - 1, sizeof(unsigned), cudaMemcpyDeviceToHost));

        // Copy the unique indices back and compact down the neighbors.
        CUDA_SAFE_CALL(cudaMalloc((void**)&d_unique_keys_, sizeof(unsigned) * unique_keys_size_));
        CUDA_SAFE_CALL(cudaMemset(d_unique_keys_, 0xff, sizeof(unsigned) * unique_keys_size_));
        CUDAWrapper::CallHashCompactDown(total_table_size, d_contents_, d_unique_keys_, d_scratch_cuckoo_keys_, d_scratch_unique_ids_);
    }

    CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
    return true;
}
////////////////////////////////////////////////////////////////////////////////
// initialize marching cubes
//
// Parses grid-size and volume-file command-line options, derives the
// power-of-two grid parameters, optionally loads and uploads the sample
// volume, allocates vertex/normal storage (plain device buffers in QA
// mode, GL-interop VBOs otherwise), allocates the per-voxel work buffers
// and lookup tables, and creates the CUDPP exclusive-scan plan.
////////////////////////////////////////////////////////////////////////////////
void initMC(int argc, char** argv)
{
    // parse command line arguments
    int n;
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", &n)) {
        // "grid" sets all three log2 dimensions at once.
        gridSizeLog2.x = gridSizeLog2.y = gridSizeLog2.z = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridx", &n)) {
        gridSizeLog2.x = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridy", &n)) {
        gridSizeLog2.y = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridz", &n)) {
        gridSizeLog2.z = n;
    }

    char *filename;
    if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) {
        volumeFilename = filename;
    }

    // Grid dimensions are powers of two so voxel addressing can use
    // shifts and masks instead of division.
    gridSize = make_uint3(1<<gridSizeLog2.x, 1<<gridSizeLog2.y, 1<<gridSizeLog2.z);
    gridSizeMask = make_uint3(gridSize.x-1, gridSize.y-1, gridSize.z-1);
    gridSizeShift = make_uint3(0, gridSizeLog2.x, gridSizeLog2.x+gridSizeLog2.y);

    numVoxels = gridSize.x*gridSize.y*gridSize.z;
    // World-space voxel size for a [-1, 1]^3 domain.
    voxelSize = make_float3(2.0f / gridSize.x, 2.0f / gridSize.y, 2.0f / gridSize.z);
    // Heuristic upper bound on generated vertices (100 per (x,y) column).
    maxVerts = gridSize.x*gridSize.y*100;

    printf("grid: %d x %d x %d = %d voxels\n", gridSize.x, gridSize.y, gridSize.z, numVoxels);
    printf("max verts = %d\n", maxVerts);

#if SAMPLE_VOLUME
    // load volume data
    char* path = cutFindFilePath(volumeFilename, argv[0]);
    if (path == 0) {
        fprintf(stderr, "Error finding file '%s'\n", volumeFilename);
        cudaThreadExit();
        exit(EXIT_FAILURE);
    }

    int size = gridSize.x*gridSize.y*gridSize.z*sizeof(uchar);
    uchar *volume = loadRawFile(path, size);
    cutilSafeCall(cudaMalloc((void**) &d_volume, size));
    cutilSafeCall(cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice) );
    free(volume);
    bindVolumeTexture(d_volume);
#endif

    if (g_bQAReadback) {
        // QA mode: plain device buffers instead of GL-interop VBOs.
        cudaMalloc((void **)&(d_pos), maxVerts*sizeof(float)*4);
        cudaMalloc((void **)&(d_normal), maxVerts*sizeof(float)*4);
    } else {
        // create VBOs
        createVBO(&posVbo, maxVerts*sizeof(float)*4);
        // DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(posVbo) );
        cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_posvbo_resource, posVbo, cudaGraphicsMapFlagsWriteDiscard));

        createVBO(&normalVbo, maxVerts*sizeof(float)*4);
        // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(normalVbo));
        cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_normalvbo_resource, normalVbo, cudaGraphicsMapFlagsWriteDiscard));
    }

    // allocate textures
    allocateTextures( &d_edgeTable, &d_triTable, &d_numVertsTable );

    // allocate device memory
    unsigned int memSize = sizeof(uint) * numVoxels;
    cutilSafeCall(cudaMalloc((void**) &d_voxelVerts, memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelVertsScan, memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelOccupied, memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelOccupiedScan, memSize));
    cutilSafeCall(cudaMalloc((void**) &d_compVoxelArray, memSize));

    // initialize CUDPP scan
    // NOTE(review): cudppPlan result is unchecked here.
    CUDPPConfiguration config;
    config.algorithm = CUDPP_SCAN;
    config.datatype = CUDPP_UINT;
    config.op = CUDPP_ADD;
    config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
    cudppPlan(&scanplan, config, numVoxels, 1, 0);
}
// Sorts (key, value) int pairs on the GPU with CUDPP radix sort, then
// derives: the number of unique keys, the per-key value counts, the
// per-key value offsets, and the compacted key list (written back into
// keys).  Falls back to the CPU path for inputs larger than 32M keys.
// Output arrays (*numVals, *valOffsets) are allocated with mallocHost;
// the caller owns them.
// NOTE(review): *keyOffsets is never assigned on the GPU path (only in
// the numKeys == 0 early-out) — confirm callers do not read it here.
void IntIntSorter::executeOnGPUAsync(void * const keys, void * const vals, const int numKeys, int & numUniqueKeys, int ** keyOffsets, int ** valOffsets, int ** numVals)
{
    if (numKeys == 0) {
        // Nothing to sort: zero every output.
        numUniqueKeys = 0;
        *keyOffsets = *valOffsets = NULL;
        *numVals = NULL;
        return;
    }
    if (numKeys > 32 * 1048576) {
        // Too large for this GPU path; delegate to the CPU implementation.
        executeOnCPUAsync(keys, vals, numKeys, numUniqueKeys, keyOffsets, valOffsets, numVals);
        return;
    }

    int commRank;
    // NOTE(review): commRank is queried but never used below.
    MPI_Comm_rank(MPI_COMM_WORLD, &commRank);

    CUDPPConfiguration cudppConfig;
    CUDPPHandle planHandle;

    // Device working buffers (ints throughout).
    void * gpuInputKeys   = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuInputVals   = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuUniqueFlags = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuValOffsets  = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    cudacpp::Runtime::memcpyHtoD(gpuInputKeys, keys, sizeof(int) * numKeys);
    cudacpp::Runtime::memcpyHtoD(gpuInputVals, vals, sizeof(int) * numKeys);

    /* what we need to get out of here:
       1 - sorted keys and values
       2 - num unique keys
       3 - number of values for each key
       4 - value offsets
       5 - compacted keys

       to get:
       simply sort
       A = find unique values
       B = reverse exclusive scan of "A"
       C = if A[i] == 1 C[B[0] - B[i]] = i
       D = [0] = C[0] + 1
           [N] = #keys - C[#keys - 1]
           [i] = C[i + 1] - C[i]
       E = forward exclusive scan D
       F = keys[E[i]]

       1 = result of sort (only copy the values)
       2 = B[0]
       3 = D
       4 = E
       5 = F
    */

    // 1 — radix sort the key-value pairs in place on the device.
    cudppConfig.algorithm = CUDPP_SORT_RADIX;
    cudppConfig.op = CUDPP_ADD; // ignored
    cudppConfig.datatype = CUDPP_UINT;
    cudppConfig.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppSort(planHandle, gpuInputKeys, gpuInputVals, sizeof(int) * 8, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    // cudacpp::Runtime::memcpyDtoH(keys, gpuInputKeys, sizeof(int) * numKeys);
    cudacpp::Runtime::memcpyDtoH(vals, gpuInputVals, sizeof(int) * numKeys);

    // 2 - A = gpuUniqueFlags (1 where a key differs from its predecessor)
    gpmrIntIntSorterMarkUnique(gpuInputKeys, gpuUniqueFlags, numKeys);

    // 2 - B = gpuValOffsets (reverse exclusive scan of the unique flags)
    cudppConfig.algorithm = CUDPP_SCAN;
    cudppConfig.op = CUDPP_ADD; // ignored
    cudppConfig.datatype = CUDPP_INT;
    cudppConfig.options = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_BACKWARD;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppScan(planHandle, gpuValOffsets, gpuUniqueFlags, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    // B[0] is one less than the number of unique keys (exclusive scan).
    cudacpp::Runtime::memcpyDtoH(&numUniqueKeys, gpuValOffsets, sizeof(int));
    ++numUniqueKeys;

    // 2 - C = gpuInputVals and
    // 3 - D = gpuValOffsets (gpuValOffsets is both input and output here)
    cudacpp::Runtime::sync();
    gpmrIntIntSorterFindOffsets(gpuInputKeys, gpuUniqueFlags, gpuValOffsets, gpuInputVals, gpuValOffsets, numKeys, numUniqueKeys);
    *numVals = reinterpret_cast<int * >(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int)));
    cudacpp::Runtime::sync();
    cudacpp::Runtime::memcpyDtoH(*numVals, gpuValOffsets, sizeof(int) * numUniqueKeys);
    cudacpp::Runtime::sync();

    // 4 - E = gpuUniqueFlags (forward exclusive scan of the counts D)
    cudppConfig.algorithm = CUDPP_SCAN;
    cudppConfig.op = CUDPP_ADD; // ignored
    cudppConfig.datatype = CUDPP_INT;
    cudppConfig.options = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_FORWARD;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppScan(planHandle, gpuUniqueFlags, gpuValOffsets, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    *valOffsets = reinterpret_cast<int * >(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int)));
    cudacpp::Runtime::memcpyDtoH(*valOffsets, gpuUniqueFlags, sizeof(int) * numUniqueKeys);

    // 4 - F = gpuInputVals (compacted unique keys, copied back into keys)
    gpmrIntIntSorterSetCompactedKeys(gpuInputKeys, gpuUniqueFlags, gpuInputVals, numUniqueKeys);
    cudacpp::Runtime::memcpyDtoH(keys, gpuInputVals, sizeof(int) * numUniqueKeys);

    cudacpp::Runtime::free(gpuInputKeys);
    cudacpp::Runtime::free(gpuInputVals);
    cudacpp::Runtime::free(gpuUniqueFlags);
    cudacpp::Runtime::free(gpuValOffsets);
}
// Builds the multivalue hash table from n (key, value) pairs: sorts the
// pairs by key, compacts out the unique keys with their (offset, count)
// into the sorted values, re-initializes the underlying cuckoo table for
// the unique-key count, and builds it with each key mapped to its index.
// Returns false if the sort-plan creation or the cuckoo build fails.
bool MultivalueHashTable::Build(const unsigned n,
                                const unsigned *d_keys,
                                const unsigned *d_vals) {
    CUDA_CHECK_ERROR("Failed before build.");

    // Work on copies: the radix sort is in-place and the caller's arrays
    // must not be disturbed.
    unsigned *d_sorted_keys = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n,
                              cudaMemcpyDeviceToDevice));

    unsigned *d_sorted_vals = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n,
                              cudaMemcpyDeviceToDevice));
    CUDA_CHECK_ERROR("Failed to allocate.");

    // Sort the pairs by key so duplicate keys become adjacent.
    CUDPPConfiguration sort_config;
    sort_config.algorithm = CUDPP_SORT_RADIX;
    sort_config.datatype = CUDPP_UINT;
    sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;

    CUDPPHandle sort_plan;
    CUDPPResult sort_result = cudppPlan(theCudpp, &sort_plan, sort_config,
                                        n, 1, 0);
    // Fixed: the original invoked cudppRadixSort() BEFORE checking whether
    // plan creation succeeded, and leaked d_sorted_keys/d_sorted_vals on
    // the failure path.
    if (sort_result != CUDPP_SUCCESS) {
        printf("Error in plan creation in MultivalueHashTable::build\n");
        cudppDestroyPlan(sort_plan);
        CUDA_SAFE_CALL(cudaFree(d_sorted_keys));
        CUDA_SAFE_CALL(cudaFree(d_sorted_vals));
        return false;
    }
    cudppRadixSort(sort_plan, d_sorted_keys, (void*)d_sorted_vals, n);
    // Fixed: the plan was never destroyed on the success path.
    cudppDestroyPlan(sort_plan);
    CUDA_CHECK_ERROR("Failed to sort");

    // Find the first key-value pair for each key.
    CUDAWrapper::CallCheckIfUnique(d_sorted_keys, n, d_scratch_is_unique_);

    // Assign a unique index from 0 to k-1 for each of the keys.
    cudppScan(scanplan_, d_scratch_offsets_, d_scratch_is_unique_, n);
    CUDA_CHECK_ERROR("Failed to scan");

    // Check how many unique keys were found (last element of the
    // inclusive scan).
    unsigned num_unique_keys;
    CUDA_SAFE_CALL(cudaMemcpy(&num_unique_keys, d_scratch_offsets_ + n - 1,
                              sizeof(unsigned), cudaMemcpyDeviceToHost));
    CUDA_CHECK_ERROR("Failed to get # unique keys");

    // Keep a list of the unique keys, and store info on each key's data
    // (location in the values array, how many there are).
    unsigned *d_compacted_keys = NULL;
    uint2 *d_index_counts_tmp = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_compacted_keys,
                              sizeof(unsigned) * num_unique_keys));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_index_counts_tmp,
                              sizeof(uint2) * num_unique_keys));
    CUDAWrapper::CallCompactKeys(d_sorted_keys, d_scratch_is_unique_,
                                 d_scratch_offsets_, n, d_index_counts_tmp,
                                 d_compacted_keys);

    // Determine the counts.
    CUDAWrapper::CallCountValues(d_index_counts_tmp, n, num_unique_keys);

    // Reinitialize the cuckoo hash table using the information we discovered.
    HashTable::Initialize(num_unique_keys, target_space_usage_,
                          num_hash_functions_);
    d_index_counts_ = d_index_counts_tmp;
    d_unique_keys_ = d_compacted_keys;
    d_sorted_values_ = d_sorted_vals;   // owned by the table from here on
    sorted_values_size_ = n;

    // Build the cuckoo hash table with each key assigned a unique index.
    // Re-uses the sorted key memory as an array of values from 0 to k-1.
    CUDAWrapper::CallPrepareIndices(num_unique_keys, d_sorted_keys);
    bool success = HashTable::Build(num_unique_keys, d_unique_keys_,
                                    d_sorted_keys);
    CUDA_SAFE_CALL(cudaFree(d_sorted_keys));
    return success;
}