void CuckooHash::Init(const size_t maxSize, const unsigned short hFuncNum)
{
	_maxSize = maxSize;
	_hFuncNum = hFuncNum;
	CUDA_CALL( cudaMalloc((void**)&_data, _maxSize*sizeof(int2)) );
	CUDA_CALL( cudaMalloc((void**)&_hashConstants, _hFuncNum*sizeof(int)) );
	CUDA_CHECK_ERROR("Init failed!\n");
}
void CuckooHash::FreeMemory()
{
    CUDA_CALL( cudaFree(_data) );
    CUDA_CALL( cudaFree(_hashConstants) );
    CUDA_CHECK_ERROR("Free memory failed!\n");
	_maxSize  = 0;
	_currentSize = 0;
    _data = NULL;
    _hashConstants = NULL;
}
      void MaxDiff( const GPU::Classes::VolumeGPU<T>& a,
		    const GPU::Classes::VolumeGPU<T>& b,
		    T& maxDiff,
		    dim3& maxLoc ) const {
	/*!
	  Routine to compare two volumes, and find
	  the voxel with the maximum difference.
	*/
	const dim3 aDims = a.GetDims();
	const dim3 bDims = b.GetDims();

	// Verify dimensions
	if( aDims != bDims ) {
	  std::cerr << __FUNCTION__
		    << ": Dimension mismatch"
		    << std::endl;
	  exit( EXIT_FAILURE );
	}

	
	// Allocate 'difference' array
	thrust::device_ptr<T> d_absDiffs;
	
	const unsigned int nVoxels = aDims.x * aDims.y * aDims.z;

	d_absDiffs = thrust::device_new<T>( nVoxels );


	// Run the difference kernel
	dim3 grid, threads;
	threads.x = threads.y = kAbsDiffBlockSize;
	threads.z = 1;

	grid = a.CoverBlocks( kAbsDiffBlockSize );
	grid.z = 1;

	ComputeAbsDiffs<T>
	  <<<grid,threads>>>
	  ( a, b, thrust::raw_pointer_cast( d_absDiffs ) );
	CUDA_CHECK_ERROR( "ComputeAbsDiffs kernel failed!\n" );
	

	// Extract the maximum and its location
	thrust::device_ptr<T> d_maxLoc;
	d_maxLoc = thrust::max_element( d_absDiffs, d_absDiffs + nVoxels );

	maxDiff = *d_maxLoc;
	maxLoc = a.Index3D( d_maxLoc - d_absDiffs );

	// Release 'difference' array
	thrust::device_delete( d_absDiffs );
	
      }
      double ErrL2Norm( const GPU::Classes::VolumeGPU<T>& cmp,
			const GPU::Classes::VolumeGPU<T>& ref ) const {
	/*!
	  Routine to compute the error in the L2 norm between
	  two volumes
	*/
	const dim3 cmpDims = cmp.GetDims();
	const dim3 refDims = ref.GetDims();

	// Verify dimensions
	if( refDims != cmpDims ) {
	  std::cerr << __FUNCTION__
		    << ": Dimension mismatch"
		    << std::endl;
	  exit( EXIT_FAILURE );
	}
      
	// Compute number of voxels
	const unsigned int nVoxels = refDims.x * refDims.y * refDims.z;
	
	// Allocate thrust arrays
	thrust::device_ptr<double> d_err;
	thrust::device_ptr<double> d_reference;
	d_err = thrust::device_new<double>( nVoxels );
	d_reference = thrust::device_new<double>( nVoxels );
	
	// Run the kernel
	dim3 grid, threads;
	threads.x = threads.y = kErrL2BlockSize;
	threads.z = 1;

	grid = ref.CoverBlocks( kErrL2BlockSize );
	grid.z = 1;

	ErrL2Compute<T><<<grid,threads>>>
	  ( cmp, ref,
	    thrust::raw_pointer_cast( d_err ),
	    thrust::raw_pointer_cast( d_reference ) );
	CUDA_CHECK_ERROR( "ErrL2Compute kernel failed!\n" );

	// Extract sums
	double totErr = thrust::reduce( d_err, d_err+nVoxels );
	double totRef = thrust::reduce( d_reference, d_reference+nVoxels );
	
	// Release thrust arrays
	thrust::device_delete( d_err );
	thrust::device_delete( d_reference );

	return( sqrt( totErr / totRef ) );

      }
示例#5
0
void inclusive_scan(
	size_t _N,
	real_t *d_x, 
	real_t *d_y,
	BinaryOp binaryOp,
	Setter copy
) {
	
	real_t *d_block_x, *d_block_y;
	
	const size_t N = math::pow2ceil(_N);
	
	// maximum elements per block: 1024 * 2 (since each thread processes two elements)
	// TODO allow tweaking of this parameter
	const size_t elemPerBlock = 64;
	//const size_t elemPerBlock = 8;
	
	// each thread processes two elements
	dim3 block_dim = elemPerBlock / 2;
	
	// number of blocks
	dim3 grid_dim = N / elemPerBlock + (N % elemPerBlock == 0 ? 0 : 1);
	
	size_t nbytes_block = grid_dim.x * sizeof(real_t) * m;
	
	cudaMalloc((void**) &d_block_x, nbytes_block);
	cudaMalloc((void**) &d_block_y, nbytes_block);

	// elements are divided into blocks
	// each thread processes two elements within a block
	prescan<m> <<< grid_dim, block_dim, elemPerBlock*sizeof(real_t) * m >>> (elemPerBlock, d_x, d_y, binaryOp, copy);
	
	// one block; each thread processes a scan block from above
	aggregate_block_sum<m> <<< 1, grid_dim >>> (elemPerBlock, d_y, d_block_x, copy);
	
	// one block; each thread processes two scan block sums (hence need half the number of scan blocks from previous run)
	prescan<m> <<< 1, grid_dim.x/2, grid_dim.x*sizeof(real_t) * m >>> (grid_dim.x, d_block_x, d_block_y, binaryOp, copy);
	
	// each thread processes one element in original data
	// need twice as many blocks as before, since each thread now processes one element
	add_block_cumsum<m> <<< grid_dim.x*2, block_dim >>> (N, d_block_y, d_y, binaryOp, copy);
	
	cudaFree(d_block_x);
	cudaFree(d_block_y);

	CUDA_CHECK_ERROR("inclusive_scan");
	
}
示例#6
0
bool CompactingHashTable::Build(const unsigned  n,
                                const unsigned *d_keys,
                                const unsigned *d_values)
{
    CUDA_CHECK_ERROR("Failed before attempting to build.");

    unsigned num_failures = 1;
    unsigned num_attempts = 0;
    unsigned max_iterations = ComputeMaxIterations(n, table_size_,
                                                   num_hash_functions_);
    unsigned total_table_size = table_size_ + kStashSize;

    while (num_failures && ++num_attempts < kMaxRestartAttempts) {
        if (num_hash_functions_ == 2)
            constants_2_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 3)
            constants_3_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 4)
            constants_4_.Generate(n, d_keys, table_size_);
        else
            constants_5_.Generate(n, d_keys, table_size_);

        // Initialize the cuckoo hash table.
        CUDAWrapper::ClearTable(total_table_size,
                                kKeyEmpty,
                                d_scratch_cuckoo_keys_);

        num_failures = 0;
        cudaMemcpy(d_failures_,
                   &num_failures,
                   sizeof(unsigned),
                   cudaMemcpyHostToDevice);

        unsigned *d_stash_count = NULL;
        cudaMalloc((void**)&d_stash_count, sizeof(unsigned));
        cudaMemset(d_stash_count, 0, sizeof(unsigned));

        CUDAWrapper::CallHashBuildCompacting(n, 
                                             num_hash_functions_,
                                             d_keys,
                                             table_size_,
                                             constants_2_,
                                             constants_3_,
                                             constants_4_,
                                             constants_5_,
                                             stash_constants_,
                                             max_iterations,
                                             d_scratch_cuckoo_keys_,
                                             d_stash_count,
                                             d_failures_);

        CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, 
                                  sizeof(unsigned), cudaMemcpyDeviceToHost));
        if (stash_count_) {
            char buffer[256];
            sprintf(buffer, "Stash count: %u", stash_count_);
            PrintMessage(buffer, true);
        }
        CUDA_SAFE_CALL(cudaFree(d_stash_count));

        CUDA_CHECK_ERROR("!!! Failed to cuckoo hash.\n");

        CUDA_SAFE_CALL(cudaMemcpy(&num_failures,
                                  d_failures_,
                                  sizeof(unsigned),
                                  cudaMemcpyDeviceToHost));

#ifdef COUNT_UNINSERTED
        if (num_failures > 0) {
            char buffer[256];
            sprintf(buffer, "Num failures: %u", num_failures);
            PrintMessage(buffer, true);
        }
#endif
    }

    if (num_attempts >= kMaxRestartAttempts) {
        PrintMessage("Completely failed to build.", true);
        return false;
    } else if (num_attempts > 1) {
        char buffer[256];
        sprintf(buffer, "Needed %u attempts", num_attempts);
        PrintMessage(buffer);
    }

    if (num_failures == 0) {
        CUDAWrapper::CallHashRemoveDuplicates(num_hash_functions_,
                                              table_size_,
                                              total_table_size,
                                              constants_2_,
                                              constants_3_,
                                              constants_4_,
                                              constants_5_,
                                              stash_constants_,
                                              d_scratch_cuckoo_keys_,
                                              d_scratch_counts_);

        // Do a prefix-sum over the values to assign each key a unique index.
        CUDPPConfiguration config;
        config.op        = CUDPP_ADD;
        config.datatype  = CUDPP_UINT;
        config.algorithm = CUDPP_SCAN;
        config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;

        CUDPPResult result = cudppPlan(theCudpp, &scanplan_, config, 
                                       total_table_size, 1, 0);
        if (CUDPP_SUCCESS == result) {
            cudppScan(scanplan_, d_scratch_unique_ids_, d_scratch_counts_,
                      total_table_size);
        } else {
            PrintMessage("!!! Failed to create plan.", true);
        }
        CUDA_CHECK_ERROR("!!! Scan failed.\n");

        // Determine how many unique values there are.
        CUDA_SAFE_CALL(cudaMemcpy(&unique_keys_size_,
                                  d_scratch_unique_ids_ + total_table_size - 1,
                                  sizeof(unsigned),
                                  cudaMemcpyDeviceToHost));

        // Copy the unique indices back and compact down the neighbors.
        CUDA_SAFE_CALL(cudaMalloc((void**)&d_unique_keys_,
                                  sizeof(unsigned) * unique_keys_size_));
        CUDA_SAFE_CALL(cudaMemset(d_unique_keys_,
                                  0xff,
                                  sizeof(unsigned) * unique_keys_size_));
        CUDAWrapper::CallHashCompactDown(total_table_size, 
                                         d_contents_, 
                                         d_unique_keys_, 
                                         d_scratch_cuckoo_keys_, 
                                         d_scratch_unique_ids_);
    }

    CUDA_CHECK_ERROR("Error occurred during hash table build.\n");

    return true;
}
示例#7
0
bool MultivalueHashTable::Build(const unsigned  n,
                                const unsigned *d_keys,
                                const unsigned *d_vals)
{
    CUDA_CHECK_ERROR("Failed before build.");

    unsigned *d_sorted_keys = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n, 
                              cudaMemcpyDeviceToDevice));

    unsigned *d_sorted_vals = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n, 
                              cudaMemcpyDeviceToDevice));
    CUDA_CHECK_ERROR("Failed to allocate.");

    CUDPPConfiguration sort_config;
    sort_config.algorithm = CUDPP_SORT_RADIX;                  
    sort_config.datatype = CUDPP_UINT;
    sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;

    CUDPPHandle sort_plan;
    CUDPPResult sort_result = cudppPlan(theCudpp, &sort_plan, sort_config, n,
                                        1, 0);
    cudppRadixSort(sort_plan, d_sorted_keys, (void*)d_sorted_vals, n);

    if (sort_result != CUDPP_SUCCESS)
    {
        printf("Error in plan creation in MultivalueHashTable::build\n");
        bool retval = false;
        cudppDestroyPlan(sort_plan);
        return retval;
    }
    CUDA_CHECK_ERROR("Failed to sort");

    // Find the first key-value pair for each key.
    CUDAWrapper::CallCheckIfUnique(d_sorted_keys, n, d_scratch_is_unique_);

    // Assign a unique index from 0 to k-1 for each of the keys.
    cudppScan(scanplan_, d_scratch_offsets_, d_scratch_is_unique_, n);
    CUDA_CHECK_ERROR("Failed to scan");

    // Check how many unique keys were found.
    unsigned num_unique_keys;
    CUDA_SAFE_CALL(cudaMemcpy(&num_unique_keys, d_scratch_offsets_ + n - 1,
                              sizeof(unsigned), cudaMemcpyDeviceToHost));
    CUDA_CHECK_ERROR("Failed to get # unique keys");

    // Keep a list of the unique keys, and store info on each key's data
    // (location in the values array, how many there are).
    unsigned *d_compacted_keys = NULL;
    uint2 *d_index_counts_tmp = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_compacted_keys,
                              sizeof(unsigned) * num_unique_keys));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_index_counts_tmp,
                              sizeof(uint2) * num_unique_keys));
    CUDAWrapper::CallCompactKeys(d_sorted_keys,
                                 d_scratch_is_unique_,
                                 d_scratch_offsets_,
                                 n,
                                 d_index_counts_tmp,
                                 d_compacted_keys);

    // Determine the counts.
    CUDAWrapper::CallCountValues(d_index_counts_tmp, n, num_unique_keys);

    // Reinitialize the cuckoo hash table using the information we discovered.
    HashTable::Initialize(num_unique_keys,
                          target_space_usage_,
                          num_hash_functions_);

    d_index_counts_  = d_index_counts_tmp;
    d_unique_keys_   = d_compacted_keys;
    d_sorted_values_ = d_sorted_vals;
    sorted_values_size_ = n;

    // Build the cuckoo hash table with each key assigned a unique index.
    // Re-uses the sorted key memory as an array of values from 0 to k-1.
    CUDAWrapper::CallPrepareIndices(num_unique_keys, d_sorted_keys);
    bool success = HashTable::Build(num_unique_keys, d_unique_keys_,
                                    d_sorted_keys);
    CUDA_SAFE_CALL(cudaFree(d_sorted_keys));
    return success;
}