Exemplo n.º 1
0
bool MultivalueHashTable::Initialize(const unsigned   max_table_entries,
                                     const float      space_usage,
                                     const unsigned   num_hash_functions)
{                                    
    bool success = HashTable::Initialize(max_table_entries, space_usage,
                                             num_hash_functions);
    target_space_usage_ = space_usage;

    // + 2N 32-bit entries
    CUDA_SAFE_CALL(cudaMalloc( (void**)&d_scratch_offsets_, 
                               sizeof(unsigned) * max_table_entries ));
    CUDA_SAFE_CALL(cudaMalloc( (void**)&d_scratch_is_unique_,
                               sizeof(unsigned) * max_table_entries ));

    success &= (d_scratch_offsets_ != NULL);
    success &= (d_scratch_is_unique_ != NULL);

    // Allocate memory for the scan.
    // + Unknown memory usage
    CUDPPConfiguration config;
    config.op            = CUDPP_ADD;
    config.datatype      = CUDPP_UINT;
    config.algorithm     = CUDPP_SCAN;
    config.options       = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;
    CUDPPResult result   = cudppPlan(theCudpp, &scanplan_, config, 
                                     max_table_entries, 1, 0);
    if (CUDPP_SUCCESS != result) {
        fprintf(stderr, "Failed to create plan.");
        return false;
    }
    return success;
}
Exemplo n.º 2
0
void VHParticlesRender::initParticlesRender(){


		  // Create the CUDPP radix sort
    CUDPPConfiguration sortConfig;
    sortConfig.algorithm = CUDPP_SORT_RADIX;
    sortConfig.datatype = CUDPP_FLOAT;
    sortConfig.op = CUDPP_ADD;
    sortConfig.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
    cudppPlan(&m_sortHandle, sortConfig, pSys->nParts, 1, 0);

	//-----shaders

	/*char currentPath[_MAX_PATH];
	getcwd(currentPath, _MAX_PATH);
	printf("Path : %s", currentPath);*/

	simpleSpriteProg = new GLSLProgram("sprite.vs", "sprite.gs", "simpleSprite.ps");
	shadowedSpriteProg = new GLSLProgram("sprite.vs", "sprite.gs", "shadowedSprite.ps");
	shadowMapSpriteProg = new GLSLProgram("sprite.vs", "sprite.gs", "ShadowMapSprite.ps");
	displayTexProg = new GLSLProgram("passThru.vs", "texture2D.ps");
	blurProg = new GLSLProgram("passThru.vs", "blur.ps");

	if(spritePath)
		loadSprite(spritePath);

	initFbos(width, height, true);
}
Exemplo n.º 3
0
void CMarchingCubes::InitMC(int _width, int _height, int _depth, ElemType* _pVolume)
{
	// Data Array A[:, :, 1], A[:, :, 2], ... , A[:, : , n]
	//m_GridSize = make_uint3(_depth, _width, _height);
	m_GridSize = make_uint3(_width, _height, _depth);
	m_NumVoxels = m_GridSize.x * m_GridSize.y * m_GridSize.z;
	m_MaxVerts = m_GridSize.x * m_GridSize.y * 30;					// Num of MaxVerts need change
#ifdef _DEBUG
	printf("grids: %d * %d * %d = %d voxels\n", m_GridSize.x, m_GridSize.y, m_GridSize.z, m_NumVoxels);
#endif // _DEBUG

	// needed change
	int size = m_GridSize.x * m_GridSize.y * m_GridSize.z * sizeof(float);
	//////////////////////////////////////////////////////////////////////////
	int len = m_GridSize.x * m_GridSize.y * m_GridSize.z * 3;
	float *pVolTemp = new float[len];
	for (int i = 0; i < len; i++)
		pVolTemp[i] = _pVolume[i];
	//////////////////////////////////////////////////////////////////////////
	cutilSafeCall(cudaMalloc((void**) &m_pdVolume, size * 3));
	cutilSafeCall(cudaMemcpy(m_pdVolume, pVolTemp, size * 3, cudaMemcpyHostToDevice) );
	bindVolumeTexture(m_pdVolume);				// map the coordinates to the texture directly
	delete []pVolTemp;

	// allocate textures
	allocateTextures( &m_pdEdgeTable, &m_pdTriTable, &m_pdNumVertsTable );

	// allocate device memory
	unsigned int memSize = sizeof(uint) * m_NumVoxels;
	cutilSafeCall(cudaMalloc((void**) &m_pdVoxelVerts, memSize));
	cutilSafeCall(cudaMalloc((void**) &m_pdVoxelVertsScan, memSize));
	cutilSafeCall(cudaMalloc((void**) &m_pdVoxelOccupied, memSize));
	cutilSafeCall(cudaMalloc((void**) &m_pdVoxelOccupiedScan, memSize));
	cutilSafeCall(cudaMalloc((void**) &m_pdCompactedVoxelArray, memSize));

	// initialize CUDPP scan
	CUDPPConfiguration config;
	config.algorithm = CUDPP_SCAN;
	config.datatype = CUDPP_UINT;
	config.op = CUDPP_ADD;
	config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
	cudppPlan(&m_Scanplan, config, m_NumVoxels, 1, 0);
}
void GPUSurfFeatureDetector::allocCudaResource()
{
	//Allocating buffer for 2-pass feature location extraction on GPU
	unsigned int width = mWidth;
	for (int i=0; i<mNbOctave; ++i)
	{		
		width /= 2;
		float* devicePass1 = NULL;
		float* devicePass2 = NULL;

		cudaMalloc((void**)&devicePass1, width*sizeof(float));
		cudaMalloc((void**)&devicePass2, width*sizeof(float));

		CUDPPHandle scanPlan;
		CUDPPConfiguration config = { CUDPP_SCAN, CUDPP_ADD, CUDPP_FLOAT, CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE };
		cudppPlan(&scanPlan, config, width, 1, 0);

		mDeviceFeatureCounterPass1.push_back(devicePass1);
		mDeviceFeatureCounterPass2.push_back(devicePass2);
		mDeviceScanPlan.push_back(scanPlan);
	}

	//Allocating buffer for Feature location on GPU + CPU
	cudaMalloc((void**)&mDeviceFeatureFound, mNbFeatureMax*sizeof(Feature));
	mHostFeatureFound = new Feature[mNbFeatureMax];

	//Creating Cuda Texture
	mCudaNMSTexture         = mCudaRoot->getTextureManager()->createTexture(mNMSTexture);
	//mCudaFeatureCudaTexture = mCudaRoot->getTextureManager()->createTexture(mFeatureCudaTexture);
	//mCudaFeatureTexture     = mCudaRoot->getTextureManager()->createTexture(mFeatureTexture);

	//Registering Texture for CUDA
	mCudaNMSTexture->registerForCudaUse();
	//mCudaFeatureCudaTexture->registerForCudaUse();
	//mCudaFeatureTexture->registerForCudaUse();

	mCudaIsAllocated = true;
}
Exemplo n.º 5
0
bool CompactingHashTable::Build(const unsigned  n,
                                const unsigned *d_keys,
                                const unsigned *d_values)
{
    CUDA_CHECK_ERROR("Failed before attempting to build.");

    unsigned num_failures = 1;
    unsigned num_attempts = 0;
    unsigned max_iterations = ComputeMaxIterations(n, table_size_,
                                                   num_hash_functions_);
    unsigned total_table_size = table_size_ + kStashSize;

    while (num_failures && ++num_attempts < kMaxRestartAttempts) {
        if (num_hash_functions_ == 2)
            constants_2_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 3)
            constants_3_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 4)
            constants_4_.Generate(n, d_keys, table_size_);
        else
            constants_5_.Generate(n, d_keys, table_size_);

        // Initialize the cuckoo hash table.
        CUDAWrapper::ClearTable(total_table_size,
                                kKeyEmpty,
                                d_scratch_cuckoo_keys_);

        num_failures = 0;
        cudaMemcpy(d_failures_,
                   &num_failures,
                   sizeof(unsigned),
                   cudaMemcpyHostToDevice);

        unsigned *d_stash_count = NULL;
        cudaMalloc((void**)&d_stash_count, sizeof(unsigned));
        cudaMemset(d_stash_count, 0, sizeof(unsigned));

        CUDAWrapper::CallHashBuildCompacting(n, 
                                             num_hash_functions_,
                                             d_keys,
                                             table_size_,
                                             constants_2_,
                                             constants_3_,
                                             constants_4_,
                                             constants_5_,
                                             stash_constants_,
                                             max_iterations,
                                             d_scratch_cuckoo_keys_,
                                             d_stash_count,
                                             d_failures_);

        CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, 
                                  sizeof(unsigned), cudaMemcpyDeviceToHost));
        if (stash_count_) {
            char buffer[256];
            sprintf(buffer, "Stash count: %u", stash_count_);
            PrintMessage(buffer, true);
        }
        CUDA_SAFE_CALL(cudaFree(d_stash_count));

        CUDA_CHECK_ERROR("!!! Failed to cuckoo hash.\n");

        CUDA_SAFE_CALL(cudaMemcpy(&num_failures,
                                  d_failures_,
                                  sizeof(unsigned),
                                  cudaMemcpyDeviceToHost));

#ifdef COUNT_UNINSERTED
        if (num_failures > 0) {
            char buffer[256];
            sprintf(buffer, "Num failures: %u", num_failures);
            PrintMessage(buffer, true);
        }
#endif
    }

    if (num_attempts >= kMaxRestartAttempts) {
        PrintMessage("Completely failed to build.", true);
        return false;
    } else if (num_attempts > 1) {
        char buffer[256];
        sprintf(buffer, "Needed %u attempts", num_attempts);
        PrintMessage(buffer);
    }

    if (num_failures == 0) {
        CUDAWrapper::CallHashRemoveDuplicates(num_hash_functions_,
                                              table_size_,
                                              total_table_size,
                                              constants_2_,
                                              constants_3_,
                                              constants_4_,
                                              constants_5_,
                                              stash_constants_,
                                              d_scratch_cuckoo_keys_,
                                              d_scratch_counts_);

        // Do a prefix-sum over the values to assign each key a unique index.
        CUDPPConfiguration config;
        config.op        = CUDPP_ADD;
        config.datatype  = CUDPP_UINT;
        config.algorithm = CUDPP_SCAN;
        config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;

        CUDPPResult result = cudppPlan(theCudpp, &scanplan_, config, 
                                       total_table_size, 1, 0);
        if (CUDPP_SUCCESS == result) {
            cudppScan(scanplan_, d_scratch_unique_ids_, d_scratch_counts_,
                      total_table_size);
        } else {
            PrintMessage("!!! Failed to create plan.", true);
        }
        CUDA_CHECK_ERROR("!!! Scan failed.\n");

        // Determine how many unique values there are.
        CUDA_SAFE_CALL(cudaMemcpy(&unique_keys_size_,
                                  d_scratch_unique_ids_ + total_table_size - 1,
                                  sizeof(unsigned),
                                  cudaMemcpyDeviceToHost));

        // Copy the unique indices back and compact down the neighbors.
        CUDA_SAFE_CALL(cudaMalloc((void**)&d_unique_keys_,
                                  sizeof(unsigned) * unique_keys_size_));
        CUDA_SAFE_CALL(cudaMemset(d_unique_keys_,
                                  0xff,
                                  sizeof(unsigned) * unique_keys_size_));
        CUDAWrapper::CallHashCompactDown(total_table_size, 
                                         d_contents_, 
                                         d_unique_keys_, 
                                         d_scratch_cuckoo_keys_, 
                                         d_scratch_unique_ids_);
    }

    CUDA_CHECK_ERROR("Error occurred during hash table build.\n");

    return true;
}
Exemplo n.º 6
0
////////////////////////////////////////////////////////////////////////////////
// initialize marching cubes
////////////////////////////////////////////////////////////////////////////////
void
initMC(int argc, char** argv)
{
    // parse command line arguments
    int n;
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", &n)) {
        gridSizeLog2.x = gridSizeLog2.y = gridSizeLog2.z = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridx", &n)) {
        gridSizeLog2.x = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridy", &n)) {
        gridSizeLog2.y = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridz", &n)) {
        gridSizeLog2.z = n;
    }

    char *filename;
    if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) {
        volumeFilename = filename;
    }

    gridSize = make_uint3(1<<gridSizeLog2.x, 1<<gridSizeLog2.y, 1<<gridSizeLog2.z);
    gridSizeMask = make_uint3(gridSize.x-1, gridSize.y-1, gridSize.z-1);
    gridSizeShift = make_uint3(0, gridSizeLog2.x, gridSizeLog2.x+gridSizeLog2.y);

    numVoxels = gridSize.x*gridSize.y*gridSize.z;
    voxelSize = make_float3(2.0f / gridSize.x, 2.0f / gridSize.y, 2.0f / gridSize.z);
    maxVerts = gridSize.x*gridSize.y*100;

    printf("grid: %d x %d x %d = %d voxels\n", gridSize.x, gridSize.y, gridSize.z, numVoxels);
    printf("max verts = %d\n", maxVerts);

#if SAMPLE_VOLUME
    // load volume data
    char* path = cutFindFilePath(volumeFilename, argv[0]);
    if (path == 0) {
        fprintf(stderr, "Error finding file '%s'\n", volumeFilename);
        cudaThreadExit();
        exit(EXIT_FAILURE);
    }

    int size = gridSize.x*gridSize.y*gridSize.z*sizeof(uchar);
    uchar *volume = loadRawFile(path, size);
    cutilSafeCall(cudaMalloc((void**) &d_volume, size));
    cutilSafeCall(cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice) );
    free(volume);

	bindVolumeTexture(d_volume);
#endif

    if (g_bQAReadback) {
        cudaMalloc((void **)&(d_pos),    maxVerts*sizeof(float)*4);
        cudaMalloc((void **)&(d_normal), maxVerts*sizeof(float)*4);
    } else {
        // create VBOs
        createVBO(&posVbo, maxVerts*sizeof(float)*4);
		// DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(posVbo) );
		cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_posvbo_resource, posVbo, 
							   cudaGraphicsMapFlagsWriteDiscard));

        createVBO(&normalVbo, maxVerts*sizeof(float)*4);
        // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(normalVbo));
		cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_normalvbo_resource, normalVbo, 
							   cudaGraphicsMapFlagsWriteDiscard));

    }

    // allocate textures
	allocateTextures(	&d_edgeTable, &d_triTable, &d_numVertsTable );

    // allocate device memory
    unsigned int memSize = sizeof(uint) * numVoxels;
    cutilSafeCall(cudaMalloc((void**) &d_voxelVerts,            memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelVertsScan,        memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelOccupied,         memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelOccupiedScan,     memSize));
    cutilSafeCall(cudaMalloc((void**) &d_compVoxelArray,   memSize));

    // initialize CUDPP scan
    CUDPPConfiguration config;
    config.algorithm    = CUDPP_SCAN;
    config.datatype     = CUDPP_UINT;
    config.op           = CUDPP_ADD;
    config.options      = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
    cudppPlan(&scanplan, config, numVoxels, 1, 0);
}
Exemplo n.º 7
0
  void IntIntSorter::executeOnGPUAsync(void * const keys, void * const vals, const int numKeys, int & numUniqueKeys, int ** keyOffsets, int ** valOffsets, int ** numVals)
  {
    if (numKeys == 0)
    {
      numUniqueKeys = 0;
      *keyOffsets = *valOffsets = NULL;
      *numVals = NULL;
      return;
    }
    if (numKeys > 32 * 1048576)
    {
      executeOnCPUAsync(keys, vals, numKeys, numUniqueKeys, keyOffsets, valOffsets, numVals);
      return;
    }
    int commRank;
    MPI_Comm_rank(MPI_COMM_WORLD, &commRank);
    CUDPPConfiguration cudppConfig;
    CUDPPHandle planHandle;

    void * gpuInputKeys   = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuInputVals   = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuUniqueFlags = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuValOffsets  = cudacpp::Runtime::malloc(sizeof(int) * numKeys);

    cudacpp::Runtime::memcpyHtoD(gpuInputKeys, keys, sizeof(int) * numKeys);
    cudacpp::Runtime::memcpyHtoD(gpuInputVals, vals, sizeof(int) * numKeys);

    /*
      what we need to get out of here:

      1 - sorted keys and values
      2 - num unique keys
      3 - number of values for each key
      4 - value offsets
      5 - compacted keys

      to get:
          simply sort
          A = find unique values
          B = reverse exclusive scan of "A"
          C = if A[i] == 1
                C[B[0] - B[i]] = i
          D = [0] = C[0] + 1
              [N] = #keys - C[#keys - 1]
              [i] = C[i + 1] - C[i]
          E = forward exclusive scan D
          F = keys[E[i]]

      1 = result of sort (only copy the values)
      2 = B[0]
      3 = D
      4 = E
      5 = F
    */

    // 1
    cudppConfig.algorithm  = CUDPP_SORT_RADIX;
    cudppConfig.op         = CUDPP_ADD; // ignored
    cudppConfig.datatype   = CUDPP_UINT;
    cudppConfig.options    = CUDPP_OPTION_KEY_VALUE_PAIRS;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppSort(planHandle, gpuInputKeys, gpuInputVals, sizeof(int) * 8, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    // cudacpp::Runtime::memcpyDtoH(keys, gpuInputKeys, sizeof(int) * numKeys);
    cudacpp::Runtime::memcpyDtoH(vals, gpuInputVals, sizeof(int) * numKeys);

    // 2 - A = gpuUniqueFlags
    gpmrIntIntSorterMarkUnique(gpuInputKeys, gpuUniqueFlags, numKeys);

    // 2 - B = gpuValOffsets
    cudppConfig.algorithm  = CUDPP_SCAN;
    cudppConfig.op         = CUDPP_ADD; // ignored
    cudppConfig.datatype   = CUDPP_INT;
    cudppConfig.options    = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_BACKWARD;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppScan(planHandle, gpuValOffsets, gpuUniqueFlags, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    cudacpp::Runtime::memcpyDtoH(&numUniqueKeys, gpuValOffsets, sizeof(int));
    ++numUniqueKeys;

    // 2 - C = gpuInputVals and
    // 3 - D = gpuValOffsets
    cudacpp::Runtime::sync();
    gpmrIntIntSorterFindOffsets(gpuInputKeys, gpuUniqueFlags, gpuValOffsets, gpuInputVals, gpuValOffsets, numKeys, numUniqueKeys);
    *numVals = reinterpret_cast<int * >(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int)));
    cudacpp::Runtime::sync();
    cudacpp::Runtime::memcpyDtoH(*numVals, gpuValOffsets, sizeof(int) * numUniqueKeys);
    cudacpp::Runtime::sync();

    // 4 - E = gpuUniqueFlags
    cudppConfig.algorithm  = CUDPP_SCAN;
    cudppConfig.op         = CUDPP_ADD; // ignored
    cudppConfig.datatype   = CUDPP_INT;
    cudppConfig.options    = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_FORWARD;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppScan(planHandle, gpuUniqueFlags, gpuValOffsets, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    *valOffsets = reinterpret_cast<int * >(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int)));
    cudacpp::Runtime::memcpyDtoH(*valOffsets, gpuUniqueFlags, sizeof(int) * numUniqueKeys);

    // 4 - F = gpuInputVals
    gpmrIntIntSorterSetCompactedKeys(gpuInputKeys, gpuUniqueFlags, gpuInputVals, numUniqueKeys);
    cudacpp::Runtime::memcpyDtoH(keys, gpuInputVals, sizeof(int) * numUniqueKeys);

    cudacpp::Runtime::free(gpuInputKeys);
    cudacpp::Runtime::free(gpuInputVals);
    cudacpp::Runtime::free(gpuUniqueFlags);
    cudacpp::Runtime::free(gpuValOffsets);
  }
Exemplo n.º 8
0
bool MultivalueHashTable::Build(const unsigned  n,
                                const unsigned *d_keys,
                                const unsigned *d_vals)
{
    CUDA_CHECK_ERROR("Failed before build.");

    unsigned *d_sorted_keys = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n, 
                              cudaMemcpyDeviceToDevice));

    unsigned *d_sorted_vals = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n, 
                              cudaMemcpyDeviceToDevice));
    CUDA_CHECK_ERROR("Failed to allocate.");

    CUDPPConfiguration sort_config;
    sort_config.algorithm = CUDPP_SORT_RADIX;                  
    sort_config.datatype = CUDPP_UINT;
    sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;

    CUDPPHandle sort_plan;
    CUDPPResult sort_result = cudppPlan(theCudpp, &sort_plan, sort_config, n,
                                        1, 0);
    cudppRadixSort(sort_plan, d_sorted_keys, (void*)d_sorted_vals, n);

    if (sort_result != CUDPP_SUCCESS)
    {
        printf("Error in plan creation in MultivalueHashTable::build\n");
        bool retval = false;
        cudppDestroyPlan(sort_plan);
        return retval;
    }
    CUDA_CHECK_ERROR("Failed to sort");

    // Find the first key-value pair for each key.
    CUDAWrapper::CallCheckIfUnique(d_sorted_keys, n, d_scratch_is_unique_);

    // Assign a unique index from 0 to k-1 for each of the keys.
    cudppScan(scanplan_, d_scratch_offsets_, d_scratch_is_unique_, n);
    CUDA_CHECK_ERROR("Failed to scan");

    // Check how many unique keys were found.
    unsigned num_unique_keys;
    CUDA_SAFE_CALL(cudaMemcpy(&num_unique_keys, d_scratch_offsets_ + n - 1,
                              sizeof(unsigned), cudaMemcpyDeviceToHost));
    CUDA_CHECK_ERROR("Failed to get # unique keys");

    // Keep a list of the unique keys, and store info on each key's data
    // (location in the values array, how many there are).
    unsigned *d_compacted_keys = NULL;
    uint2 *d_index_counts_tmp = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_compacted_keys,
                              sizeof(unsigned) * num_unique_keys));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_index_counts_tmp,
                              sizeof(uint2) * num_unique_keys));
    CUDAWrapper::CallCompactKeys(d_sorted_keys,
                                 d_scratch_is_unique_,
                                 d_scratch_offsets_,
                                 n,
                                 d_index_counts_tmp,
                                 d_compacted_keys);

    // Determine the counts.
    CUDAWrapper::CallCountValues(d_index_counts_tmp, n, num_unique_keys);

    // Reinitialize the cuckoo hash table using the information we discovered.
    HashTable::Initialize(num_unique_keys,
                          target_space_usage_,
                          num_hash_functions_);

    d_index_counts_  = d_index_counts_tmp;
    d_unique_keys_   = d_compacted_keys;
    d_sorted_values_ = d_sorted_vals;
    sorted_values_size_ = n;

    // Build the cuckoo hash table with each key assigned a unique index.
    // Re-uses the sorted key memory as an array of values from 0 to k-1.
    CUDAWrapper::CallPrepareIndices(num_unique_keys, d_sorted_keys);
    bool success = HashTable::Build(num_unique_keys, d_unique_keys_,
                                    d_sorted_keys);
    CUDA_SAFE_CALL(cudaFree(d_sorted_keys));
    return success;
}