Example #1
0
void CMarchingCubes::ComputeIsosurface(ElemType* _pFval, ElemType _isoValue, RenderData* _pRender)
{
	int threads = 128;
	dim3 grid(m_NumVoxels / threads, 1, 1);
	// get around maximum grid size of 65535 in each dimension
	if (grid.x > 65535) {
		grid.y = grid.x / 32768;
		grid.x = 32768;
	}

	uint totalVerts = 0;
	int size = m_GridSize.x * m_GridSize.y * m_GridSize.z * sizeof(float);
	//////////////////////////////////////////////////////////////////////////
	int len = m_GridSize.x * m_GridSize.y * m_GridSize.z;
	float *pFvalTemp = new float[len];
	for (int i = 0; i < len; i++)
	{
		pFvalTemp[i] = _pFval[i];
	}
	//////////////////////////////////////////////////////////////////////////
	float* pdVolumeFval;				// ¶¥µãº¯ÊýÖµÎÆÀí(n¡¡Surface)
	cutilSafeCall(cudaMalloc((void**) &pdVolumeFval, size));
	cutilSafeCall(cudaMemcpy(pdVolumeFval, pFvalTemp, size, cudaMemcpyHostToDevice) );
	bindVolumeValTexture(pdVolumeFval);
	delete []pFvalTemp;

	// calculate number of vertices need per voxel
	launch_classifyVoxel(grid, threads, 
		m_pdVoxelVerts, m_pdVoxelOccupied, pdVolumeFval,
		m_GridSize,	m_NumVoxels, _isoValue);

#if DEBUG_BUFFERS
	printf("voxelVerts:\n");
	dumpBuffer(m_pdVoxelVerts, m_NumVoxels);
#endif

#if SKIP_EMPTY_VOXELS

	// scan voxel occupied array
	cudppScan(m_Scanplan, m_pdVoxelOccupiedScan, m_pdVoxelOccupied, m_NumVoxels);

#if DEBUG_BUFFERS
	printf("voxelOccupiedScan:\n");
	dumpBuffer(m_pdVoxelOccupiedScan, m_NumVoxels);
#endif

	// read back values to calculate total number of non-empty voxels
	// since we are using an exclusive scan, the total is the last value of
	// the scan result plus the last value in the input array
	{
		uint lastElement, lastScanElement;
		cutilSafeCall(cudaMemcpy((void *) &lastElement, 
			(void *) (m_pdVoxelOccupied + m_NumVoxels - 1), 
			sizeof(uint), cudaMemcpyDeviceToHost));
		cutilSafeCall(cudaMemcpy((void *) &lastScanElement, 
			(void *) (m_pdVoxelOccupiedScan + m_NumVoxels - 1), 
			sizeof(uint), cudaMemcpyDeviceToHost));
		m_ActiveVoxels = lastElement + lastScanElement;
	}

	if (0 == m_ActiveVoxels) {
		// return if there are no full voxels
		totalVerts = 0;
		return;
	}

	// compact voxel index array
	launch_compactVoxels(grid, threads, m_pdCompactedVoxelArray, m_pdVoxelOccupied, m_pdVoxelOccupiedScan, m_NumVoxels);
	cutilCheckMsg("compactVoxels failed");

#endif // SKIP_EMPTY_VOXELS

	// scan voxel vertex count array
	cudppScan(m_Scanplan, m_pdVoxelVertsScan, m_pdVoxelVerts, m_NumVoxels);
#if DEBUG_BUFFERS
	printf("voxelVertsScan:\n");
	dumpBuffer(m_pdVoxelVertsScan, m_NumVoxels);
#endif

	// readback total number of vertices
	{
		uint lastElement, lastScanElement;
		cutilSafeCall(cudaMemcpy((void *) &lastElement, 
			(void *) (m_pdVoxelVerts + m_NumVoxels - 1), 
			sizeof(uint), cudaMemcpyDeviceToHost));
		cutilSafeCall(cudaMemcpy((void *) &lastScanElement, 
			(void *) (m_pdVoxelVertsScan + m_NumVoxels - 1), 
			sizeof(uint), cudaMemcpyDeviceToHost));
		totalVerts = lastElement + lastScanElement;
	}

	// create VBOs
	GLuint	posVbo, normalVbo;
	createVBO(&posVbo, totalVerts * sizeof(float) * 4);
	cutilSafeCall(cudaGLRegisterBufferObject(posVbo));
	createVBO(&normalVbo, totalVerts * sizeof(float) * 4);
	cutilSafeCall(cudaGLRegisterBufferObject(normalVbo));

	// generate triangles, writing to vertex buffers
	float4 *d_pos = 0, *d_normal = 0;
	cutilSafeCall(cudaGLMapBufferObject((void**)&d_pos, posVbo));
	cutilSafeCall(cudaGLMapBufferObject((void**)&d_normal, normalVbo));

#if SKIP_EMPTY_VOXELS
	dim3 grid2((int) ceil(m_ActiveVoxels / (float) NTHREADS), 1, 1);
#else
	dim3 grid2((int) ceil(m_NumVoxels / (float) NTHREADS), 1, 1);
#endif
	while(grid2.x > 65535) {
		grid2.x/=2;
		grid2.y*=2;
	}

	launch_generateTriangles(grid2, NTHREADS, d_pos, d_normal, 
		m_pdCompactedVoxelArray, m_pdVoxelVertsScan,
		m_pdVolume, pdVolumeFval,
		m_GridSize, _isoValue,
		m_ActiveVoxels, m_MaxVerts);

	cutilSafeCall(cudaGLUnmapBufferObject(normalVbo));
	cutilSafeCall(cudaGLUnmapBufferObject(posVbo));

	_pRender->posVbo = posVbo;
	_pRender->normalVbo = normalVbo;
	_pRender->totalVerts = totalVerts;
	cutilSafeCall(cudaFree(pdVolumeFval));
}
Example #2
0
bool CompactingHashTable::Build(const unsigned  n,
                                const unsigned *d_keys,
                                const unsigned *d_values)
{
    CUDA_CHECK_ERROR("Failed before attempting to build.");

    unsigned num_failures = 1;
    unsigned num_attempts = 0;
    unsigned max_iterations = ComputeMaxIterations(n, table_size_,
                                                   num_hash_functions_);
    unsigned total_table_size = table_size_ + kStashSize;

    while (num_failures && ++num_attempts < kMaxRestartAttempts) {
        if (num_hash_functions_ == 2)
            constants_2_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 3)
            constants_3_.Generate(n, d_keys, table_size_);
        else if (num_hash_functions_ == 4)
            constants_4_.Generate(n, d_keys, table_size_);
        else
            constants_5_.Generate(n, d_keys, table_size_);

        // Initialize the cuckoo hash table.
        CUDAWrapper::ClearTable(total_table_size,
                                kKeyEmpty,
                                d_scratch_cuckoo_keys_);

        num_failures = 0;
        cudaMemcpy(d_failures_,
                   &num_failures,
                   sizeof(unsigned),
                   cudaMemcpyHostToDevice);

        unsigned *d_stash_count = NULL;
        cudaMalloc((void**)&d_stash_count, sizeof(unsigned));
        cudaMemset(d_stash_count, 0, sizeof(unsigned));

        CUDAWrapper::CallHashBuildCompacting(n, 
                                             num_hash_functions_,
                                             d_keys,
                                             table_size_,
                                             constants_2_,
                                             constants_3_,
                                             constants_4_,
                                             constants_5_,
                                             stash_constants_,
                                             max_iterations,
                                             d_scratch_cuckoo_keys_,
                                             d_stash_count,
                                             d_failures_);

        CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, 
                                  sizeof(unsigned), cudaMemcpyDeviceToHost));
        if (stash_count_) {
            char buffer[256];
            sprintf(buffer, "Stash count: %u", stash_count_);
            PrintMessage(buffer, true);
        }
        CUDA_SAFE_CALL(cudaFree(d_stash_count));

        CUDA_CHECK_ERROR("!!! Failed to cuckoo hash.\n");

        CUDA_SAFE_CALL(cudaMemcpy(&num_failures,
                                  d_failures_,
                                  sizeof(unsigned),
                                  cudaMemcpyDeviceToHost));

#ifdef COUNT_UNINSERTED
        if (num_failures > 0) {
            char buffer[256];
            sprintf(buffer, "Num failures: %u", num_failures);
            PrintMessage(buffer, true);
        }
#endif
    }

    if (num_attempts >= kMaxRestartAttempts) {
        PrintMessage("Completely failed to build.", true);
        return false;
    } else if (num_attempts > 1) {
        char buffer[256];
        sprintf(buffer, "Needed %u attempts", num_attempts);
        PrintMessage(buffer);
    }

    if (num_failures == 0) {
        CUDAWrapper::CallHashRemoveDuplicates(num_hash_functions_,
                                              table_size_,
                                              total_table_size,
                                              constants_2_,
                                              constants_3_,
                                              constants_4_,
                                              constants_5_,
                                              stash_constants_,
                                              d_scratch_cuckoo_keys_,
                                              d_scratch_counts_);

        // Do a prefix-sum over the values to assign each key a unique index.
        CUDPPConfiguration config;
        config.op        = CUDPP_ADD;
        config.datatype  = CUDPP_UINT;
        config.algorithm = CUDPP_SCAN;
        config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;

        CUDPPResult result = cudppPlan(theCudpp, &scanplan_, config, 
                                       total_table_size, 1, 0);
        if (CUDPP_SUCCESS == result) {
            cudppScan(scanplan_, d_scratch_unique_ids_, d_scratch_counts_,
                      total_table_size);
        } else {
            PrintMessage("!!! Failed to create plan.", true);
        }
        CUDA_CHECK_ERROR("!!! Scan failed.\n");

        // Determine how many unique values there are.
        CUDA_SAFE_CALL(cudaMemcpy(&unique_keys_size_,
                                  d_scratch_unique_ids_ + total_table_size - 1,
                                  sizeof(unsigned),
                                  cudaMemcpyDeviceToHost));

        // Copy the unique indices back and compact down the neighbors.
        CUDA_SAFE_CALL(cudaMalloc((void**)&d_unique_keys_,
                                  sizeof(unsigned) * unique_keys_size_));
        CUDA_SAFE_CALL(cudaMemset(d_unique_keys_,
                                  0xff,
                                  sizeof(unsigned) * unique_keys_size_));
        CUDAWrapper::CallHashCompactDown(total_table_size, 
                                         d_contents_, 
                                         d_unique_keys_, 
                                         d_scratch_cuckoo_keys_, 
                                         d_scratch_unique_ids_);
    }

    CUDA_CHECK_ERROR("Error occurred during hash table build.\n");

    return true;
}
Example #3
0
  void IntIntSorter::executeOnGPUAsync(void * const keys, void * const vals, const int numKeys, int & numUniqueKeys, int ** keyOffsets, int ** valOffsets, int ** numVals)
  {
    if (numKeys == 0)
    {
      numUniqueKeys = 0;
      *keyOffsets = *valOffsets = NULL;
      *numVals = NULL;
      return;
    }
    if (numKeys > 32 * 1048576)
    {
      executeOnCPUAsync(keys, vals, numKeys, numUniqueKeys, keyOffsets, valOffsets, numVals);
      return;
    }
    int commRank;
    MPI_Comm_rank(MPI_COMM_WORLD, &commRank);
    CUDPPConfiguration cudppConfig;
    CUDPPHandle planHandle;

    void * gpuInputKeys   = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuInputVals   = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuUniqueFlags = cudacpp::Runtime::malloc(sizeof(int) * numKeys);
    void * gpuValOffsets  = cudacpp::Runtime::malloc(sizeof(int) * numKeys);

    cudacpp::Runtime::memcpyHtoD(gpuInputKeys, keys, sizeof(int) * numKeys);
    cudacpp::Runtime::memcpyHtoD(gpuInputVals, vals, sizeof(int) * numKeys);

    /*
      what we need to get out of here:

      1 - sorted keys and values
      2 - num unique keys
      3 - number of values for each key
      4 - value offsets
      5 - compacted keys

      to get:
          simply sort
          A = find unique values
          B = reverse exclusive scan of "A"
          C = if A[i] == 1
                C[B[0] - B[i]] = i
          D = [0] = C[0] + 1
              [N] = #keys - C[#keys - 1]
              [i] = C[i + 1] - C[i]
          E = forward exclusive scan D
          F = keys[E[i]]

      1 = result of sort (only copy the values)
      2 = B[0]
      3 = D
      4 = E
      5 = F
    */

    // 1
    cudppConfig.algorithm  = CUDPP_SORT_RADIX;
    cudppConfig.op         = CUDPP_ADD; // ignored
    cudppConfig.datatype   = CUDPP_UINT;
    cudppConfig.options    = CUDPP_OPTION_KEY_VALUE_PAIRS;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppSort(planHandle, gpuInputKeys, gpuInputVals, sizeof(int) * 8, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    // cudacpp::Runtime::memcpyDtoH(keys, gpuInputKeys, sizeof(int) * numKeys);
    cudacpp::Runtime::memcpyDtoH(vals, gpuInputVals, sizeof(int) * numKeys);

    // 2 - A = gpuUniqueFlags
    gpmrIntIntSorterMarkUnique(gpuInputKeys, gpuUniqueFlags, numKeys);

    // 2 - B = gpuValOffsets
    cudppConfig.algorithm  = CUDPP_SCAN;
    cudppConfig.op         = CUDPP_ADD; // ignored
    cudppConfig.datatype   = CUDPP_INT;
    cudppConfig.options    = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_BACKWARD;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppScan(planHandle, gpuValOffsets, gpuUniqueFlags, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    cudacpp::Runtime::memcpyDtoH(&numUniqueKeys, gpuValOffsets, sizeof(int));
    ++numUniqueKeys;

    // 2 - C = gpuInputVals and
    // 3 - D = gpuValOffsets
    cudacpp::Runtime::sync();
    gpmrIntIntSorterFindOffsets(gpuInputKeys, gpuUniqueFlags, gpuValOffsets, gpuInputVals, gpuValOffsets, numKeys, numUniqueKeys);
    *numVals = reinterpret_cast<int * >(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int)));
    cudacpp::Runtime::sync();
    cudacpp::Runtime::memcpyDtoH(*numVals, gpuValOffsets, sizeof(int) * numUniqueKeys);
    cudacpp::Runtime::sync();

    // 4 - E = gpuUniqueFlags
    cudppConfig.algorithm  = CUDPP_SCAN;
    cudppConfig.op         = CUDPP_ADD; // ignored
    cudppConfig.datatype   = CUDPP_INT;
    cudppConfig.options    = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_FORWARD;
    cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int));
    cudppScan(planHandle, gpuUniqueFlags, gpuValOffsets, numKeys);
    cudppDestroyPlan(planHandle);
    cudacpp::Runtime::sync();
    *valOffsets = reinterpret_cast<int * >(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int)));
    cudacpp::Runtime::memcpyDtoH(*valOffsets, gpuUniqueFlags, sizeof(int) * numUniqueKeys);

    // 4 - F = gpuInputVals
    gpmrIntIntSorterSetCompactedKeys(gpuInputKeys, gpuUniqueFlags, gpuInputVals, numUniqueKeys);
    cudacpp::Runtime::memcpyDtoH(keys, gpuInputVals, sizeof(int) * numUniqueKeys);

    cudacpp::Runtime::free(gpuInputKeys);
    cudacpp::Runtime::free(gpuInputVals);
    cudacpp::Runtime::free(gpuUniqueFlags);
    cudacpp::Runtime::free(gpuValOffsets);
  }
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void
computeIsosurface()
{
    int threads = 128;
    dim3 grid(numVoxels / threads, 1, 1);
    // get around maximum grid size of 65535 in each dimension
    if (grid.x > 65535) {
        grid.y = grid.x / 32768;
        grid.x = 32768;
    }

    // calculate number of vertices need per voxel
    launch_classifyVoxel(grid, threads, 
						d_voxelVerts, d_voxelOccupied, d_volume, 
						gridSize, gridSizeShift, gridSizeMask, 
						numVoxels, voxelSize, isoValue);
#if DEBUG_BUFFERS
    printf("voxelVerts:\n");
    dumpBuffer(d_voxelVerts, numVoxels, sizeof(uint));
#endif

#if SKIP_EMPTY_VOXELS
    // scan voxel occupied array
    cudppScan(scanplan, d_voxelOccupiedScan, d_voxelOccupied, numVoxels);
#if DEBUG_BUFFERS
    printf("voxelOccupiedScan:\n");
    dumpBuffer(d_voxelOccupiedScan, numVoxels, sizeof(uint));
#endif

    // read back values to calculate total number of non-empty voxels
    // since we are using an exclusive scan, the total is the last value of
    // the scan result plus the last value in the input array
    {
        uint lastElement, lastScanElement;
        cutilSafeCall(cudaMemcpy((void *) &lastElement, 
                       (void *) (d_voxelOccupied + numVoxels-1), 
                       sizeof(uint), cudaMemcpyDeviceToHost));
        cutilSafeCall(cudaMemcpy((void *) &lastScanElement, 
                       (void *) (d_voxelOccupiedScan + numVoxels-1), 
                       sizeof(uint), cudaMemcpyDeviceToHost));
        activeVoxels = lastElement + lastScanElement;
    }

    if (activeVoxels==0) {
        // return if there are no full voxels
        totalVerts = 0;
        return;
    }

    // compact voxel index array
    launch_compactVoxels(grid, threads, d_compVoxelArray, d_voxelOccupied, d_voxelOccupiedScan, numVoxels);
    cutilCheckMsg("compactVoxels failed");

#endif // SKIP_EMPTY_VOXELS

    // scan voxel vertex count array
    cudppScan(scanplan, d_voxelVertsScan, d_voxelVerts, numVoxels);
#if DEBUG_BUFFERS
    printf("voxelVertsScan:\n");
    dumpBuffer(d_voxelVertsScan, numVoxels, sizeof(uint));
#endif

    // readback total number of vertices
    {
        uint lastElement, lastScanElement;
        cutilSafeCall(cudaMemcpy((void *) &lastElement, 
                       (void *) (d_voxelVerts + numVoxels-1), 
                       sizeof(uint), cudaMemcpyDeviceToHost));
        cutilSafeCall(cudaMemcpy((void *) &lastScanElement, 
                       (void *) (d_voxelVertsScan + numVoxels-1), 
                       sizeof(uint), cudaMemcpyDeviceToHost));
        totalVerts = lastElement + lastScanElement;
    }

    // generate triangles, writing to vertex buffers
    if (!g_bQAReadback) {
		size_t num_bytes;
	    // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_pos, posVbo));
	    cutilSafeCall(cudaGraphicsMapResources(1, &cuda_posvbo_resource, 0));
	    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_pos, &num_bytes, cuda_posvbo_resource));

	    // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_normal, normalVbo));
	    cutilSafeCall(cudaGraphicsMapResources(1, &cuda_normalvbo_resource, 0));
	    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_normal, &num_bytes, cuda_normalvbo_resource));
    }

#if SKIP_EMPTY_VOXELS
    dim3 grid2((int) ceil(activeVoxels / (float) NTHREADS), 1, 1);
#else
    dim3 grid2((int) ceil(numVoxels / (float) NTHREADS), 1, 1);
#endif
    while(grid2.x > 65535) {
        grid2.x/=2;
        grid2.y*=2;
    }
#if SAMPLE_VOLUME
    launch_generateTriangles2(grid2, NTHREADS, d_pos, d_normal, 
                                            d_compVoxelArray, 
                                            d_voxelVertsScan, d_volume, 
                                            gridSize, gridSizeShift, gridSizeMask, 
                                            voxelSize, isoValue, activeVoxels, 
                                            maxVerts);
#else
    launch_generateTriangles(grid2, NTHREADS, d_pos, d_normal, 
                                           d_compVoxelArray, 
                                           d_voxelVertsScan, 
                                           gridSize, gridSizeShift, gridSizeMask, 
                                           voxelSize, isoValue, activeVoxels, 
                                           maxVerts);
#endif

    if (!g_bQAReadback) {
        // DEPRECATED: 		cutilSafeCall(cudaGLUnmapBufferObject(normalVbo));
		cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_normalvbo_resource, 0));
        // DEPRECATED: 		cutilSafeCall(cudaGLUnmapBufferObject(posVbo));
		cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_posvbo_resource, 0));
    }
}
Example #5
0
bool MultivalueHashTable::Build(const unsigned  n,
                                const unsigned *d_keys,
                                const unsigned *d_vals)
{
    CUDA_CHECK_ERROR("Failed before build.");

    unsigned *d_sorted_keys = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n, 
                              cudaMemcpyDeviceToDevice));

    unsigned *d_sorted_vals = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));
    CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n, 
                              cudaMemcpyDeviceToDevice));
    CUDA_CHECK_ERROR("Failed to allocate.");

    CUDPPConfiguration sort_config;
    sort_config.algorithm = CUDPP_SORT_RADIX;                  
    sort_config.datatype = CUDPP_UINT;
    sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;

    CUDPPHandle sort_plan;
    CUDPPResult sort_result = cudppPlan(theCudpp, &sort_plan, sort_config, n,
                                        1, 0);
    cudppRadixSort(sort_plan, d_sorted_keys, (void*)d_sorted_vals, n);

    if (sort_result != CUDPP_SUCCESS)
    {
        printf("Error in plan creation in MultivalueHashTable::build\n");
        bool retval = false;
        cudppDestroyPlan(sort_plan);
        return retval;
    }
    CUDA_CHECK_ERROR("Failed to sort");

    // Find the first key-value pair for each key.
    CUDAWrapper::CallCheckIfUnique(d_sorted_keys, n, d_scratch_is_unique_);

    // Assign a unique index from 0 to k-1 for each of the keys.
    cudppScan(scanplan_, d_scratch_offsets_, d_scratch_is_unique_, n);
    CUDA_CHECK_ERROR("Failed to scan");

    // Check how many unique keys were found.
    unsigned num_unique_keys;
    CUDA_SAFE_CALL(cudaMemcpy(&num_unique_keys, d_scratch_offsets_ + n - 1,
                              sizeof(unsigned), cudaMemcpyDeviceToHost));
    CUDA_CHECK_ERROR("Failed to get # unique keys");

    // Keep a list of the unique keys, and store info on each key's data
    // (location in the values array, how many there are).
    unsigned *d_compacted_keys = NULL;
    uint2 *d_index_counts_tmp = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_compacted_keys,
                              sizeof(unsigned) * num_unique_keys));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_index_counts_tmp,
                              sizeof(uint2) * num_unique_keys));
    CUDAWrapper::CallCompactKeys(d_sorted_keys,
                                 d_scratch_is_unique_,
                                 d_scratch_offsets_,
                                 n,
                                 d_index_counts_tmp,
                                 d_compacted_keys);

    // Determine the counts.
    CUDAWrapper::CallCountValues(d_index_counts_tmp, n, num_unique_keys);

    // Reinitialize the cuckoo hash table using the information we discovered.
    HashTable::Initialize(num_unique_keys,
                          target_space_usage_,
                          num_hash_functions_);

    d_index_counts_  = d_index_counts_tmp;
    d_unique_keys_   = d_compacted_keys;
    d_sorted_values_ = d_sorted_vals;
    sorted_values_size_ = n;

    // Build the cuckoo hash table with each key assigned a unique index.
    // Re-uses the sorted key memory as an array of values from 0 to k-1.
    CUDAWrapper::CallPrepareIndices(num_unique_keys, d_sorted_keys);
    bool success = HashTable::Build(num_unique_keys, d_unique_keys_,
                                    d_sorted_keys);
    CUDA_SAFE_CALL(cudaFree(d_sorted_keys));
    return success;
}