void bitonicSort(btHashPosKey* pData, int lo, int n, bool dir)
			{
				if(n > 1)
				{
					int m = n / 2;
					bitonicSort(pData, lo, m, !dir);
					bitonicSort(pData, lo + m, n - m, dir);
					bitonicMerge(pData, lo, n, dir);
				}
			}
Ejemplo n.º 2
0
/** Procedure bitonicSort first produces a bitonic sequence by
 * recursively sorting its two halves in opposite directions, and then
 * calls bitonicMerge.
 **/
void bitonicSort(int lo, int cnt, int dir)
{
  if (cnt>1)
    {
      int k=cnt/2;
      bitonicSort(lo, k, ASCENDING);
      bitonicSort(lo+k, k, DESCENDING);
      bitonicMerge(lo, cnt, dir);
    }
}
Ejemplo n.º 3
0
/** Procedure bitonicSort first produces a bitonic sequence by
 * recursively sorting its two halves in opposite directions, and then
 * calls bitonicMerge.
 **/
void bitonicSort(int lo, int cnt, int dir)
{
  int k = cnt;
  k /= 2;
  _Pragma( "marker recMerge" )

  if (cnt > 1) {
    bitonicSort(lo, k, ASCENDING);
    bitonicSort(lo + k, k, DESCENDING);
  }

  bitonicMerge(lo, cnt, dir);
  _Pragma( "flowrestriction 1*bitonicMerge <= 31*recMerge" )

  return;
}
Ejemplo n.º 4
0
int main(void)
{
  /** Initialize array "a" with data **/
  int i;
  
  _Pragma( "loopbound min 32 max 32" )
  for (i = 0; i < 32; i++) {
    a[i] = (32 - i);
  }
  
  /** When called with parameters lo = 0, cnt = a.length() and dir =
  * ASCENDING, procedure bitonicSort sorts the whole array a. **/
  _Pragma( "marker recSort" )
  bitonicSort(0, 32, ASCENDING);
  _Pragma( "flowrestriction 1*bitonicSort <= 63*recSort" )

  /** Loop through array, printing out each element **/
  _Pragma( "loopbound min 32 max 32" )
  for (i = 0; i < 32; i++) {
  }
  
  return 0;
}
Ejemplo n.º 5
0
int main(int argc, char** argv) {

	uint num_of_segments;
	uint num_of_elements;
	uint i;

	scanf("%d", &num_of_segments);
	uint mem_size_seg = sizeof(int) * (num_of_segments + 1);
	uint *h_seg = (uint *) malloc(mem_size_seg);
	uint diffprev, diffcur;
	for (i = 0; i < num_of_segments + 1; i++) {
		scanf("%d", &h_seg[i]);

		if (i == 1)
			diffprev = h_seg[i] - h_seg[i - 1];
		else if (i > 1) {
			diffcur = h_seg[i] - h_seg[i - 1];
			if (diffcur != diffprev) {
				printf("Só funciona para segmentos de tamanhos iguais.\n");
				exit(1);
			}
		}

	}

	scanf("%d", &num_of_elements);
	uint mem_size_vec = sizeof(int) * num_of_elements;
	uint *h_vec = (uint *) malloc(mem_size_vec);
	uint *h_value = (uint *) malloc(mem_size_vec);
	for (i = 0; i < num_of_elements; i++) {
		scanf("%d", &h_vec[i]);
		h_value[i] = i;
	}

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	uint *d_value, *d_value_out, *d_vec, *d_vec_out;

	cudaTest(cudaMalloc((void **) &d_vec, mem_size_vec));
	cudaTest(cudaMalloc((void **) &d_value, mem_size_vec));
	cudaTest(cudaMalloc((void **) &d_vec_out, mem_size_vec));
	cudaTest(cudaMalloc((void **) &d_value_out, mem_size_vec));

	for (int i = 0; i < EXECUTIONS; i++) {
	cudaTest(cudaMemcpy(d_vec, h_vec, mem_size_vec, cudaMemcpyHostToDevice));
	cudaTest(
			cudaMemcpy(d_value, h_value, mem_size_vec, cudaMemcpyHostToDevice));

	cudaEventRecord(start);
	uint threadCount = 0;
	threadCount = bitonicSort(d_vec_out, d_value_out, d_vec, d_value,
			num_of_elements / diffcur, diffcur, 1);
	cudaEventRecord(stop);

	cudaError_t errSync = cudaGetLastError();
	cudaError_t errAsync = cudaDeviceSynchronize();
	if (errSync != cudaSuccess)
		printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
	if (errAsync != cudaSuccess)
		printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));

	if (ELAPSED_TIME == 1) {
		cudaEventSynchronize(stop);
		float milliseconds = 0;
		cudaEventElapsedTime(&milliseconds, start, stop);
		std::cout << milliseconds << "\n";
	}


	}
	cudaMemcpy(h_value, d_value_out, mem_size_vec, cudaMemcpyDeviceToHost);
	cudaMemcpy(h_vec, d_vec_out, mem_size_vec, cudaMemcpyDeviceToHost);

	if(ELAPSED_TIME != 1)
		print(h_vec, num_of_elements);

	free(h_vec);
	free(h_seg);
	free(h_value);

	cudaFree(d_vec);
	cudaFree(d_vec_out);
	cudaFree(d_value);
	cudaFree(d_value_out);
	cudaDeviceReset();

	return 0;
}
Ejemplo n.º 6
0
/** When called with parameters lo = 0, cnt = a.length() and dir =
 * ASCENDING, procedure bitonicSort sorts the whole array a.
 **/
void sort()
{
  bitonicSort(0, N, ASCENDING);
}
//Step the simulation
void ParticleSystem::update(float deltaTime){
    assert(m_bInitialized);

    setParameters(&m_params);
    setParametersHost(&m_params);

    //Download positions from VBO
    memHandle_t pos; 
    if (!m_bQATest)
    {
        glBindBufferARB(GL_ARRAY_BUFFER, m_posVbo);
        pos = (memHandle_t)glMapBufferARB(GL_ARRAY_BUFFER, GL_READ_WRITE);
        copyArrayToDevice(m_dPos, pos, 0, m_numParticles * 4 * sizeof(float));
    }

    integrateSystem(
        m_dPos,
        m_dVel,
        deltaTime,
        m_numParticles
    );

    calcHash(
        m_dHash,
        m_dIndex,
        m_dPos,
        m_numParticles
    );

    bitonicSort(NULL, m_dHash, m_dIndex, m_dHash, m_dIndex, 1, m_numParticles, 0);

    //Find start and end of each cell and
    //Reorder particle data for better cache coherency
    findCellBoundsAndReorder(
        m_dCellStart,
        m_dCellEnd,
        m_dReorderedPos,
        m_dReorderedVel,
        m_dHash,
        m_dIndex,
        m_dPos,
        m_dVel,
        m_numParticles,
        m_numGridCells
    );

    collide(
        m_dVel,
        m_dReorderedPos,
        m_dReorderedVel,
        m_dIndex,
        m_dCellStart,
        m_dCellEnd,
        m_numParticles,
        m_numGridCells
    );

    //Update buffers
    if (!m_bQATest)
    {
        copyArrayFromDevice(pos,m_dPos, 0, m_numParticles * 4 * sizeof(float));
        glUnmapBufferARB(GL_ARRAY_BUFFER);
    }
}
Ejemplo n.º 8
0
void BoidModelSHWay1::simulate(float dt){
	counter = !counter;

	cl_ulong startTime, endTime;
	//this will update our system by calculating new velocity and updating the positions of our particles
	//Make sure OpenGL is done using our VBOs
	glFinish();
	// map OpenGL buffer object for writing from OpenCL
	//this passes in the vector of VBO buffer objects (position and color)
	err = queue.enqueueAcquireGLObjects(&cl_pos_vbos, NULL, &event);
	err = queue.enqueueAcquireGLObjects(&cl_pos_vbos_out, NULL, &event);
	err = queue.enqueueAcquireGLObjects(&cl_vel_vbos, NULL, &event);
	err = queue.enqueueAcquireGLObjects(&cl_vel_vbos_out, NULL, &event);
	err = queue.enqueueAcquireGLObjects(&cl_color_vbos, NULL, &event);
	err = queue.enqueueAcquireGLObjects(&cl_color_vbos_out, NULL, &event);
	queue.finish();

	//Get grid hash value for every boid
	try
	{
		if (counter)
			err = kernel_getGridHash.setArg(0, cl_pos_vbos[0]);
		else
			err = kernel_getGridHash.setArg(0, cl_pos_vbos_out[0]); //position vbo

		err = kernel_getGridHash.setArg(1, cl_gridHash_unsorted);
		err = kernel_getGridHash.setArg(2, cl_gridIndex_unsorted);
		err = kernel_getGridHash.setArg(3, cl_simParams);
	}
	catch (cl::Error er) {
		log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
	}

	//std::vector<Vec4> test(num);
	//glGetBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(Vec4)* num, test.data());

	//create gridHash
	err = queue.enqueueNDRangeKernel(kernel_getGridHash, cl::NullRange, cl::NDRange(num), cl::NullRange, NULL, &event);
	queue.finish();

	event.wait();
	event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime);
	event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime);
	times[0] = (endTime - startTime) / 1000;

	//set start and end index to 0
	unsigned int val = 0;
	try
	{
		err = kernel_memSet.setArg(0, cl_gridStartIndex);
		err = kernel_memSet.setArg(1, val);
		err = kernel_memSet.setArg(2, simParams.numCells);
	}
	catch (cl::Error er) {
		log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
	}

	err = queue.enqueueNDRangeKernel(kernel_memSet, cl::NullRange, cl::NDRange(simParams.numCells), cl::NullRange, NULL, &event);
	queue.finish();

	try
	{
		err = kernel_memSet.setArg(0, cl_gridEndIndex);
		err = kernel_memSet.setArg(1, val);
		err = kernel_memSet.setArg(2, simParams.numCells);
	}
	catch (cl::Error er) {
		log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
	}

	err = queue.enqueueNDRangeKernel(kernel_memSet, cl::NullRange, cl::NDRange(simParams.numCells), cl::NullRange, NULL, &event);
	queue.finish();

	//sort gridHash
	bitonicSort(cl_gridHash_sorted, cl_gridIndex_sorted, cl_gridHash_unsorted, cl_gridIndex_unsorted, 1, simParams.numBodies, 0);
	queue.finish();



	//unsigned int E[NUM_BOIDS];
	//queue.enqueueReadBuffer(cl_gridHash_sorted, CL_TRUE, 0, (size_t)(NUM_BOIDS * sizeof(unsigned int)), &E);
	//queue.finish();

	try
	{
		if (counter){
			err = kernel_findGridEdgeAndReorder.setArg(2, cl_pos_vbos_out[0]);	//pos out ordered
			err = kernel_findGridEdgeAndReorder.setArg(3, cl_vel_vbos_out[0]);	//vel out ordered
			err = kernel_findGridEdgeAndReorder.setArg(6, cl_pos_vbos[0]);		//pos in unordered
			err = kernel_findGridEdgeAndReorder.setArg(7, cl_vel_vbos[0]);		//vel in unordered
			err = kernel_findGridEdgeAndReorder.setArg(8, cl_goal_in);
			err = kernel_findGridEdgeAndReorder.setArg(9, cl_goal_out);
			err = kernel_findGridEdgeAndReorder.setArg(11, cl_color_vbos[0]);
			err = kernel_findGridEdgeAndReorder.setArg(10, cl_color_vbos_out[0]);
		}
		else {
			err = kernel_findGridEdgeAndReorder.setArg(6, cl_pos_vbos_out[0]);	//pos in
			err = kernel_findGridEdgeAndReorder.setArg(7, cl_vel_vbos_out[0]);	//vel in
			err = kernel_findGridEdgeAndReorder.setArg(2, cl_pos_vbos[0]);		//pos out
			err = kernel_findGridEdgeAndReorder.setArg(3, cl_vel_vbos[0]);		//vel out
			err = kernel_findGridEdgeAndReorder.setArg(9, cl_goal_in);
			err = kernel_findGridEdgeAndReorder.setArg(8, cl_goal_out);
			err = kernel_findGridEdgeAndReorder.setArg(10, cl_color_vbos[0]);
			err = kernel_findGridEdgeAndReorder.setArg(11, cl_color_vbos_out[0]);
		}

		err = kernel_findGridEdgeAndReorder.setArg(0, cl_gridStartIndex);
		err = kernel_findGridEdgeAndReorder.setArg(1, cl_gridEndIndex);
		err = kernel_findGridEdgeAndReorder.setArg(4, cl_gridHash_sorted);
		err = kernel_findGridEdgeAndReorder.setArg(5, cl_gridIndex_sorted);
		err = kernel_findGridEdgeAndReorder.setArg(12, cl::__local(sizeof(cl_uint)*(LOCAL_PREF + 1)));
		err = kernel_findGridEdgeAndReorder.setArg(13, num);
	}
	catch (cl::Error er) {
		log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
	}

	err = queue.enqueueNDRangeKernel(kernel_findGridEdgeAndReorder, cl::NullRange, cl::NDRange(num), cl::NDRange(LOCAL_PREF), NULL, &event);

	queue.finish();
	event.wait();
	event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime);
	event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime);
	times[2] = (endTime - startTime) / 1000000;

	try
	{
		if (counter)
			err = kernel_evalSH.setArg(0, cl_vel_vbos_out[0]);
		else
			err = kernel_evalSH.setArg(0, cl_vel_vbos[0]);

		err = kernel_evalSH.setArg(1, cl_shEvalX);
		err = kernel_evalSH.setArg(2, cl_shEvalY);
		err = kernel_evalSH.setArg(3, cl_shEvalZ);
		err = kernel_evalSH.setArg(4, cl_coef0X);
		err = kernel_evalSH.setArg(5, cl_coef0Y);
		err = kernel_evalSH.setArg(6, cl_coef0Z);
		err = kernel_evalSH.setArg(7, cl::__local(sizeof(cl_float)*(LOCAL_PREF)));
		err = kernel_evalSH.setArg(8, cl::__local(sizeof(cl_float)*(LOCAL_PREF)));
		err = kernel_evalSH.setArg(9, cl::__local(sizeof(cl_float)*(LOCAL_PREF)));
		err = kernel_evalSH.setArg(10, cl::__local(sizeof(cl_float8)*(LOCAL_PREF)));
		err = kernel_evalSH.setArg(11, cl::__local(sizeof(cl_float8)*(LOCAL_PREF)));
		err = kernel_evalSH.setArg(12, cl::__local(sizeof(cl_float8)*(LOCAL_PREF)));
		err = kernel_evalSH.setArg(13, cl_gridStartIndex);
		err = kernel_evalSH.setArg(14, cl_gridEndIndex);
	}
	catch (cl::Error er){
		log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
	}




	int localWorkSize = LOCAL_PREF;
	int globalWorkSize = simParams.numCells * LOCAL_PREF;
	err = queue.enqueueNDRangeKernel(kernel_evalSH, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, &event);

	event.wait();
	event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime);
	event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime);
	times[4] = (endTime - startTime) / 1000000;

	std::vector<Vec4> C(2 * simParams.numCells);
	queue.enqueueReadBuffer(cl_shEvalX, CL_TRUE, 0, (size_t)2 * simParams.numCells * sizeof(Vec4), C.data());
	queue.finish();

	std::vector<unsigned int> E(simParams.numCells);
	queue.enqueueReadBuffer(cl_gridStartIndex, CL_TRUE, 0, (size_t)(simParams.numCells * sizeof(unsigned int)), E.data());
	queue.finish();

	std::vector<unsigned int> F(simParams.numCells);
	queue.enqueueReadBuffer(cl_gridEndIndex, CL_TRUE, 0, (size_t)(simParams.numCells * sizeof(unsigned int)), F.data());
	queue.finish();

	try
	{
		if (counter){
			err = kernel_simulate.setArg(0, cl_pos_vbos_out[0]);	//pos in
			err = kernel_simulate.setArg(1, cl_pos_vbos[0]);		//pos out
			err = kernel_simulate.setArg(2, cl_vel_vbos_out[0]);	//vel in
			err = kernel_simulate.setArg(3, cl_vel_vbos[0]);		//vel out
			err = kernel_simulate.setArg(6, cl_goal_out);
		}
		else{
			err = kernel_simulate.setArg(1, cl_pos_vbos_out[0]);	//pos out
			err = kernel_simulate.setArg(0, cl_pos_vbos[0]);		//pos in
			err = kernel_simulate.setArg(3, cl_vel_vbos_out[0]);	//vel out
			err = kernel_simulate.setArg(2, cl_vel_vbos[0]);		//vel in
			err = kernel_simulate.setArg(6, cl_goal_in);
		}

		err = kernel_simulate.setArg(4, cl_gridStartIndex);
		err = kernel_simulate.setArg(5, cl_gridEndIndex);
		err = kernel_simulate.setArg(7, cl_simParams);
		err = kernel_simulate.setArg(8, cl_range);
		err = kernel_simulate.setArg(9, dt);
	}
	catch (cl::Error er){
		log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
	}

	queue.finish();


	/*
	unsigned int D[12500];
	queue.enqueueReadBuffer(cl_gridEndIndex, CL_TRUE, 0, (size_t)12500 * sizeof(unsigned int), &D);
	queue.finish();*/



	queue.finish();

	//globalWorkSize = LOCAL_PREF * (simParams.numCells);
	localWorkSize = LOCAL_PREF;
	globalWorkSize = simParams.numBodies;

	err = queue.enqueueNDRangeKernel(kernel_simulate, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, &eventSim);

	eventSim.wait();
	eventSim.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime);
	eventSim.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime);
	times[3] = (endTime - startTime) / 1000000;

	try
	{
		if (counter){
			err = kernel_useSH.setArg(0, cl_vel_vbos[0]);		//vel in
			err = kernel_useSH.setArg(1, cl_vel_vbos_out[0]);	//vel out
			err = kernel_useSH.setArg(8, cl_pos_vbos[0]);		//pos in	
			err = kernel_useSH.setArg(9, cl_pos_vbos_out[0]);	//pos out
		}
		else {
			err = kernel_useSH.setArg(1, cl_vel_vbos[0]);		//vel out
			err = kernel_useSH.setArg(0, cl_vel_vbos_out[0]);	//vel in
			err = kernel_useSH.setArg(9, cl_pos_vbos[0]);		//pos out
			err = kernel_useSH.setArg(8, cl_pos_vbos_out[0]);	//pos in
		}

		err = kernel_useSH.setArg(2, cl_gridStartIndex);
		err = kernel_useSH.setArg(3, cl_gridEndIndex);
		err = kernel_useSH.setArg(4, cl_shEvalX);
		err = kernel_useSH.setArg(5, cl_shEvalY);
		err = kernel_useSH.setArg(6, cl_shEvalZ);
		err = kernel_useSH.setArg(7, cl_simParams);
		err = kernel_useSH.setArg(10, cl::__local(sizeof(cl_float8)*(LOCAL_PREF)));
		err = kernel_useSH.setArg(11, cl::__local(sizeof(cl_float8)*(LOCAL_PREF)));
		err = kernel_useSH.setArg(12, cl::__local(sizeof(cl_float8)*(LOCAL_PREF)));
		err = kernel_useSH.setArg(13, cl_coef0X);
		err = kernel_useSH.setArg(14, cl_coef0Y);
		err = kernel_useSH.setArg(15, cl_coef0Z);
		err = kernel_useSH.setArg(16, cl::__local(sizeof(cl_float)*(LOCAL_PREF)));
		err = kernel_useSH.setArg(17, cl::__local(sizeof(cl_float)*(LOCAL_PREF)));
		err = kernel_useSH.setArg(18, cl::__local(sizeof(cl_float)*(LOCAL_PREF)));
		err = kernel_useSH.setArg(19, dt);
	}
	catch (cl::Error er){
		log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
	}

	localWorkSize = LOCAL_PREF;
	globalWorkSize = simParams.numBodies;
	err = queue.enqueueNDRangeKernel(kernel_useSH, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, &event);
	queue.finish();

	event.wait();
	event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime);
	event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime);
	times[5] = (endTime - startTime) / 1000000;

	/*
	unsigned int A[8000];
	queue.enqueueReadBuffer(cl_range, CL_TRUE, 0, (size_t)(8000 * sizeof(unsigned int)), &A);
	queue.finish();
	*/
	/*std::vector<Vec4> X(simParams.numCells);
	queue.enqueueReadBuffer(cl_sumVel, CL_TRUE, 0, (size_t)simParams.numCells * sizeof(Vec4), X.data());
	queue.finish();
	*/

	//Release the VBOs so OpenGL can play with them
	err = queue.enqueueReleaseGLObjects(&cl_pos_vbos, NULL, &event);
	err = queue.enqueueReleaseGLObjects(&cl_pos_vbos_out, NULL, &event);
	err = queue.enqueueReleaseGLObjects(&cl_vel_vbos, NULL, &event);
	err = queue.enqueueReleaseGLObjects(&cl_vel_vbos_out, NULL, &event);
	err = queue.enqueueReleaseGLObjects(&cl_color_vbos, NULL, &event);
	err = queue.enqueueReleaseGLObjects(&cl_color_vbos_out, NULL, &event);
}
Ejemplo n.º 9
0
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cudaError_t error;
    printf("%s Starting...\n\n", argv[0]);

    printf("Starting up CUDA context...\n");
    int dev = findCudaDevice(argc, (const char **)argv);

    uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU;
    uint *d_InputKey, *d_InputVal,    *d_OutputKey,    *d_OutputVal;
    StopWatchInterface *hTimer = NULL;

    const uint             N = 1048576;
    const uint           DIR = 0;
    const uint     numValues = 65536;
    const uint numIterations = 1;

    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_InputKey     = (uint *)malloc(N * sizeof(uint));
    h_InputVal     = (uint *)malloc(N * sizeof(uint));
    h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint));
    h_OutputValGPU = (uint *)malloc(N * sizeof(uint));
    srand(2001);

    for (uint i = 0; i < N; i++)
    {
        h_InputKey[i] = rand() % numValues;
        h_InputVal[i] = i;
    }

    printf("Allocating and initializing CUDA arrays...\n\n");
    error = cudaMalloc((void **)&d_InputKey,  N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMalloc((void **)&d_InputVal,  N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMalloc((void **)&d_OutputKey, N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMalloc((void **)&d_OutputVal, N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMemcpy(d_InputKey, h_InputKey, N * sizeof(uint), cudaMemcpyHostToDevice);
    checkCudaErrors(error);
    error = cudaMemcpy(d_InputVal, h_InputVal, N * sizeof(uint), cudaMemcpyHostToDevice);
    checkCudaErrors(error);

    int flag = 1;
    printf("Running GPU bitonic sort (%u identical iterations)...\n\n", numIterations);

    for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2)
    {
        printf("Testing array length %u (%u arrays per batch)...\n", arrayLength, N / arrayLength);
        error = cudaDeviceSynchronize();
        checkCudaErrors(error);

        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);
        uint threadCount = 0;

        for (uint i = 0; i < numIterations; i++)
            threadCount = bitonicSort(
                              d_OutputKey,
                              d_OutputVal,
                              d_InputKey,
                              d_InputVal,
                              N / arrayLength,
                              arrayLength,
                              DIR
                          );

        error = cudaDeviceSynchronize();
        checkCudaErrors(error);

        sdkStopTimer(&hTimer);
        printf("Average time: %f ms\n\n", sdkGetTimerValue(&hTimer) / numIterations);

        if (arrayLength == N)
        {
            double dTimeSecs = 1.0e-3 * sdkGetTimerValue(&hTimer) / numIterations;
            printf("sortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n",
                   (1.0e-6 * (double)arrayLength/dTimeSecs), dTimeSecs, arrayLength, 1, threadCount);
        }

        printf("\nValidating the results...\n");
        printf("...reading back GPU results\n");
        error = cudaMemcpy(h_OutputKeyGPU, d_OutputKey, N * sizeof(uint), cudaMemcpyDeviceToHost);
        checkCudaErrors(error);
        error = cudaMemcpy(h_OutputValGPU, d_OutputVal, N * sizeof(uint), cudaMemcpyDeviceToHost);
        checkCudaErrors(error);

        int keysFlag = validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, arrayLength, numValues, DIR);
        int valuesFlag = validateValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, N / arrayLength, arrayLength);
        flag = flag && keysFlag && valuesFlag;

        printf("\n");
    }

    printf("Shutting down...\n");
    sdkDeleteTimer(&hTimer);
    cudaFree(d_OutputVal);
    cudaFree(d_OutputKey);
    cudaFree(d_InputVal);
    cudaFree(d_InputKey);
    free(h_OutputValGPU);
    free(h_OutputKeyGPU);
    free(h_InputVal);
    free(h_InputKey);

    cudaDeviceReset();
    exit(flag ? EXIT_SUCCESS : EXIT_FAILURE);
}