void bitonicSort(btHashPosKey* pData, int lo, int n, bool dir) { if(n > 1) { int m = n / 2; bitonicSort(pData, lo, m, !dir); bitonicSort(pData, lo + m, n - m, dir); bitonicMerge(pData, lo, n, dir); } }
/** Procedure bitonicSort first produces a bitonic sequence by * recursively sorting its two halves in opposite directions, and then * calls bitonicMerge. **/ void bitonicSort(int lo, int cnt, int dir) { if (cnt>1) { int k=cnt/2; bitonicSort(lo, k, ASCENDING); bitonicSort(lo+k, k, DESCENDING); bitonicMerge(lo, cnt, dir); } }
/** Procedure bitonicSort first produces a bitonic sequence by * recursively sorting its two halves in opposite directions, and then * calls bitonicMerge. **/ void bitonicSort(int lo, int cnt, int dir) { int k = cnt; k /= 2; _Pragma( "marker recMerge" ) if (cnt > 1) { bitonicSort(lo, k, ASCENDING); bitonicSort(lo + k, k, DESCENDING); } bitonicMerge(lo, cnt, dir); _Pragma( "flowrestriction 1*bitonicMerge <= 31*recMerge" ) return; }
int main(void) { /** Initialize array "a" with data **/ int i; _Pragma( "loopbound min 32 max 32" ) for (i = 0; i < 32; i++) { a[i] = (32 - i); } /** When called with parameters lo = 0, cnt = a.length() and dir = * ASCENDING, procedure bitonicSort sorts the whole array a. **/ _Pragma( "marker recSort" ) bitonicSort(0, 32, ASCENDING); _Pragma( "flowrestriction 1*bitonicSort <= 63*recSort" ) /** Loop through array, printing out each element **/ _Pragma( "loopbound min 32 max 32" ) for (i = 0; i < 32; i++) { } return 0; }
int main(int argc, char** argv) { uint num_of_segments; uint num_of_elements; uint i; scanf("%d", &num_of_segments); uint mem_size_seg = sizeof(int) * (num_of_segments + 1); uint *h_seg = (uint *) malloc(mem_size_seg); uint diffprev, diffcur; for (i = 0; i < num_of_segments + 1; i++) { scanf("%d", &h_seg[i]); if (i == 1) diffprev = h_seg[i] - h_seg[i - 1]; else if (i > 1) { diffcur = h_seg[i] - h_seg[i - 1]; if (diffcur != diffprev) { printf("Só funciona para segmentos de tamanhos iguais.\n"); exit(1); } } } scanf("%d", &num_of_elements); uint mem_size_vec = sizeof(int) * num_of_elements; uint *h_vec = (uint *) malloc(mem_size_vec); uint *h_value = (uint *) malloc(mem_size_vec); for (i = 0; i < num_of_elements; i++) { scanf("%d", &h_vec[i]); h_value[i] = i; } cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); uint *d_value, *d_value_out, *d_vec, *d_vec_out; cudaTest(cudaMalloc((void **) &d_vec, mem_size_vec)); cudaTest(cudaMalloc((void **) &d_value, mem_size_vec)); cudaTest(cudaMalloc((void **) &d_vec_out, mem_size_vec)); cudaTest(cudaMalloc((void **) &d_value_out, mem_size_vec)); for (int i = 0; i < EXECUTIONS; i++) { cudaTest(cudaMemcpy(d_vec, h_vec, mem_size_vec, cudaMemcpyHostToDevice)); cudaTest( cudaMemcpy(d_value, h_value, mem_size_vec, cudaMemcpyHostToDevice)); cudaEventRecord(start); uint threadCount = 0; threadCount = bitonicSort(d_vec_out, d_value_out, d_vec, d_value, num_of_elements / diffcur, diffcur, 1); cudaEventRecord(stop); cudaError_t errSync = cudaGetLastError(); cudaError_t errAsync = cudaDeviceSynchronize(); if (errSync != cudaSuccess) printf("Sync kernel error: %s\n", cudaGetErrorString(errSync)); if (errAsync != cudaSuccess) printf("Async kernel error: %s\n", cudaGetErrorString(errAsync)); if (ELAPSED_TIME == 1) { cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); std::cout << milliseconds << "\n"; } } cudaMemcpy(h_value, d_value_out, mem_size_vec, cudaMemcpyDeviceToHost); cudaMemcpy(h_vec, d_vec_out, mem_size_vec, cudaMemcpyDeviceToHost); if(ELAPSED_TIME != 1) print(h_vec, num_of_elements); free(h_vec); free(h_seg); free(h_value); cudaFree(d_vec); cudaFree(d_vec_out); cudaFree(d_value); cudaFree(d_value_out); cudaDeviceReset(); return 0; }
/** When called with parameters lo = 0, cnt = a.length() and dir = * ASCENDING, procedure bitonicSort sorts the whole array a. **/ void sort() { bitonicSort(0, N, ASCENDING); }
//Step the simulation void ParticleSystem::update(float deltaTime){ assert(m_bInitialized); setParameters(&m_params); setParametersHost(&m_params); //Download positions from VBO memHandle_t pos; if (!m_bQATest) { glBindBufferARB(GL_ARRAY_BUFFER, m_posVbo); pos = (memHandle_t)glMapBufferARB(GL_ARRAY_BUFFER, GL_READ_WRITE); copyArrayToDevice(m_dPos, pos, 0, m_numParticles * 4 * sizeof(float)); } integrateSystem( m_dPos, m_dVel, deltaTime, m_numParticles ); calcHash( m_dHash, m_dIndex, m_dPos, m_numParticles ); bitonicSort(NULL, m_dHash, m_dIndex, m_dHash, m_dIndex, 1, m_numParticles, 0); //Find start and end of each cell and //Reorder particle data for better cache coherency findCellBoundsAndReorder( m_dCellStart, m_dCellEnd, m_dReorderedPos, m_dReorderedVel, m_dHash, m_dIndex, m_dPos, m_dVel, m_numParticles, m_numGridCells ); collide( m_dVel, m_dReorderedPos, m_dReorderedVel, m_dIndex, m_dCellStart, m_dCellEnd, m_numParticles, m_numGridCells ); //Update buffers if (!m_bQATest) { copyArrayFromDevice(pos,m_dPos, 0, m_numParticles * 4 * sizeof(float)); glUnmapBufferARB(GL_ARRAY_BUFFER); } }
void BoidModelSHWay1::simulate(float dt){ counter = !counter; cl_ulong startTime, endTime; //this will update our system by calculating new velocity and updating the positions of our particles //Make sure OpenGL is done using our VBOs glFinish(); // map OpenGL buffer object for writing from OpenCL //this passes in the vector of VBO buffer objects (position and color) err = queue.enqueueAcquireGLObjects(&cl_pos_vbos, NULL, &event); err = queue.enqueueAcquireGLObjects(&cl_pos_vbos_out, NULL, &event); err = queue.enqueueAcquireGLObjects(&cl_vel_vbos, NULL, &event); err = queue.enqueueAcquireGLObjects(&cl_vel_vbos_out, NULL, &event); err = queue.enqueueAcquireGLObjects(&cl_color_vbos, NULL, &event); err = queue.enqueueAcquireGLObjects(&cl_color_vbos_out, NULL, &event); queue.finish(); //Get grid hash value for every boid try { if (counter) err = kernel_getGridHash.setArg(0, cl_pos_vbos[0]); else err = kernel_getGridHash.setArg(0, cl_pos_vbos_out[0]); //position vbo err = kernel_getGridHash.setArg(1, cl_gridHash_unsorted); err = kernel_getGridHash.setArg(2, cl_gridIndex_unsorted); err = kernel_getGridHash.setArg(3, cl_simParams); } catch (cl::Error er) { log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } //std::vector<Vec4> test(num); //glGetBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(Vec4)* num, test.data()); //create gridHash err = queue.enqueueNDRangeKernel(kernel_getGridHash, cl::NullRange, cl::NDRange(num), cl::NullRange, NULL, &event); queue.finish(); event.wait(); event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime); event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime); times[0] = (endTime - startTime) / 1000; //set start and end index to 0 unsigned int val = 0; try { err = kernel_memSet.setArg(0, cl_gridStartIndex); err = kernel_memSet.setArg(1, val); err = kernel_memSet.setArg(2, simParams.numCells); } catch (cl::Error er) { log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } err = queue.enqueueNDRangeKernel(kernel_memSet, cl::NullRange, cl::NDRange(simParams.numCells), cl::NullRange, NULL, &event); queue.finish(); try { err = kernel_memSet.setArg(0, cl_gridEndIndex); err = kernel_memSet.setArg(1, val); err = kernel_memSet.setArg(2, simParams.numCells); } catch (cl::Error er) { log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } err = queue.enqueueNDRangeKernel(kernel_memSet, cl::NullRange, cl::NDRange(simParams.numCells), cl::NullRange, NULL, &event); queue.finish(); //sort gridHash bitonicSort(cl_gridHash_sorted, cl_gridIndex_sorted, cl_gridHash_unsorted, cl_gridIndex_unsorted, 1, simParams.numBodies, 0); queue.finish(); //unsigned int E[NUM_BOIDS]; //queue.enqueueReadBuffer(cl_gridHash_sorted, CL_TRUE, 0, (size_t)(NUM_BOIDS * sizeof(unsigned int)), &E); //queue.finish(); try { if (counter){ err = kernel_findGridEdgeAndReorder.setArg(2, cl_pos_vbos_out[0]); //pos out ordered err = kernel_findGridEdgeAndReorder.setArg(3, cl_vel_vbos_out[0]); //vel out ordered err = kernel_findGridEdgeAndReorder.setArg(6, cl_pos_vbos[0]); //pos in unordered err = kernel_findGridEdgeAndReorder.setArg(7, cl_vel_vbos[0]); //vel in unordered err = kernel_findGridEdgeAndReorder.setArg(8, cl_goal_in); err = kernel_findGridEdgeAndReorder.setArg(9, cl_goal_out); err = kernel_findGridEdgeAndReorder.setArg(11, cl_color_vbos[0]); err = kernel_findGridEdgeAndReorder.setArg(10, cl_color_vbos_out[0]); } else { err = kernel_findGridEdgeAndReorder.setArg(6, cl_pos_vbos_out[0]); //pos in err = kernel_findGridEdgeAndReorder.setArg(7, cl_vel_vbos_out[0]); //vel in err = kernel_findGridEdgeAndReorder.setArg(2, cl_pos_vbos[0]); //pos out err = kernel_findGridEdgeAndReorder.setArg(3, cl_vel_vbos[0]); //vel out err = kernel_findGridEdgeAndReorder.setArg(9, cl_goal_in); err = kernel_findGridEdgeAndReorder.setArg(8, cl_goal_out); err = kernel_findGridEdgeAndReorder.setArg(10, cl_color_vbos[0]); err = kernel_findGridEdgeAndReorder.setArg(11, cl_color_vbos_out[0]); } err = kernel_findGridEdgeAndReorder.setArg(0, cl_gridStartIndex); err = kernel_findGridEdgeAndReorder.setArg(1, cl_gridEndIndex); err = kernel_findGridEdgeAndReorder.setArg(4, cl_gridHash_sorted); err = kernel_findGridEdgeAndReorder.setArg(5, cl_gridIndex_sorted); err = kernel_findGridEdgeAndReorder.setArg(12, cl::__local(sizeof(cl_uint)*(LOCAL_PREF + 1))); err = kernel_findGridEdgeAndReorder.setArg(13, num); } catch (cl::Error er) { log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } err = queue.enqueueNDRangeKernel(kernel_findGridEdgeAndReorder, cl::NullRange, cl::NDRange(num), cl::NDRange(LOCAL_PREF), NULL, &event); queue.finish(); event.wait(); event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime); event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime); times[2] = (endTime - startTime) / 1000000; try { if (counter) err = kernel_evalSH.setArg(0, cl_vel_vbos_out[0]); else err = kernel_evalSH.setArg(0, cl_vel_vbos[0]); err = kernel_evalSH.setArg(1, cl_shEvalX); err = kernel_evalSH.setArg(2, cl_shEvalY); err = kernel_evalSH.setArg(3, cl_shEvalZ); err = kernel_evalSH.setArg(4, cl_coef0X); err = kernel_evalSH.setArg(5, cl_coef0Y); err = kernel_evalSH.setArg(6, cl_coef0Z); err = kernel_evalSH.setArg(7, cl::__local(sizeof(cl_float)*(LOCAL_PREF))); err = kernel_evalSH.setArg(8, cl::__local(sizeof(cl_float)*(LOCAL_PREF))); err = kernel_evalSH.setArg(9, cl::__local(sizeof(cl_float)*(LOCAL_PREF))); err = kernel_evalSH.setArg(10, cl::__local(sizeof(cl_float8)*(LOCAL_PREF))); err = kernel_evalSH.setArg(11, cl::__local(sizeof(cl_float8)*(LOCAL_PREF))); err = kernel_evalSH.setArg(12, cl::__local(sizeof(cl_float8)*(LOCAL_PREF))); err = kernel_evalSH.setArg(13, cl_gridStartIndex); err = kernel_evalSH.setArg(14, cl_gridEndIndex); } catch (cl::Error er){ log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } int localWorkSize = LOCAL_PREF; int globalWorkSize = simParams.numCells * LOCAL_PREF; err = queue.enqueueNDRangeKernel(kernel_evalSH, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, &event); event.wait(); event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime); event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime); times[4] = (endTime - startTime) / 1000000; std::vector<Vec4> C(2 * simParams.numCells); queue.enqueueReadBuffer(cl_shEvalX, CL_TRUE, 0, (size_t)2 * simParams.numCells * sizeof(Vec4), C.data()); queue.finish(); std::vector<unsigned int> E(simParams.numCells); queue.enqueueReadBuffer(cl_gridStartIndex, CL_TRUE, 0, (size_t)(simParams.numCells * sizeof(unsigned int)), E.data()); queue.finish(); std::vector<unsigned int> F(simParams.numCells); queue.enqueueReadBuffer(cl_gridEndIndex, CL_TRUE, 0, (size_t)(simParams.numCells * sizeof(unsigned int)), F.data()); queue.finish(); try { if (counter){ err = kernel_simulate.setArg(0, cl_pos_vbos_out[0]); //pos in err = kernel_simulate.setArg(1, cl_pos_vbos[0]); //pos out err = kernel_simulate.setArg(2, cl_vel_vbos_out[0]); //vel in err = kernel_simulate.setArg(3, cl_vel_vbos[0]); //vel out err = kernel_simulate.setArg(6, cl_goal_out); } else{ err = kernel_simulate.setArg(1, cl_pos_vbos_out[0]); //pos out err = kernel_simulate.setArg(0, cl_pos_vbos[0]); //pos in err = kernel_simulate.setArg(3, cl_vel_vbos_out[0]); //vel out err = kernel_simulate.setArg(2, cl_vel_vbos[0]); //vel in err = kernel_simulate.setArg(6, cl_goal_in); } err = kernel_simulate.setArg(4, cl_gridStartIndex); err = kernel_simulate.setArg(5, cl_gridEndIndex); err = kernel_simulate.setArg(7, cl_simParams); err = kernel_simulate.setArg(8, cl_range); err = kernel_simulate.setArg(9, dt); } catch (cl::Error er){ log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } queue.finish(); /* unsigned int D[12500]; queue.enqueueReadBuffer(cl_gridEndIndex, CL_TRUE, 0, (size_t)12500 * sizeof(unsigned int), &D); queue.finish();*/ queue.finish(); //globalWorkSize = LOCAL_PREF * (simParams.numCells); localWorkSize = LOCAL_PREF; globalWorkSize = simParams.numBodies; err = queue.enqueueNDRangeKernel(kernel_simulate, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, &eventSim); eventSim.wait(); eventSim.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime); eventSim.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime); times[3] = (endTime - startTime) / 1000000; try { if (counter){ err = kernel_useSH.setArg(0, cl_vel_vbos[0]); //vel in err = kernel_useSH.setArg(1, cl_vel_vbos_out[0]); //vel out err = kernel_useSH.setArg(8, cl_pos_vbos[0]); //pos in err = kernel_useSH.setArg(9, cl_pos_vbos_out[0]); //pos out } else { err = kernel_useSH.setArg(1, cl_vel_vbos[0]); //vel out err = kernel_useSH.setArg(0, cl_vel_vbos_out[0]); //vel in err = kernel_useSH.setArg(9, cl_pos_vbos[0]); //pos out err = kernel_useSH.setArg(8, cl_pos_vbos_out[0]); //pos in } err = kernel_useSH.setArg(2, cl_gridStartIndex); err = kernel_useSH.setArg(3, cl_gridEndIndex); err = kernel_useSH.setArg(4, cl_shEvalX); err = kernel_useSH.setArg(5, cl_shEvalY); err = kernel_useSH.setArg(6, cl_shEvalZ); err = kernel_useSH.setArg(7, cl_simParams); err = kernel_useSH.setArg(10, cl::__local(sizeof(cl_float8)*(LOCAL_PREF))); err = kernel_useSH.setArg(11, cl::__local(sizeof(cl_float8)*(LOCAL_PREF))); err = kernel_useSH.setArg(12, cl::__local(sizeof(cl_float8)*(LOCAL_PREF))); err = kernel_useSH.setArg(13, cl_coef0X); err = kernel_useSH.setArg(14, cl_coef0Y); err = kernel_useSH.setArg(15, cl_coef0Z); err = kernel_useSH.setArg(16, cl::__local(sizeof(cl_float)*(LOCAL_PREF))); err = kernel_useSH.setArg(17, cl::__local(sizeof(cl_float)*(LOCAL_PREF))); err = kernel_useSH.setArg(18, cl::__local(sizeof(cl_float)*(LOCAL_PREF))); err = kernel_useSH.setArg(19, dt); } catch (cl::Error er){ log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } localWorkSize = LOCAL_PREF; globalWorkSize = simParams.numBodies; err = queue.enqueueNDRangeKernel(kernel_useSH, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, &event); queue.finish(); event.wait(); event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &startTime); event.getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &endTime); times[5] = (endTime - startTime) / 1000000; /* unsigned int A[8000]; queue.enqueueReadBuffer(cl_range, CL_TRUE, 0, (size_t)(8000 * sizeof(unsigned int)), &A); queue.finish(); */ /*std::vector<Vec4> X(simParams.numCells); queue.enqueueReadBuffer(cl_sumVel, CL_TRUE, 0, (size_t)simParams.numCells * sizeof(Vec4), X.data()); queue.finish(); */ //Release the VBOs so OpenGL can play with them err = queue.enqueueReleaseGLObjects(&cl_pos_vbos, NULL, &event); err = queue.enqueueReleaseGLObjects(&cl_pos_vbos_out, NULL, &event); err = queue.enqueueReleaseGLObjects(&cl_vel_vbos, NULL, &event); err = queue.enqueueReleaseGLObjects(&cl_vel_vbos_out, NULL, &event); err = queue.enqueueReleaseGLObjects(&cl_color_vbos, NULL, &event); err = queue.enqueueReleaseGLObjects(&cl_color_vbos_out, NULL, &event); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cudaError_t error; printf("%s Starting...\n\n", argv[0]); printf("Starting up CUDA context...\n"); int dev = findCudaDevice(argc, (const char **)argv); uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU; uint *d_InputKey, *d_InputVal, *d_OutputKey, *d_OutputVal; StopWatchInterface *hTimer = NULL; const uint N = 1048576; const uint DIR = 0; const uint numValues = 65536; const uint numIterations = 1; printf("Allocating and initializing host arrays...\n\n"); sdkCreateTimer(&hTimer); h_InputKey = (uint *)malloc(N * sizeof(uint)); h_InputVal = (uint *)malloc(N * sizeof(uint)); h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint)); h_OutputValGPU = (uint *)malloc(N * sizeof(uint)); srand(2001); for (uint i = 0; i < N; i++) { h_InputKey[i] = rand() % numValues; h_InputVal[i] = i; } printf("Allocating and initializing CUDA arrays...\n\n"); error = cudaMalloc((void **)&d_InputKey, N * sizeof(uint)); checkCudaErrors(error); error = cudaMalloc((void **)&d_InputVal, N * sizeof(uint)); checkCudaErrors(error); error = cudaMalloc((void **)&d_OutputKey, N * sizeof(uint)); checkCudaErrors(error); error = cudaMalloc((void **)&d_OutputVal, N * sizeof(uint)); checkCudaErrors(error); error = cudaMemcpy(d_InputKey, h_InputKey, N * sizeof(uint), cudaMemcpyHostToDevice); checkCudaErrors(error); error = cudaMemcpy(d_InputVal, h_InputVal, N * sizeof(uint), cudaMemcpyHostToDevice); checkCudaErrors(error); int flag = 1; printf("Running GPU bitonic sort (%u identical iterations)...\n\n", numIterations); for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2) { printf("Testing array length %u (%u arrays per batch)...\n", arrayLength, N / arrayLength); error = cudaDeviceSynchronize(); checkCudaErrors(error); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); uint threadCount = 0; for (uint i = 0; i < numIterations; i++) threadCount = bitonicSort( d_OutputKey, d_OutputVal, d_InputKey, d_InputVal, N / arrayLength, arrayLength, DIR ); error = cudaDeviceSynchronize(); checkCudaErrors(error); sdkStopTimer(&hTimer); printf("Average time: %f ms\n\n", sdkGetTimerValue(&hTimer) / numIterations); if (arrayLength == N) { double dTimeSecs = 1.0e-3 * sdkGetTimerValue(&hTimer) / numIterations; printf("sortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/dTimeSecs), dTimeSecs, arrayLength, 1, threadCount); } printf("\nValidating the results...\n"); printf("...reading back GPU results\n"); error = cudaMemcpy(h_OutputKeyGPU, d_OutputKey, N * sizeof(uint), cudaMemcpyDeviceToHost); checkCudaErrors(error); error = cudaMemcpy(h_OutputValGPU, d_OutputVal, N * sizeof(uint), cudaMemcpyDeviceToHost); checkCudaErrors(error); int keysFlag = validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, arrayLength, numValues, DIR); int valuesFlag = validateValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, N / arrayLength, arrayLength); flag = flag && keysFlag && valuesFlag; printf("\n"); } printf("Shutting down...\n"); sdkDeleteTimer(&hTimer); cudaFree(d_OutputVal); cudaFree(d_OutputKey); cudaFree(d_InputVal); cudaFree(d_InputKey); free(h_OutputValGPU); free(h_OutputKeyGPU); free(h_InputVal); free(h_InputKey); cudaDeviceReset(); exit(flag ? EXIT_SUCCESS : EXIT_FAILURE); }