// Validates the launch geometry for a batched exclusive scan over "short"
// arrays and forwards the work to scanExclusiveLocal1().
//
// Every requirement is handed to oclCheckError() for comparison with shrTRUE:
//   - factorRadix2(log2L, arrayLength) must return 1 (the power-of-two check)
//   - arrayLength must lie in [MIN_SHORT_ARRAY_SIZE, MAX_SHORT_ARRAY_SIZE]
//   - batchSize * arrayLength must not exceed MAX_BATCH_ELEMENTS
//   - batchSize * arrayLength must be a multiple of 4 * WORKGROUP_SIZE
//
// cqCommandQueue, d_Dst and d_Src are passed through unchanged.
// Returns whatever scanExclusiveLocal1() returns.
extern "C" size_t scanExclusiveShort(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
)
{
    //Check power-of-two factorization
    uint log2L;
    const uint factorizationRemainder = factorRadix2(log2L, arrayLength);
    oclCheckError( factorizationRemainder == 1, shrTRUE );

    //Check supported size range
    oclCheckError(
        (arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE),
        shrTRUE
    );

    // The element count is formed in 64 bits: the 32-bit uint product can wrap
    // around, which would let an oversized batch slip through both checks below.
    const unsigned long long totalElements =
        static_cast<unsigned long long>(batchSize) * arrayLength;

    //Check total batch size limit
    oclCheckError( totalElements <= MAX_BATCH_ELEMENTS, shrTRUE );

    //Check all work-groups to be fully packed with data
    oclCheckError( totalElements % (4 * WORKGROUP_SIZE) == 0, shrTRUE );

    return scanExclusiveLocal1(
        cqCommandQueue,
        d_Dst,
        d_Src,
        batchSize,
        arrayLength
    );
}
// Validates the launch geometry for a batched exclusive scan over "large"
// arrays and then issues three calls in order:
//   1. scanExclusiveLocal1() on d_Dst / d_Src, with the whole batch expressed
//      as blockCount arrays of 4 * WORKGROUP_SIZE elements each
//   2. scanExclusiveLocal2() with d_Buffer, d_Dst, d_Src, batchSize and
//      arrayLength / (4 * WORKGROUP_SIZE)
//   3. uniformUpdate() with d_Dst, d_Buffer and blockCount
//
// Every requirement is handed to oclCheckError() for comparison with shrTRUE:
//   - factorRadix2(log2L, arrayLength) must return 1 (the power-of-two check)
//   - arrayLength must lie in [MIN_LARGE_ARRAY_SIZE, MAX_LARGE_ARRAY_SIZE]
//   - batchSize * arrayLength must not exceed MAX_BATCH_ELEMENTS
//
// d_Buffer is not a parameter; it is declared outside this function.
// The return values of the first two calls are ignored.
// Returns whatever uniformUpdate() returns.
extern "C" size_t scanExclusiveLarge(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
)
{
    //Check power-of-two factorization
    uint log2L;
    const uint factorizationRemainder = factorRadix2(log2L, arrayLength);
    oclCheckError( factorizationRemainder == 1, shrTRUE );

    //Check supported size range
    oclCheckError(
        (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE),
        shrTRUE
    );

    // The element count is formed in 64 bits: the 32-bit uint product can wrap
    // around, which would let an oversized batch slip through the limit check.
    const unsigned long long totalElements =
        static_cast<unsigned long long>(batchSize) * arrayLength;

    //Check total batch size limit
    oclCheckError( totalElements <= MAX_BATCH_ELEMENTS, shrTRUE );

    // Number of 4 * WORKGROUP_SIZE sized pieces in the whole batch (integer
    // division). The narrowing assumes MAX_BATCH_ELEMENTS fits in a uint -
    // TODO confirm against its definition.
    const uint blockCount = static_cast<uint>(totalElements / (4 * WORKGROUP_SIZE));

    scanExclusiveLocal1(
        cqCommandQueue,
        d_Dst,
        d_Src,
        blockCount,
        4 * WORKGROUP_SIZE
    );

    scanExclusiveLocal2(
        cqCommandQueue,
        d_Buffer,
        d_Dst,
        d_Src,
        batchSize,
        arrayLength / (4 * WORKGROUP_SIZE)
    );

    return uniformUpdate(
        cqCommandQueue,
        d_Dst,
        d_Buffer,
        blockCount
    );
}
void BoidModelSHWay1::bitonicSort( cl::Buffer d_DstKey, cl::Buffer d_DstVal, cl::Buffer d_SrcKey, cl::Buffer d_SrcVal, unsigned int batch, unsigned int arrayLength, unsigned int dir ){ if (arrayLength < 2) return; //Only power-of-two array lengths are supported so far cl_uint log2L; cl_uint factorizationRemainder = factorRadix2(log2L, arrayLength); if (factorizationRemainder != 1){ log("Array not a power of two"); return; } dir = (dir != 0); size_t localWorkSize, globalWorkSize; unsigned long long timeNow = GetTickCount64(); if (arrayLength <= LOCAL_SIZE_LIMIT) { try { err = kernel_bitonicSortLocal.setArg(0, d_DstKey); err = kernel_bitonicSortLocal.setArg(1, d_DstVal); err = kernel_bitonicSortLocal.setArg(2, d_SrcKey); err = kernel_bitonicSortLocal.setArg(3, d_SrcVal); err = kernel_bitonicSortLocal.setArg(4, arrayLength); err = kernel_bitonicSortLocal.setArg(5, dir); } catch (cl::Error er) { log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } localWorkSize = LOCAL_SIZE_LIMIT / 2; globalWorkSize = batch * arrayLength / 2; err = queue.enqueueNDRangeKernel(kernel_bitonicSortLocal, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, NULL); queue.finish(); } else { try { err = kernel_bitonicSortLocal1.setArg(0, d_DstKey); err = kernel_bitonicSortLocal1.setArg(1, d_DstVal); err = kernel_bitonicSortLocal1.setArg(2, d_SrcKey); err = kernel_bitonicSortLocal1.setArg(3, d_SrcVal); } catch (cl::Error er) { log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } localWorkSize = LOCAL_SIZE_LIMIT / 2; globalWorkSize = batch * arrayLength / 2; err = queue.enqueueNDRangeKernel(kernel_bitonicSortLocal1, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, NULL); queue.finish(); for (unsigned int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) { for (unsigned stride = size / 2; stride > 0; stride >>= 1) { if (stride >= LOCAL_SIZE_LIMIT) { localWorkSize = 
LOCAL_SIZE_LIMIT / 4; globalWorkSize = batch * arrayLength / 2; //Launch bitonicMergeGlobal try { err = kernel_bitonicMergeGlobal.setArg(0, d_DstKey); err = kernel_bitonicMergeGlobal.setArg(1, d_DstVal); err = kernel_bitonicMergeGlobal.setArg(2, d_DstKey); err = kernel_bitonicMergeGlobal.setArg(3, d_DstVal); err = kernel_bitonicMergeGlobal.setArg(4, arrayLength); err = kernel_bitonicMergeGlobal.setArg(5, size); err = kernel_bitonicMergeGlobal.setArg(6, stride); err = kernel_bitonicMergeGlobal.setArg(7, dir); } catch (cl::Error er) { log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } err = queue.enqueueNDRangeKernel(kernel_bitonicMergeGlobal, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, NULL); queue.finish(); } else { //Launch bitonicMergeLocal localWorkSize = LOCAL_SIZE_LIMIT / 2; globalWorkSize = batch * arrayLength / 2; try { err = kernel_bitonicMergeLocal.setArg(0, d_DstKey); err = kernel_bitonicMergeLocal.setArg(1, d_DstVal); err = kernel_bitonicMergeLocal.setArg(2, d_DstKey); err = kernel_bitonicMergeLocal.setArg(3, d_DstVal); err = kernel_bitonicMergeLocal.setArg(4, arrayLength); err = kernel_bitonicMergeLocal.setArg(5, stride); err = kernel_bitonicMergeLocal.setArg(6, size); err = kernel_bitonicMergeLocal.setArg(7, dir); } catch (cl::Error er) { log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err())); } err = queue.enqueueNDRangeKernel(kernel_bitonicMergeLocal, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, NULL); queue.finish(); break; } } } } times[1] = GetTickCount64() - timeNow; }
// Binds the key/value buffers to the four bitonic kernels and launches them.
//
// Parameters:
//   batch        - multiplier applied to arrayLength when sizing launches
//                  (presumably the number of arrays - confirm with callers)
//   arrayLength  - must be a power of two; values below 2 are a no-op
//   dir          - direction flag, normalised to 0 or 1 before use
//   cl_dstkey, cl_dstval - bound as arguments 0 and 1 of every kernel; the
//                          merge kernels also receive them as arguments 2 and 3
//   cl_srckey, cl_srcval - bound as arguments 2 and 3 of k_bitonicSortLocal
//                          and k_bitonicSortLocal1
//
// Returns 0 when nothing was launched (arrayLength < 2, or factorRadix2() did
// not reduce arrayLength to 1); otherwise the local work size
// LOCAL_SIZE_LIMIT / 2.
//
// Launch sequence:
//   - arrayLength <= LOCAL_SIZE_LIMIT: a single k_bitonicSortLocal launch.
//   - Otherwise: one k_bitonicSortLocal1 launch, then for each size starting at
//     2 * LOCAL_SIZE_LIMIT and doubling up to arrayLength, k_bitonicMergeGlobal
//     is launched for each stride >= LOCAL_SIZE_LIMIT, followed by one
//     k_bitonicMergeLocal launch that ends the stride loop.
int Bitonic<T>::Sort(int batch, int arrayLength, int dir,
                     Buffer<T> *cl_dstkey, Buffer<T> *cl_dstval,
                     Buffer<T> *cl_srckey, Buffer<T> *cl_srcval)
{
    if(arrayLength < 2)
        return 0;

    //Only power-of-two array lengths are supported so far
    // The remainder used to be computed and then ignored; unsupported lengths
    // are now rejected before any kernel state is touched.
    cl_uint log2L;
    cl_uint factorizationRemainder = factorRadix2(log2L, arrayLength);
    if(factorizationRemainder != 1)
        return 0;

    int arg = 0;
    k_bitonicSortLocal.setArg(arg++, cl_dstkey->getDevicePtr());
    k_bitonicSortLocal.setArg(arg++, cl_dstval->getDevicePtr());
    k_bitonicSortLocal.setArg(arg++, cl_srckey->getDevicePtr());
    k_bitonicSortLocal.setArg(arg++, cl_srcval->getDevicePtr());

    arg = 0;
    k_bitonicSortLocal1.setArg(arg++, cl_dstkey->getDevicePtr());
    k_bitonicSortLocal1.setArg(arg++, cl_dstval->getDevicePtr());
    k_bitonicSortLocal1.setArg(arg++, cl_srckey->getDevicePtr());
    k_bitonicSortLocal1.setArg(arg++, cl_srcval->getDevicePtr());

    // The merge kernels read and write the destination buffers.
    arg = 0;
    k_bitonicMergeGlobal.setArg(arg++, cl_dstkey->getDevicePtr());
    k_bitonicMergeGlobal.setArg(arg++, cl_dstval->getDevicePtr());
    k_bitonicMergeGlobal.setArg(arg++, cl_dstkey->getDevicePtr());
    k_bitonicMergeGlobal.setArg(arg++, cl_dstval->getDevicePtr());

    arg = 0;
    k_bitonicMergeLocal.setArg(arg++, cl_dstkey->getDevicePtr());
    k_bitonicMergeLocal.setArg(arg++, cl_dstval->getDevicePtr());
    k_bitonicMergeLocal.setArg(arg++, cl_dstkey->getDevicePtr());
    k_bitonicMergeLocal.setArg(arg++, cl_dstval->getDevicePtr());

    dir = (dir != 0);

    // Both sizes are the same for every launch below, so compute them once.
    const int localWorkSize = LOCAL_SIZE_LIMIT / 2;
    const int globalWorkSize = batch * arrayLength / 2;

    if(arrayLength <= LOCAL_SIZE_LIMIT)
    {
        //Launch bitonicSortLocal
        k_bitonicSortLocal.setArg(4, arrayLength);
        k_bitonicSortLocal.setArg(5, dir);
        k_bitonicSortLocal.execute(globalWorkSize, localWorkSize);
    }
    else
    {
        //Launch bitonicSortLocal1
        k_bitonicSortLocal1.execute(globalWorkSize, localWorkSize);

        for(uint size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1)
        {
            for(unsigned stride = size / 2; stride > 0; stride >>= 1)
            {
                if(stride >= LOCAL_SIZE_LIMIT)
                {
                    //Launch bitonicMergeGlobal
                    k_bitonicMergeGlobal.setArg(4, arrayLength);
                    k_bitonicMergeGlobal.setArg(5, size);
                    k_bitonicMergeGlobal.setArg(6, stride);
                    k_bitonicMergeGlobal.setArg(7, dir);

                    // Only the global size is passed for this kernel.
                    k_bitonicMergeGlobal.execute(globalWorkSize);
                }
                else
                {
                    //Launch bitonicMergeLocal
                    // Note the argument order differs from the global merge:
                    // stride is argument 5 and size is argument 6 here.
                    k_bitonicMergeLocal.setArg(4, arrayLength);
                    k_bitonicMergeLocal.setArg(5, stride);
                    k_bitonicMergeLocal.setArg(6, size);
                    k_bitonicMergeLocal.setArg(7, dir);

                    k_bitonicMergeLocal.execute(globalWorkSize, localWorkSize);

                    // One local merge launch ends the stride loop for this size.
                    break;
                }
            }
        }
    }

    return localWorkSize;
}