/**
 * Construct a sorter for arrays of a fixed length.
 *
 * Builds the sort program with the trait's type names and key expressions substituted
 * into the kernel source, creates the six sort kernels, chooses work group sizes for
 * them, and (for lists too long for the short-list path) allocates the workspace arrays.
 *
 * @param context  the context used to build the program and to allocate workspace arrays
 * @param trait    describes the data type, key type and sort key; the pointer is stored
 *                 as-is, not copied (NOTE(review): ownership is decided elsewhere - confirm
 *                 against the destructor)
 * @param length   the number of elements in the arrays this object will sort
 */
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.

    std::map<std::string, std::string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] = trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
    assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
    computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
    copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
    sortBucketsKernel = cl::Kernel(program, "sortBuckets");

    // Work out the work group sizes for various kernels.  Never use more than 256 work
    // items per group, and never more than the device or the individual kernel allows.

    unsigned int maxGroupSize = std::min(256, static_cast<int>(context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()));
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    // Half the number of elements that fit in the device's local memory.
    unsigned int maxLocalBuffer = static_cast<unsigned int>((maxSharedMem/trait->getDataSize())/2);
    unsigned int maxRangeSize = std::min(maxGroupSize,
            static_cast<unsigned int>(computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())));
    unsigned int maxPositionsSize = std::min(maxGroupSize,
            static_cast<unsigned int>(computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())));

    // On Qualcomm's OpenCL, it's essential to also check the length against the work group
    // limit of shortListKernel (its CL_KERNEL_WORK_GROUP_SIZE).  Otherwise you get a crash.
    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter
    // than the actual maximum, so including the check hurts performance.  For the moment the
    // check is left out.  If we officially support Qualcomm in the future, we'll need to do
    // something better.

    isShortList = (length <= maxLocalBuffer);

    // Largest power of two that does not exceed maxRangeSize.
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
    sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;

    // sortKernelSize can be 0 or 1 (work group limit of 1, or almost no local memory), which
    // would make the target bucket size zero.  Clamp it so the division below is well defined.
    unsigned int targetBucketSize = sortKernelSize/2;
    if (targetBucketSize < 1)
        targetBucketSize = 1;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.  They are only allocated when the short-list path is not used.

    if (!isShortList) {
        dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
        bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
        bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
        offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
        buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
    }
}
/**
 * Construct a sorter for arrays of a fixed length.
 *
 * Builds the sort program with the trait's type names and key expressions substituted
 * into the kernel source, creates the six sort kernels, chooses work group sizes for
 * them, and (for lists too long for the short-list path) allocates the workspace arrays.
 *
 * @param context  the context used to build the program and to allocate workspace arrays
 * @param trait    describes the data type, key type and sort key; the pointer is stored
 *                 as-is, not copied (NOTE(review): ownership is decided elsewhere - confirm
 *                 against the destructor)
 * @param length   the number of elements in the arrays this object will sort
 */
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.

    std::map<std::string, std::string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] = trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
    assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
    computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
    copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
    sortBucketsKernel = cl::Kernel(program, "sortBuckets");

    // Work out the work group sizes for various kernels.  Never use more than 256 work
    // items per group, and never more than the device allows.

    unsigned int maxGroupSize = std::min(256, static_cast<int>(context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()));
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    // Half the number of elements that fit in the device's local memory.
    unsigned int maxLocalBuffer = static_cast<unsigned int>((maxSharedMem/trait->getDataSize())/2);

    // A kernel may support a smaller work group than the device-wide maximum, and OpenCL
    // rejects a launch whose local size exceeds CL_KERNEL_WORK_GROUP_SIZE for that kernel.
    // Respect the per-kernel limit for the kernels whose group size is chosen here.
    unsigned int maxRangeSize = std::min(maxGroupSize,
            static_cast<unsigned int>(computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())));
    unsigned int maxPositionsSize = std::min(maxGroupSize,
            static_cast<unsigned int>(computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())));

    isShortList = (length <= maxLocalBuffer);

    // Largest power of two that does not exceed maxRangeSize.
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
    sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;

    // sortKernelSize can be 0 or 1 (work group limit of 1, or almost no local memory), which
    // would make the target bucket size zero.  Clamp it so the division below is well defined.
    unsigned int targetBucketSize = sortKernelSize/2;
    if (targetBucketSize < 1)
        targetBucketSize = 1;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.  They are only allocated when the short-list path is not used.

    if (!isShortList) {
        dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
        bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
        bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
        offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
        buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
    }
}