/**
 * Create a CudaSort for sorting arrays of a fixed length.
 *
 * The constructor builds the sort kernels (substituting the type names, sort key
 * expression and limit values supplied by the trait into the kernel source), chooses
 * the thread block size used by each kernel, and allocates the workspace arrays.
 *
 * @param context   the context in which kernels are created and arrays are allocated
 * @param trait     supplies the data/key types, sort key expression and limit values
 * @param length    the number of elements in the arrays that will be sorted
 */
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
    // Create kernels.

    map<string, string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] = trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
    shortListKernel = context.getKernel(module, "sortShortList");
    shortList2Kernel = context.getKernel(module, "sortShortList2");
    computeRangeKernel = context.getKernel(module, "computeRange");
    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
    copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
    sortBucketsKernel = context.getKernel(module, "sortBuckets");

    // Query the device limits.  The result of each driver call is checked so that a
    // failed query can never leave maxBlockSize or maxSharedMem holding garbage that
    // would then drive the loop bound and buffer sizes below.
    // NOTE(review): relies on the file's CHECK_RESULT macro, which appears to read the
    // local errorMessage variable - confirm it is defined before this point.

    string errorMessage = "Error initializing sort";
    int maxBlockSize = 0;
    CHECK_RESULT(cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice()));
    int maxSharedMem = 0;
    CHECK_RESULT(cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice()));

    // Work out the work group sizes for various kernels.

    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
    int maxShortList = min(8192, max(maxLocalBuffer, CudaContext::ThreadBlockSize*context.getNumThreadBlocks()));
    isShortList = (length <= maxShortList);

    // Find the largest power of two that does not exceed the maximum block size.
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;
    sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    unsigned int targetBucketSize = sortKernelSize/2;
    if (targetBucketSize < 1)
        targetBucketSize = 1; // sortKernelSize can be clamped below 2; never divide by zero.
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.  The bucket bookkeeping arrays are only allocated when
    // the list is not short.

    if (!isShortList) {
        dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
        bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
        bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
        offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
    }
    buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
/**
 * Create a CudaNonbondedUtilities bound to a context.
 *
 * All array and sorter pointers start out NULL, the cutoff starts at -1 with cutoffs
 * and exclusions disabled, and padding is enabled.  The constructor then picks the
 * number and size of the thread blocks used for force computation.
 *
 * @param context   the context this object belongs to; its device is queried here
 */
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) :
        context(context),
        cutoff(-1.0),
        useCutoff(false),
        anyExclusions(false),
        usePadding(true),
        exclusionIndices(NULL),
        exclusionRowIndices(NULL),
        exclusionTiles(NULL),
        exclusions(NULL),
        interactingTiles(NULL),
        interactingAtoms(NULL),
        interactionCount(NULL),
        blockCenter(NULL),
        blockBoundingBox(NULL),
        sortedBlocks(NULL),
        sortedBlockCenter(NULL),
        sortedBlockBoundingBox(NULL),
        oldPositions(NULL),
        rebuildNeighborList(NULL),
        blockSorter(NULL),
        nonbondedForceGroup(0) {
    // The name errorMessage must not change: it is never used directly here, so the
    // CHECK_RESULT macro presumably picks it up when reporting a failed call.
    string errorMessage = "Error initializing nonbonded utilities";

    // Launch four force thread blocks for every multiprocessor on the device.
    int numMultiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&numMultiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
    numForceThreadBlocks = 4*numMultiprocessors;

    // Devices below compute capability 2.0 get the smaller block size.
    if (context.getComputeCapability() < 2.0)
        forceThreadBlockSize = 128;
    else
        forceThreadBlockSize = 256;
}