// Runs the three per-axis FFT kernels (z, then x, then y) on the given arrays,
// choosing the forward or inverse kernel set according to `forward`.  When
// packRealAsComplex is set, the launches are bracketed by a pack and an unpack
// kernel and use a grid of half the size.
//
// Successive launches alternate between the argument lists {in, out} and
// {out, in}; the final launch in both code paths receives {in, out}.
// NOTE(review): this suggests `in` doubles as scratch space and the result ends
// up in `out` - confirm against the kernel sources.
void CudaFFT3D::execFFT(CudaArray& in, CudaArray& out, bool forward) {
    // Pick the kernel for each axis according to the transform direction.

    CUfunction zStage, xStage, yStage;
    if (forward) {
        zStage = zkernel;
        xStage = xkernel;
        yStage = ykernel;
    }
    else {
        zStage = invzkernel;
        xStage = invxkernel;
        yStage = invykernel;
    }

    // The two argument lists differ only in the order of the device pointers.

    void* inThenOut[] = {&in.getDevicePointer(), &out.getDevicePointer()};
    void* outThenIn[] = {&out.getDevicePointer(), &in.getDevicePointer()};

    if (!packRealAsComplex) {
        // Plain transform over the full grid.

        context.executeKernel(zStage, inThenOut, xsize*ysize*zsize, zthreads);
        context.executeKernel(xStage, outThenIn, xsize*ysize*zsize, xthreads);
        context.executeKernel(yStage, inThenOut, xsize*ysize*zsize, ythreads);
        return;
    }

    CUfunction packStage = packBackwardKernel;
    CUfunction unpackStage = unpackBackwardKernel;
    if (forward) {
        packStage = packForwardKernel;
        unpackStage = unpackForwardKernel;
    }
    int halfGrid = xsize*ysize*zsize/2;

    // Pack the data into a half sized grid.

    context.executeKernel(packStage, inThenOut, halfGrid, 128);

    // Perform the FFT.

    context.executeKernel(zStage, outThenIn, halfGrid, zthreads);
    context.executeKernel(xStage, inThenOut, halfGrid, xthreads);
    context.executeKernel(yStage, outThenIn, halfGrid, ythreads);

    // Unpack the data.

    context.executeKernel(unpackStage, inThenOut, halfGrid, 128);
}
// Sorts the contents of `data` by launching the sorting kernels owned by this
// object.
//
// Throws OpenMMException if the array's size or element size differs from
// dataLength / trait->getDataSize().  Returns immediately for an empty array.
//
// NOTE(review): in the executeKernel calls below, the arguments after the
// argument list are presumably (work size, block size, shared memory bytes) -
// confirm against CudaContext::executeKernel.
void CudaSort::sort(CudaArray& data) {
    const bool sizeMatches = (data.getSize() == dataLength && data.getElementSize() == trait->getDataSize());
    if (!sizeMatches)
        throw OpenMMException("CudaSort called with different data size");
    if (data.getSize() == 0)
        return;

    if (isShortList) {
        // Short lists are handled by a simpler kernel that does the entire
        // operation in a single launch.

        if (dataLength > CudaContext::ThreadBlockSize*context.getNumThreadBlocks()) {
            void* shortArgs[] = {&data.getDevicePointer(), &dataLength};
            context.executeKernel(shortListKernel, shortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
            return;
        }

        // This variant writes into the buckets array, so copy the result back.

        void* shortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
        context.executeKernel(shortList2Kernel, shortArgs, dataLength);
        buckets.copyTo(data);
        return;
    }

    // Compute the range of data values.

    unsigned int bucketCount = bucketOffset.getSize();
    void* rangeParams[] = {
        &data.getDevicePointer(),
        &dataLength,
        &dataRange.getDevicePointer(),
        &bucketCount,
        &bucketOffset.getDevicePointer()
    };
    context.executeKernel(computeRangeKernel, rangeParams, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());

    // Assign array elements to buckets.

    void* assignParams[] = {
        &data.getDevicePointer(),
        &dataLength,
        &bucketCount,
        &dataRange.getDevicePointer(),
        &bucketOffset.getDevicePointer(),
        &bucketOfElement.getDevicePointer(),
        &offsetInBucket.getDevicePointer()
    };
    context.executeKernel(assignElementsKernel, assignParams, data.getSize(), 128);

    // Compute the position of each bucket.

    void* positionParams[] = {&bucketCount, &bucketOffset.getDevicePointer()};
    context.executeKernel(computeBucketPositionsKernel, positionParams, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

    // Copy the data into the buckets.

    void* scatterParams[] = {
        &data.getDevicePointer(),
        &buckets.getDevicePointer(),
        &dataLength,
        &bucketOffset.getDevicePointer(),
        &bucketOfElement.getDevicePointer(),
        &offsetInBucket.getDevicePointer()
    };
    context.executeKernel(copyToBucketsKernel, scatterParams, data.getSize());

    // Sort each bucket.  The work size is data.getSize() rounded up to a
    // multiple of sortKernelSize.

    void* bucketSortParams[] = {
        &data.getDevicePointer(),
        &buckets.getDevicePointer(),
        &bucketCount,
        &bucketOffset.getDevicePointer()
    };
    context.executeKernel(sortBucketsKernel, bucketSortParams, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}