/**
 * Construct an FFT object for a grid of fixed dimensions and build all of its kernels.
 *
 * When realToComplex is set and at least one axis has an even size, the real input is
 * packed into a complex grid that is half as large along that axis, and the pack/unpack
 * kernels are compiled.  Otherwise the transform kernels are built for the full grid
 * size, and they are told the input is real only when realToComplex is set but packing
 * was not possible.
 *
 * @param context        the context used to compile modules and look up kernels
 * @param xsize          grid size along the first axis
 * @param ysize          grid size along the second axis
 * @param zsize          grid size along the third axis
 * @param realToComplex  whether a real-to-complex transform was requested
 */
CudaFFT3D::CudaFFT3D(CudaContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
        context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
    packRealAsComplex = false;
    int packedXSize = xsize;
    int packedYSize = ysize;
    int packedZSize = zsize;
    if (realToComplex) {
        // If any axis size is even, we can pack the real values into a complex grid that is
        // only half as large.  Take the first even axis in x, y, z order; -1 means none.

        int packedAxis = -1;
        if (xsize%2 == 0) {
            packedAxis = 0;
            packedXSize /= 2;
        }
        else if (ysize%2 == 0) {
            packedAxis = 1;
            packedYSize /= 2;
        }
        else if (zsize%2 == 0) {
            packedAxis = 2;
            packedZSize /= 2;
        }
        packRealAsComplex = (packedAxis != -1);
        if (packRealAsComplex) {
            // Build the kernels for packing and unpacking the data.

            map<string, string> defines;
            defines["XSIZE"] = context.intToString(xsize);
            defines["YSIZE"] = context.intToString(ysize);
            defines["ZSIZE"] = context.intToString(zsize);
            defines["PACKED_AXIS"] = context.intToString(packedAxis);
            defines["PACKED_XSIZE"] = context.intToString(packedXSize);
            defines["PACKED_YSIZE"] = context.intToString(packedYSize);
            defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
            defines["M_PI"] = context.doubleToString(M_PI);
            CUmodule module = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::fftR2C, defines);
            packForwardKernel = context.getKernel(module, "packForwardData");
            unpackForwardKernel = context.getKernel(module, "unpackForwardData");
            packBackwardKernel = context.getKernel(module, "packBackwardData");
            unpackBackwardKernel = context.getKernel(module, "unpackBackwardData");
        }
    }

    // The transform kernels only see real input when a real-to-complex transform was
    // requested but the data could not be packed.  The creation order below is kept as
    // is because each call also receives one of the zthreads/xthreads/ythreads members.

    bool inputIsReal = (realToComplex && !packRealAsComplex);
    zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
    xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
    ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
    invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
    invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
    invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
}
/**
 * Construct a sorter: compile the sort kernels specialized for the given trait, choose
 * the thread block sizes for each kernel, and allocate the workspace arrays.
 *
 * @param context  the context used to compile the module and allocate arrays
 * @param trait    supplies the type names, key expression and limits substituted into
 *                 the kernel source, plus the data and key sizes used for allocation
 * @param length   the number of elements the workspace arrays are sized for
 */
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataLength(length) {
    // Specialize the kernel source for this trait and look up every kernel.

    map<string, string> substitutions;
    substitutions["DATA_TYPE"] = trait->getDataType();
    substitutions["KEY_TYPE"] = trait->getKeyType();
    substitutions["SORT_KEY"] = trait->getSortKey();
    substitutions["MIN_KEY"] = trait->getMinKey();
    substitutions["MAX_KEY"] = trait->getMaxKey();
    substitutions["MAX_VALUE"] = trait->getMaxValue();
    CUmodule sortModule = context.createModule(context.replaceStrings(CudaKernelSources::sort, substitutions));
    shortListKernel = context.getKernel(sortModule, "sortShortList");
    shortList2Kernel = context.getKernel(sortModule, "sortShortList2");
    computeRangeKernel = context.getKernel(sortModule, "computeRange");
    assignElementsKernel = context.getKernel(sortModule, "assignElementsToBuckets");
    computeBucketPositionsKernel = context.getKernel(sortModule, "computeBucketPositions");
    copyToBucketsKernel = context.getKernel(sortModule, "copyDataToBuckets");
    sortBucketsKernel = context.getKernel(sortModule, "sortBuckets");

    // Query the device limits that bound the work group sizes.

    int maxBlockSize;
    cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
    int maxSharedMem;
    cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());

    // Half of the elements that fit in shared memory may be buffered locally.

    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
    int maxShortList = min(8192, max(maxLocalBuffer, CudaContext::ThreadBlockSize*context.getNumThreadBlocks()));
    isShortList = (length <= maxShortList);

    // Start from the largest power of two that does not exceed the maximum block size.

    rangeKernelSize = 1;
    while (rangeKernelSize*2 <= maxBlockSize)
        rangeKernelSize *= 2;
    positionsKernelSize = rangeKernelSize;
    if (isShortList)
        sortKernelSize = rangeKernelSize/2;
    else
        sortKernelSize = rangeKernelSize/4;

    // Clamp the sizes to the amount of data and to the local buffer capacity.

    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    unsigned int targetBucketSize = sortKernelSize/2;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.  The bucket bookkeeping arrays are only allocated when
    // the list is not short; the bucket storage itself is always allocated.

    if (!isShortList) {
        dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
        bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
        bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
        offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
    }
    buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
/**
 * Construct the integration utilities for a context and system.
 *
 * The constructor performs the following steps in order:
 *   - allocates and zeroes the position delta and step size arrays, using double or
 *     float elements depending on the context's precision flags;
 *   - collects the system's constraints (skipping those between two massless particles)
 *     and divides them among SETTLE (closed three-atom clusters), SHAKE (a central atom
 *     with up to three peripheral atoms) and CCMA (everything else);
 *   - for CCMA, uses ReferenceCCMAAlgorithm to build the constraint matrix and uploads
 *     the reordered data;
 *   - records the virtual sites of each supported type;
 *   - compiles the module and looks up the kernels used by this class.
 *
 * @param context  the context used for allocation, uploads and kernel compilation
 * @param system   the system whose constraints, masses, forces and virtual sites are read
 */
CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : context(context),
        randomPos(0) {
    // The pinned host buffer is only allocated further down when CCMA constraints exist.
    // Give the pointer a defined value so it is never left indeterminate.

    ccmaConvergedMemory = NULL;

    // Create workspace arrays.

    lastStepSize = make_double2(0.0, 0.0);
    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
        posDelta.initialize<double4>(context, context.getPaddedNumAtoms(), "posDelta");
        vector<double4> deltas(posDelta.getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
        posDelta.upload(deltas);
        stepSize.initialize<double2>(context, 1, "stepSize");
        stepSize.upload(&lastStepSize);
    }
    else {
        posDelta.initialize<float4>(context, context.getPaddedNumAtoms(), "posDelta");
        vector<float4> deltas(posDelta.getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
        posDelta.upload(deltas);
        stepSize.initialize<float2>(context, 1, "stepSize");
        float2 lastStepSizeFloat = make_float2(0.0f, 0.0f);
        stepSize.upload(&lastStepSizeFloat);
    }

    // Record the set of constraints and how many constraints each atom is involved in.
    // Constraints between two massless particles are ignored.

    vector<int> atom1;
    vector<int> atom2;
    vector<double> distance;
    vector<int> constraintCount(context.getNumAtoms(), 0);
    for (int i = 0; i < system.getNumConstraints(); i++) {
        int p1, p2;
        double d;
        system.getConstraintParameters(i, p1, p2, d);
        if (system.getParticleMass(p1) != 0 || system.getParticleMass(p2) != 0) {
            atom1.push_back(p1);
            atom2.push_back(p2);
            distance.push_back(d);
            constraintCount[p1]++;
            constraintCount[p2]++;
        }
    }

    // Identify clusters of three atoms that can be treated with SETTLE.  First, for every
    // atom that might be part of such a cluster, make a list of the two other atoms it is
    // connected to.

    int numAtoms = system.getNumParticles();
    vector<map<int, float> > settleConstraints(numAtoms);
    for (int i = 0; i < (int)atom1.size(); i++) {
        if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
            settleConstraints[atom1[i]][atom2[i]] = (float) distance[i];
            settleConstraints[atom2[i]][atom1[i]] = (float) distance[i];
        }
    }

    // Now remove the ones that don't actually form closed loops of three atoms.  Entries
    // are cleared as the loop advances, so later atoms see the already-pruned state.

    vector<int> settleClusters;
    for (int i = 0; i < (int)settleConstraints.size(); i++) {
        if (settleConstraints[i].size() == 2) {
            int partner1 = settleConstraints[i].begin()->first;
            int partner2 = (++settleConstraints[i].begin())->first;
            if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
                    settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
                settleConstraints[i].clear();
            else if (i < partner1 && i < partner2)
                settleClusters.push_back(i); // Each cluster is recorded once, by its lowest index.
        }
        else
            settleConstraints[i].clear();
    }

    // Record the SETTLE clusters.

    vector<bool> isShakeAtom(numAtoms, false);
    if (settleClusters.size() > 0) {
        vector<int4> atoms;
        vector<float2> params;
        for (int i = 0; i < (int) settleClusters.size(); i++) {
            int first = settleClusters[i];
            int second = settleConstraints[first].begin()->first;
            int third = (++settleConstraints[first].begin())->first;
            float dist12 = settleConstraints[first].find(second)->second;
            float dist13 = settleConstraints[first].find(third)->second;
            float dist23 = settleConstraints[second].find(third)->second;
            if (dist12 == dist13) {
                // The first atom is the central atom.
                atoms.push_back(make_int4(first, second, third, 0));
                params.push_back(make_float2(dist12, dist23));
            }
            else if (dist12 == dist23) {
                // The second atom is the central atom.
                atoms.push_back(make_int4(second, first, third, 0));
                params.push_back(make_float2(dist12, dist13));
            }
            else if (dist13 == dist23) {
                // The third atom is the central atom.
                atoms.push_back(make_int4(third, first, second, 0));
                params.push_back(make_float2(dist13, dist12));
            }
            else
                continue; // We can't handle this with SETTLE.
            isShakeAtom[first] = true;
            isShakeAtom[second] = true;
            isShakeAtom[third] = true;
        }
        if (atoms.size() > 0) {
            settleAtoms.initialize<int4>(context, atoms.size(), "settleAtoms");
            settleParams.initialize<float2>(context, params.size(), "settleParams");
            settleAtoms.upload(atoms);
            settleParams.upload(params);
        }
    }

    // Find clusters consisting of a central atom with up to three peripheral atoms.

    map<int, ShakeCluster> clusters;
    vector<bool> invalidForShake(numAtoms, false);
    for (int i = 0; i < (int) atom1.size(); i++) {
        if (isShakeAtom[atom1[i]])
            continue; // This is being taken care of with SETTLE.

        // Determine which is the central atom.

        bool firstIsCentral;
        if (constraintCount[atom1[i]] > 1)
            firstIsCentral = true;
        else if (constraintCount[atom2[i]] > 1)
            firstIsCentral = false;
        else if (atom1[i] < atom2[i])
            firstIsCentral = true;
        else
            firstIsCentral = false;
        int centralID, peripheralID;
        if (firstIsCentral) {
            centralID = atom1[i];
            peripheralID = atom2[i];
        }
        else {
            centralID = atom2[i];
            peripheralID = atom1[i];
        }

        // Add it to the cluster.

        if (clusters.find(centralID) == clusters.end()) {
            clusters[centralID] = ShakeCluster(centralID, 1.0/system.getParticleMass(centralID));
        }
        ShakeCluster& cluster = clusters[centralID];
        cluster.addAtom(peripheralID, distance[i], 1.0/system.getParticleMass(peripheralID));
        if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
            cluster.markInvalid(clusters, invalidForShake);
            map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
            if (otherCluster != clusters.end() && otherCluster->second.valid)
                otherCluster->second.markInvalid(clusters, invalidForShake);
        }
    }

    // Re-check every cluster that is still marked valid and count the survivors.

    int validShakeClusters = 0;
    for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
        ShakeCluster& cluster = iter->second;
        if (cluster.valid) {
            cluster.valid = !invalidForShake[cluster.centralID] && cluster.size == constraintCount[cluster.centralID];
            for (int i = 0; i < cluster.size; i++)
                if (invalidForShake[cluster.peripheralID[i]])
                    cluster.valid = false;
            if (cluster.valid)
                ++validShakeClusters;
        }
    }

    // Record the SHAKE clusters.  Unused peripheral slots are stored as -1.

    if (validShakeClusters > 0) {
        vector<int4> atoms;
        vector<float4> params;
        for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
            const ShakeCluster& cluster = iter->second;
            if (!cluster.valid)
                continue;
            atoms.push_back(make_int4(cluster.centralID, cluster.peripheralID[0],
                    (cluster.size > 1 ? cluster.peripheralID[1] : -1),
                    (cluster.size > 2 ? cluster.peripheralID[2] : -1)));
            params.push_back(make_float4((float) cluster.centralInvMass,
                    (float) (0.5/(cluster.centralInvMass+cluster.peripheralInvMass)),
                    (float) (cluster.distance*cluster.distance),
                    (float) cluster.peripheralInvMass));
            isShakeAtom[cluster.centralID] = true;
            isShakeAtom[cluster.peripheralID[0]] = true;
            if (cluster.size > 1)
                isShakeAtom[cluster.peripheralID[1]] = true;
            if (cluster.size > 2)
                isShakeAtom[cluster.peripheralID[2]] = true;
        }
        shakeAtoms.initialize<int4>(context, atoms.size(), "shakeAtoms");
        shakeParams.initialize<float4>(context, params.size(), "shakeParams");
        shakeAtoms.upload(atoms);
        shakeParams.upload(params);
    }

    // Find connected constraints for CCMA: every constraint whose first atom was not
    // claimed by SETTLE or SHAKE.

    vector<int> ccmaConstraints;
    for (unsigned i = 0; i < atom1.size(); i++)
        if (!isShakeAtom[atom1[i]])
            ccmaConstraints.push_back(i);

    // Record the connections between constraints.

    int numCCMA = (int) ccmaConstraints.size();
    if (numCCMA > 0) {
        // Record information needed by ReferenceCCMAAlgorithm.

        vector<pair<int, int> > refIndices(numCCMA);
        vector<double> refDistance(numCCMA);
        for (int i = 0; i < numCCMA; i++) {
            int index = ccmaConstraints[i];
            refIndices[i] = make_pair(atom1[index], atom2[index]);
            refDistance[i] = distance[index];
        }
        vector<double> refMasses(numAtoms);
        for (int i = 0; i < numAtoms; ++i)
            refMasses[i] = system.getParticleMass(i);

        // Look up angles for CCMA.

        vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
        for (int i = 0; i < system.getNumForces(); i++) {
            const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
            if (force != NULL) {
                for (int j = 0; j < force->getNumAngles(); j++) {
                    int angleAtom1, angleAtom2, angleAtom3;
                    double angle, k;
                    force->getAngleParameters(j, angleAtom1, angleAtom2, angleAtom3, angle, k);
                    angles.push_back(ReferenceCCMAAlgorithm::AngleInfo(angleAtom1, angleAtom2, angleAtom3, angle));
                }
            }
        }

        // Create a ReferenceCCMAAlgorithm.  It will build and invert the constraint matrix for us.

        ReferenceCCMAAlgorithm ccma(numAtoms, numCCMA, refIndices, refDistance, refMasses, angles, 0.1);
        vector<vector<pair<int, double> > > matrix = ccma.getMatrix();
        int maxRowElements = 0;
        for (unsigned i = 0; i < matrix.size(); i++)
            maxRowElements = max(maxRowElements, (int) matrix[i].size());
        maxRowElements++; // Leave room for the end-of-row marker written below.

        // Build the list of constraints for each atom.

        vector<vector<int> > atomConstraints(context.getNumAtoms());
        for (int i = 0; i < numCCMA; i++) {
            atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
            atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
        }
        int maxAtomConstraints = 0;
        for (unsigned i = 0; i < atomConstraints.size(); i++)
            maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());

        // Sort the constraints, then renumber the matrix columns to the sorted order.

        vector<int> constraintOrder(numCCMA);
        for (int i = 0; i < numCCMA; ++i)
            constraintOrder[i] = i;
        sort(constraintOrder.begin(), constraintOrder.end(), ConstraintOrderer(atom1, atom2, ccmaConstraints));
        vector<int> inverseOrder(numCCMA);
        for (int i = 0; i < numCCMA; ++i)
            inverseOrder[constraintOrder[i]] = i;
        for (int i = 0; i < (int)matrix.size(); ++i)
            for (int j = 0; j < (int)matrix[i].size(); ++j)
                matrix[i][j].first = inverseOrder[matrix[i][j].first];

        // Record the CCMA data structures.

        ccmaAtoms.initialize<int2>(context, numCCMA, "CcmaAtoms");
        ccmaAtomConstraints.initialize<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
        ccmaNumAtomConstraints.initialize<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
        ccmaConstraintMatrixColumn.initialize<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
        ccmaConverged.initialize<int>(context, 2, "ccmaConverged");
        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
        CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
        vector<int2> atomsVec(ccmaAtoms.getSize());
        vector<int> atomConstraintsVec(ccmaAtomConstraints.getSize());
        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints.getSize());
        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn.getSize());
        int ccmaElementSize = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
        ccmaDistance.initialize(context, numCCMA, 4*ccmaElementSize, "CcmaDistance");
        ccmaDelta1.initialize(context, numCCMA, ccmaElementSize, "CcmaDelta1");
        ccmaDelta2.initialize(context, numCCMA, ccmaElementSize, "CcmaDelta2");
        ccmaReducedMass.initialize(context, numCCMA, ccmaElementSize, "CcmaReducedMass");
        ccmaConstraintMatrixValue.initialize(context, numCCMA*maxRowElements, ccmaElementSize, "ConstraintMatrixValue");
        vector<double4> distanceVec(ccmaDistance.getSize());
        vector<double> reducedMassVec(ccmaReducedMass.getSize());
        vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
        for (int i = 0; i < numCCMA; i++) {
            int index = constraintOrder[i];
            int c = ccmaConstraints[index];
            atomsVec[i].x = atom1[c];
            atomsVec[i].y = atom2[c];
            distanceVec[i].w = distance[c];
            reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
            for (unsigned int j = 0; j < matrix[index].size(); j++) {
                constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
                constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
            }
            // A column index equal to numCCMA marks the end of this row.
            constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
        }
        ccmaDistance.upload(distanceVec, true);
        ccmaReducedMass.upload(reducedMassVec, true);
        ccmaConstraintMatrixValue.upload(constraintMatrixValueVec, true);
        for (unsigned int i = 0; i < atomConstraints.size(); i++) {
            numAtomConstraintsVec[i] = atomConstraints[i].size();
            for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
                // Store index+1 when this atom is the constraint's first atom, and the
                // negated value when it is the second.
                bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == (int) i);
                atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
            }
        }
        ccmaAtoms.upload(atomsVec);
        ccmaAtomConstraints.upload(atomConstraintsVec);
        ccmaNumAtomConstraints.upload(numAtomConstraintsVec);
        ccmaConstraintMatrixColumn.upload(constraintMatrixColumnVec);
    }

    // Build the list of virtual sites.

    vector<int4> vsite2AvgAtomVec;
    vector<double2> vsite2AvgWeightVec;
    vector<int4> vsite3AvgAtomVec;
    vector<double4> vsite3AvgWeightVec;
    vector<int4> vsiteOutOfPlaneAtomVec;
    vector<double4> vsiteOutOfPlaneWeightVec;
    vector<int> vsiteLocalCoordsIndexVec;
    vector<int> vsiteLocalCoordsAtomVec;
    vector<int> vsiteLocalCoordsStartVec;
    vector<double> vsiteLocalCoordsWeightVec;
    vector<double4> vsiteLocalCoordsPosVec;
    for (int i = 0; i < numAtoms; i++) {
        if (!system.isVirtualSite(i))
            continue;
        if (const TwoParticleAverageSite* avg2 = dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i))) {
            // A two particle average.

            vsite2AvgAtomVec.push_back(make_int4(i, avg2->getParticle(0), avg2->getParticle(1), 0));
            vsite2AvgWeightVec.push_back(make_double2(avg2->getWeight(0), avg2->getWeight(1)));
        }
        else if (const ThreeParticleAverageSite* avg3 = dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i))) {
            // A three particle average.

            vsite3AvgAtomVec.push_back(make_int4(i, avg3->getParticle(0), avg3->getParticle(1), avg3->getParticle(2)));
            vsite3AvgWeightVec.push_back(make_double4(avg3->getWeight(0), avg3->getWeight(1), avg3->getWeight(2), 0.0));
        }
        else if (const OutOfPlaneSite* outOfPlane = dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i))) {
            // An out of plane site.

            vsiteOutOfPlaneAtomVec.push_back(make_int4(i, outOfPlane->getParticle(0), outOfPlane->getParticle(1), outOfPlane->getParticle(2)));
            vsiteOutOfPlaneWeightVec.push_back(make_double4(outOfPlane->getWeight12(), outOfPlane->getWeight13(), outOfPlane->getWeightCross(), 0.0));
        }
        else if (const LocalCoordinatesSite* localCoords = dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i))) {
            // A local coordinates site.  Weights are stored as (origin, x, y) triples,
            // one triple per particle the site depends on.

            int numParticles = localCoords->getNumParticles();
            vector<double> origin, x, y;
            localCoords->getOriginWeights(origin);
            localCoords->getXWeights(x);
            localCoords->getYWeights(y);
            vsiteLocalCoordsIndexVec.push_back(i);
            vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
            for (int j = 0; j < numParticles; j++) {
                vsiteLocalCoordsAtomVec.push_back(localCoords->getParticle(j));
                vsiteLocalCoordsWeightVec.push_back(origin[j]);
                vsiteLocalCoordsWeightVec.push_back(x[j]);
                vsiteLocalCoordsWeightVec.push_back(y[j]);
            }
            Vec3 pos = localCoords->getLocalPosition();
            vsiteLocalCoordsPosVec.push_back(make_double4(pos[0], pos[1], pos[2], 0.0));
        }
    }

    // Append a final start index so each site's atom range has an end.

    vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
    int num2Avg = vsite2AvgAtomVec.size();
    int num3Avg = vsite3AvgAtomVec.size();
    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
    int numLocalCoords = vsiteLocalCoordsPosVec.size();

    // Every array gets at least one element, even when there are no sites of that type.

    vsite2AvgAtoms.initialize<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
    vsite3AvgAtoms.initialize<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
    vsiteOutOfPlaneAtoms.initialize<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
    vsiteLocalCoordsIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
    vsiteLocalCoordsAtoms.initialize<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
    vsiteLocalCoordsStartIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
    if (num2Avg > 0)
        vsite2AvgAtoms.upload(vsite2AvgAtomVec);
    if (num3Avg > 0)
        vsite3AvgAtoms.upload(vsite3AvgAtomVec);
    if (numOutOfPlane > 0)
        vsiteOutOfPlaneAtoms.upload(vsiteOutOfPlaneAtomVec);
    if (numLocalCoords > 0) {
        vsiteLocalCoordsIndex.upload(vsiteLocalCoordsIndexVec);
        vsiteLocalCoordsAtoms.upload(vsiteLocalCoordsAtomVec);
        vsiteLocalCoordsStartIndex.upload(vsiteLocalCoordsStartVec);
    }

    // Unlike the CCMA arrays, the weight arrays are double sized only when the context
    // reports double precision; the mixed precision flag is not consulted here.

    int vsiteElementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
    vsite2AvgWeights.initialize(context, max(1, num2Avg), 2*vsiteElementSize, "vsite2AvgWeights");
    vsite3AvgWeights.initialize(context, max(1, num3Avg), 4*vsiteElementSize, "vsite3AvgWeights");
    vsiteOutOfPlaneWeights.initialize(context, max(1, numOutOfPlane), 4*vsiteElementSize, "vsiteOutOfPlaneWeights");
    vsiteLocalCoordsWeights.initialize(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), vsiteElementSize, "vsiteLocalCoordsWeights");
    vsiteLocalCoordsPos.initialize(context, max(1, (int) vsiteLocalCoordsPosVec.size()), 4*vsiteElementSize, "vsiteLocalCoordsPos");
    if (num2Avg > 0)
        vsite2AvgWeights.upload(vsite2AvgWeightVec, true);
    if (num3Avg > 0)
        vsite3AvgWeights.upload(vsite3AvgWeightVec, true);
    if (numOutOfPlane > 0)
        vsiteOutOfPlaneWeights.upload(vsiteOutOfPlaneWeightVec, true);
    if (numLocalCoords > 0) {
        vsiteLocalCoordsWeights.upload(vsiteLocalCoordsWeightVec, true);
        vsiteLocalCoordsPos.upload(vsiteLocalCoordsPosVec, true);
    }

    // Create the kernels used by this class.

    map<string, string> defines;
    defines["NUM_CCMA_CONSTRAINTS"] = context.intToString(numCCMA);
    defines["NUM_ATOMS"] = context.intToString(numAtoms);
    defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
    defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
    defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
    defines["NUM_LOCAL_COORDS"] = context.intToString(numLocalCoords);
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    CUmodule module = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::integrationUtilities, defines);
    settlePosKernel = context.getKernel(module, "applySettleToPositions");
    settleVelKernel = context.getKernel(module, "applySettleToVelocities");
    shakePosKernel = context.getKernel(module, "applyShakeToPositions");
    shakeVelKernel = context.getKernel(module, "applyShakeToVelocities");
    ccmaDirectionsKernel = context.getKernel(module, "computeCCMAConstraintDirections");
    ccmaPosForceKernel = context.getKernel(module, "computeCCMAPositionConstraintForce");
    ccmaVelForceKernel = context.getKernel(module, "computeCCMAVelocityConstraintForce");
    ccmaMultiplyKernel = context.getKernel(module, "multiplyByCCMAConstraintMatrix");
    ccmaUpdateKernel = context.getKernel(module, "updateCCMAAtomPositions");
    CHECK_RESULT2(cuEventCreate(&ccmaEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for CCMA");
    vsitePositionKernel = context.getKernel(module, "computeVirtualSites");
    vsiteForceKernel = context.getKernel(module, "distributeVirtualSiteForces");
    numVsites = num2Avg+num3Avg+numOutOfPlane+numLocalCoords;
    randomKernel = context.getKernel(module, "generateRandomNumbers");
    timeShiftKernel = context.getKernel(module, "timeShiftVelocities");
}