/**
 * Evaluate the forces for every copy of the system.
 *
 * Force groups in groupsNotContracted are evaluated once for each of the numCopies copies.
 * Each entry of groupsByCopies (number of contracted copies -> force group flags) is then
 * evaluated on a contracted set of positions, and the matching force contraction kernel
 * applies the result to the per-copy forces.
 *
 * @param context  the context used to evaluate forces for one copy at a time
 * @throws OpenMMException if updateContextState() changes the periodic box vectors
 */
void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
    // First pass: every copy, using only the force groups that have no contraction.

    for (int copy = 0; copy < numCopies; copy++) {
        void* loadArgs[] = {
            &velocities->getDevicePointer(),
            &cu.getVelm().getDevicePointer(),
            &positions->getDevicePointer(),
            &cu.getPosq().getDevicePointer(),
            &cu.getAtomIndexArray().getDevicePointer(),
            &copy
        };
        cu.executeKernel(copyToContextKernel, loadArgs, cu.getNumAtoms());
        context.computeVirtualSites();

        // Record the box on either side of updateContextState() so a change can be detected.

        Vec3 boxBefore[3];
        Vec3 boxAfter[3];
        context.getPeriodicBoxVectors(boxBefore[0], boxBefore[1], boxBefore[2]);
        context.updateContextState();
        context.getPeriodicBoxVectors(boxAfter[0], boxAfter[1], boxAfter[2]);
        bool boxChanged = false;
        for (int axis = 0; axis < 3; axis++)
            if (boxBefore[axis] != boxAfter[axis])
                boxChanged = true;
        if (boxChanged)
            throw OpenMMException("Standard barostats cannot be used with RPMDIntegrator.  Use RPMDMonteCarloBarostat instead.");

        context.calcForcesAndEnergy(true, false, groupsNotContracted);
        void* storeArgs[] = {
            &cu.getForce().getDevicePointer(),
            &forces->getDevicePointer(),
            &cu.getVelm().getDevicePointer(),
            &velocities->getDevicePointer(),
            &cu.getPosq().getDevicePointer(),
            &positions->getDevicePointer(),
            &cu.getAtomIndexArray().getDevicePointer(),
            &copy
        };
        cu.executeKernel(copyFromContextKernel, storeArgs, cu.getNumAtoms());
    }

    // Second pass: one round of force evaluations per contraction level.

    map<int, int>::const_iterator contraction = groupsByCopies.begin();
    while (contraction != groupsByCopies.end()) {
        const int contractedCopies = contraction->first;
        const int contractedGroups = contraction->second;

        // Build the contracted positions from the full set of copies.

        void* positionArgs[] = {
            &positions->getDevicePointer(),
            &contractedPositions->getDevicePointer()
        };
        cu.executeKernel(positionContractionKernels[contractedCopies], positionArgs, numParticles*numCopies, workgroupSize);

        // Evaluate this contraction's force groups on each contracted copy.

        for (int copy = 0; copy < contractedCopies; copy++) {
            void* loadArgs[] = {
                &velocities->getDevicePointer(),
                &cu.getVelm().getDevicePointer(),
                &contractedPositions->getDevicePointer(),
                &cu.getPosq().getDevicePointer(),
                &cu.getAtomIndexArray().getDevicePointer(),
                &copy
            };
            cu.executeKernel(copyToContextKernel, loadArgs, cu.getNumAtoms());
            context.computeVirtualSites();
            context.calcForcesAndEnergy(true, false, contractedGroups);
            void* storeArgs[] = {
                &cu.getForce().getDevicePointer(),
                &contractedForces->getDevicePointer(),
                &cu.getVelm().getDevicePointer(),
                &velocities->getDevicePointer(),
                &cu.getPosq().getDevicePointer(),
                &contractedPositions->getDevicePointer(),
                &cu.getAtomIndexArray().getDevicePointer(),
                &copy
            };
            cu.executeKernel(copyFromContextKernel, storeArgs, cu.getNumAtoms());
        }

        // Apply the contracted forces to the original copies.

        void* forceArgs[] = {
            &forces->getDevicePointer(),
            &contractedForces->getDevicePointer()
        };
        cu.executeKernel(forceContractionKernels[contractedCopies], forceArgs, numParticles*numCopies, workgroupSize);
        ++contraction;
    }

    if (!groupsByCopies.empty()) {
        // The contracted evaluations overwrote the Context's positions.  Later code assumes the
        // Context holds the last copy, so load it again.

        int lastCopy = numCopies-1;
        void* loadArgs[] = {
            &velocities->getDevicePointer(),
            &cu.getVelm().getDevicePointer(),
            &positions->getDevicePointer(),
            &cu.getPosq().getDevicePointer(),
            &cu.getAtomIndexArray().getDevicePointer(),
            &lastCopy
        };
        cu.executeKernel(copyToContextKernel, loadArgs, cu.getNumAtoms());
    }
}
/**
 * Prepare the barostat for use with a context.
 *
 * Creates and initializes the ApplyMonteCarloBarostatKernel, sets the maximum trial volume
 * change for each axis to 1% of the current box volume, clears the per-axis acceptance
 * counters, and seeds the random number generator.
 *
 * @param context  the context this barostat is attached to
 */
void MonteCarloAnisotropicBarostatImpl::initialize(ContextImpl& context) {
    kernel = context.getPlatform().createKernel(ApplyMonteCarloBarostatKernel::Name(), context);
    kernel.getAs<ApplyMonteCarloBarostatKernel>().initialize(context.getSystem(), owner);
    Vec3 box[3];
    context.getPeriodicBoxVectors(box[0], box[1], box[2]);
    // Volume is taken as the product of the diagonal box elements.
    double volume = box[0][0]*box[1][1]*box[2][2];
    for (int i=0; i<3; i++) {
        volumeScale[i] = 0.01*volume;
        numAttempted[i] = 0;
        numAccepted[i] = 0;
    }
    int randSeed = owner.getRandomNumberSeed();
    // A random seed of 0 means use a unique one, as MonteCarloBarostatImpl::initialize() does.
    if (randSeed == 0) randSeed = osrngseed();
    init_gen_rand(randSeed, random);
}
/**
 * Prepare the barostat for use with a context.
 *
 * Creates and initializes the ApplyMonteCarloBarostatKernel, sets the maximum trial volume
 * change to 1% of the current box volume, clears the acceptance counters, and seeds the
 * random number generator.
 *
 * @param context  the context this barostat is attached to
 */
void MonteCarloBarostatImpl::initialize(ContextImpl& context) {
    // Set up the platform kernel.

    kernel = context.getPlatform().createKernel(ApplyMonteCarloBarostatKernel::Name(), context);
    kernel.getAs<ApplyMonteCarloBarostatKernel>().initialize(context.getSystem(), owner);

    // The starting step size is 1% of the box volume (product of the diagonal box elements).

    Vec3 a, b, c;
    context.getPeriodicBoxVectors(a, b, c);
    const double boxVolume = a[0]*b[1]*c[2];
    volumeScale = 0.01*boxVolume;
    numAccepted = 0;
    numAttempted = 0;

    // A seed of 0 is a request for a unique seed.

    int seed = owner.getRandomNumberSeed();
    if (seed == 0)
        seed = osrngseed();
    init_gen_rand(seed, random);
}
/**
 * Attempt an isotropic Monte Carlo volume move once every owner.getFrequency() calls.
 *
 * A trial volume change of at most +/- volumeScale is drawn, the coordinates and box vectors
 * are scaled uniformly, and the move is tested against
 *     w = (finalEnergy - initialEnergy) + pressure*deltaVolume - N*kT*ln(newVolume/volume)
 * where N is the number of molecules.  The move is rejected when w > 0 and a uniform random
 * number exceeds exp(-w/kT); rejection restores the coordinates and the box vectors.
 *
 * Once at least 10 moves have been attempted, volumeScale is divided by 1.1 if fewer than 25%
 * were accepted, or multiplied by 1.1 (capped at 30% of the current volume) if more than 75%
 * were accepted.  The counters are reset whenever volumeScale is adjusted.
 *
 * Does nothing when the frequency is 0.
 *
 * @param context  the context whose box and coordinates may be modified
 */
void MonteCarloBarostatImpl::updateContextState(ContextImpl& context) {
    if (++step < owner.getFrequency() || owner.getFrequency() == 0)
        return;
    step = 0;

    // Compute the current potential energy.

    double initialEnergy = context.getOwner().getState(State::Energy).getPotentialEnergy();

    // Modify the periodic box size.

    Vec3 box[3];
    context.getPeriodicBoxVectors(box[0], box[1], box[2]);
    double volume = box[0][0]*box[1][1]*box[2][2];
    double deltaVolume = volumeScale*2*(genrand_real2(random)-0.5);
    double newVolume = volume+deltaVolume;
    // Scaling each length by the cube root of the volume ratio gives the requested volume.
    double lengthScale = std::pow(newVolume/volume, 1.0/3.0);
    kernel.getAs<ApplyMonteCarloBarostatKernel>().scaleCoordinates(context, lengthScale, lengthScale, lengthScale);
    context.getOwner().setPeriodicBoxVectors(box[0]*lengthScale, box[1]*lengthScale, box[2]*lengthScale);

    // Compute the energy of the modified system.

    double finalEnergy = context.getOwner().getState(State::Energy).getPotentialEnergy();
    // NOTE(review): the AVOGADRO*1e-25 factor presumably converts the pressure parameter from bar
    // to kJ/mol/nm^3 -- confirm against the MonteCarloBarostat documentation.
    double pressure = context.getParameter(MonteCarloBarostat::Pressure())*(AVOGADRO*1e-25);
    double kT = BOLTZ*context.getParameter(MonteCarloBarostat::Temperature());
    double w = finalEnergy-initialEnergy + pressure*deltaVolume - context.getMolecules().size()*kT*std::log(newVolume/volume);
    if (w > 0 && genrand_real2(random) > std::exp(-w/kT)) {
        // Reject the step.  The box returns to its original size, so 'volume' is still correct.

        kernel.getAs<ApplyMonteCarloBarostatKernel>().restoreCoordinates(context);
        context.getOwner().setPeriodicBoxVectors(box[0], box[1], box[2]);
    }
    else {
        // Accept the step.  Track the new box volume so the step size cap below uses it.

        numAccepted++;
        volume = newVolume;
    }
    numAttempted++;

    // Adapt the maximum trial volume change to keep the acceptance rate between 25% and 75%.

    if (numAttempted >= 10) {
        if (numAccepted < 0.25*numAttempted) {
            volumeScale /= 1.1;
            numAttempted = 0;
            numAccepted = 0;
        }
        else if (numAccepted > 0.75*numAttempted) {
            volumeScale = std::min(volumeScale*1.1, volume*0.3);
            numAttempted = 0;
            numAccepted = 0;
        }
    }
}
/**
 * Attempt a single-axis Monte Carlo volume move once every owner.getFrequency() calls.
 *
 * One of the axes enabled through getScaleX()/getScaleY()/getScaleZ() is picked at random.
 * A trial volume change of at most +/- volumeScale[axis] is drawn and applied by scaling the
 * coordinates and box along that axis only.  The move is tested against
 *     w = (finalEnergy - initialEnergy) + pressure*deltaVolume - N*kT*ln(newVolume/volume)
 * where N is the number of molecules and pressure is the parameter for the chosen axis.  The
 * move is rejected when w > 0 and a uniform random number exceeds exp(-w/kT); rejection
 * restores the coordinates and the box vectors.
 *
 * Once at least 10 moves have been attempted on an axis, volumeScale[axis] is divided by 1.1
 * if fewer than 25% were accepted, or multiplied by 1.1 (capped at 30% of the current volume)
 * if more than 75% were accepted.  That axis's counters are reset on each adjustment.
 *
 * Does nothing when the frequency is 0 or when no axis is allowed to scale.
 *
 * @param context  the context whose box and coordinates may be modified
 */
void MonteCarloAnisotropicBarostatImpl::updateContextState(ContextImpl& context) {
    if (++step < owner.getFrequency() || owner.getFrequency() == 0)
        return;
    if (!owner.getScaleX() && !owner.getScaleY() && !owner.getScaleZ())
        return;
    step = 0;
    
    // Compute the current potential energy.
    
    double initialEnergy = context.getOwner().getState(State::Energy).getPotentialEnergy();
    double pressure = 0.0;
    
    // Choose which axis to modify at random.  Draws that land on a disabled axis are discarded
    // and repeated; the check above guarantees that at least one axis is enabled.
    // NOTE(review): the AVOGADRO*1e-25 factor presumably converts the pressure parameters from
    // bar to kJ/mol/nm^3 -- confirm against the MonteCarloAnisotropicBarostat documentation.
    int axis = -1;
    while (true) {
        double rnd = genrand_real2(random)*3.0;
        if (rnd < 1.0) {
            if (owner.getScaleX()) {
                axis = 0;
                pressure = context.getParameter(MonteCarloAnisotropicBarostat::PressureX())*(AVOGADRO*1e-25);
                break;
            }
        } else if (rnd < 2.0) {
            if (owner.getScaleY()) {
                axis = 1;
                pressure = context.getParameter(MonteCarloAnisotropicBarostat::PressureY())*(AVOGADRO*1e-25);
                break;
            }
        } else if (owner.getScaleZ()) {
            axis = 2;
            pressure = context.getParameter(MonteCarloAnisotropicBarostat::PressureZ())*(AVOGADRO*1e-25);
            break;
        }
    }
    
    // Modify the periodic box size.
    
    Vec3 box[3];
    context.getPeriodicBoxVectors(box[0], box[1], box[2]);
    double volume = box[0][0]*box[1][1]*box[2][2];
    double deltaVolume = volumeScale[axis]*2*(genrand_real2(random)-0.5);
    double newVolume = volume+deltaVolume;
    // Only one axis is scaled, so its length factor equals the volume ratio.
    Vec3 lengthScale(1.0, 1.0, 1.0);
    lengthScale[axis] = newVolume/volume;
    kernel.getAs<ApplyMonteCarloBarostatKernel>().scaleCoordinates(context, lengthScale[0], lengthScale[1], lengthScale[2]);
    context.getOwner().setPeriodicBoxVectors(box[0]*lengthScale[0], box[1]*lengthScale[1], box[2]*lengthScale[2]);
    
    // Compute the energy of the modified system.
    
    double finalEnergy = context.getOwner().getState(State::Energy).getPotentialEnergy();
    double kT = BOLTZ*owner.getTemperature();
    double w = finalEnergy-initialEnergy + pressure*deltaVolume - context.getMolecules().size()*kT*std::log(newVolume/volume);
    if (w > 0 && genrand_real2(random) > std::exp(-w/kT)) {
        // Reject the step.  The box returns to its original size, so 'volume' is still correct.
        
        kernel.getAs<ApplyMonteCarloBarostatKernel>().restoreCoordinates(context);
        context.getOwner().setPeriodicBoxVectors(box[0], box[1], box[2]);
    }
    else {
        // Accept the step.  Track the new box volume so the step size cap below uses it.
        
        numAccepted[axis]++;
        volume = newVolume;
    }
    numAttempted[axis]++;
    
    // Adapt this axis's maximum trial volume change to keep acceptance between 25% and 75%.
    
    if (numAttempted[axis] >= 10) {
        if (numAccepted[axis] < 0.25*numAttempted[axis]) {
            volumeScale[axis] /= 1.1;
            numAttempted[axis] = 0;
            numAccepted[axis] = 0;
        }
        else if (numAccepted[axis] > 0.75*numAttempted[axis]) {
            volumeScale[axis] = std::min(volumeScale[axis]*1.1, volume*0.3);
            numAttempted[axis] = 0;
            numAccepted[axis] = 0;
        }
    }
}
/**
 * Evaluate the forces for every copy of the system and store them in forces[copy].
 *
 * Force groups in groupsNotContracted are evaluated once per copy.  Each entry of
 * groupsByCopies (number of contracted copies -> force group flags) is then evaluated on
 * positions contracted to that many copies by truncating their Fourier transform, and the
 * resulting forces are expanded back to all copies by zero padding in the frequency domain
 * and added to forces.
 *
 * If updateContextState() changes the periodic box vectors while a copy is loaded, the
 * displacement it applied to that copy's particles is added to every other copy as well.
 *
 * On return the context holds the positions of the last copy.
 *
 * @param context     the context used to evaluate forces for one copy at a time
 * @param integrator  the integrator being used (not read by this function)
 */
void ReferenceIntegrateRPMDStepKernel::computeForces(ContextImpl& context, const RPMDIntegrator& integrator) {
    const int totalCopies = positions.size();
    const int numParticles = positions[0].size();
    // These are references to the context's own arrays: assigning to pos/vel loads a copy into
    // the context, and f always reflects the most recently computed forces.
    vector<RealVec>& pos = extractPositions(context);
    vector<RealVec>& vel = extractVelocities(context);
    vector<RealVec>& f = extractForces(context);
    
    // Compute forces from all groups that didn't have a specified contraction.
    
    for (int i = 0; i < totalCopies; i++) {
        pos = positions[i];
        vel = velocities[i];
        context.computeVirtualSites();
        Vec3 initialBox[3];
        context.getPeriodicBoxVectors(initialBox[0], initialBox[1], initialBox[2]);
        context.updateContextState();
        Vec3 finalBox[3];
        context.getPeriodicBoxVectors(finalBox[0], finalBox[1], finalBox[2]);
        if (initialBox[0] != finalBox[0] || initialBox[1] != finalBox[1] || initialBox[2] != finalBox[2]) {
            // A barostat was applied during updateContextState().  Adjust the particle positions in all the
            // other copies to match this one.
            
            for (int j = 0; j < numParticles; j++) {
                Vec3 delta = pos[j]-positions[i][j];
                for (int k = 0; k < totalCopies; k++)
                    if (k != i)
                        positions[k][j] += delta;
            }
        }
        // Keep whatever computeVirtualSites() and updateContextState() changed for this copy.
        positions[i] = pos;
        velocities[i] = vel;
        context.calcForcesAndEnergy(true, false, groupsNotContracted);
        forces[i] = f;
    }
    
    // Now loop over contractions and compute forces from them.
    
    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
        int copies = iter->first;
        int groupFlags = iter->second;
        fftpack* shortFFT = contractionFFT[copies];
        
        // Find the contracted positions.
        
        vector<t_complex> q(totalCopies);
        const RealOpenMM scale1 = 1.0/totalCopies;
        for (int particle = 0; particle < numParticles; particle++) {
            for (int component = 0; component < 3; component++) {
                // Transform to the frequency domain, set high frequency components to zero, and transform back.
                
                for (int k = 0; k < totalCopies; k++)
                    q[k] = t_complex(positions[k][particle][component], 0.0);
                fftpack_exec_1d(fft, FFTPACK_FORWARD, &q[0], &q[0]);
                if (copies > 1) {
                    // Move the upper tail of the spectrum down so that the first 'copies' entries
                    // hold the retained components, then invert with the shorter transform.
                    int start = (copies+1)/2;
                    int end = totalCopies-copies+start;
                    for (int k = end; k < totalCopies; k++)
                        q[k-(totalCopies-copies)] = q[k];
                    fftpack_exec_1d(shortFFT, FFTPACK_BACKWARD, &q[0], &q[0]);
                }
                for (int k = 0; k < copies; k++)
                    contractedPositions[k][particle][component] = scale1*q[k].re;
            }
        }
        
        // Compute forces.

        for (int i = 0; i < copies; i++) {
            pos = contractedPositions[i];
            context.computeVirtualSites();
            context.calcForcesAndEnergy(true, false, groupFlags);
            contractedForces[i] = f;
        }
        
        // Apply the forces to the original copies.
        
        const RealOpenMM scale2 = 1.0/copies;
        for (int particle = 0; particle < numParticles; particle++) {
            for (int component = 0; component < 3; component++) {
                // Transform to the frequency domain, pad with zeros, and transform back.
                
                for (int k = 0; k < copies; k++)
                    q[k] = t_complex(contractedForces[k][particle][component], 0.0);
                if (copies > 1)
                    fftpack_exec_1d(shortFFT, FFTPACK_FORWARD, &q[0], &q[0]);
                // Move the upper tail back to the end of the full-length spectrum before zeroing
                // the gap; the copy reads from indices below 'end', so it must come first.
                int start = (copies+1)/2;
                int end = totalCopies-copies+start;
                for (int k = end; k < totalCopies; k++)
                    q[k] = q[k-(totalCopies-copies)];
                for (int k = start; k < end; k++)
                    q[k] = t_complex(0, 0);
                fftpack_exec_1d(fft, FFTPACK_BACKWARD, &q[0], &q[0]);
                for (int k = 0; k < totalCopies; k++)
                    forces[k][particle][component] += scale2*q[k].re;
            }
        }
    }
    if (groupsByCopies.size() > 0) {
        // Ensure the Context contains the positions from the last copy, as the CUDA implementation
        // does.  The contracted evaluations above left contracted positions in the Context; the
        // velocities were not touched, so they still belong to the last copy.
        
        pos = positions[totalCopies-1];
    }
}