/* Returns the root-mean-square force over the shell particles and the
 * ndir extra degrees of freedom.
 *
 * cr      communication record; when PAR(cr) is true the partial sums
 *         are added up over the ranks with gmx_sumd
 * f       force array, indexed by the shell index stored in s[i].shell
 * ns      number of entries in s on this rank
 * s       shell descriptors; only the .shell index is read here
 * ndir    count added to the (globally summed) number of shells in the
 *         denominator; it is added after the reduction
 * sf_dir  in: local squared-force contribution that seeds the sum,
 *         out (parallel runs only): its sum over the ranks
 * Epot    in/out (parallel runs only): replaced by its sum over the ranks
 *
 * The result is sqrt(sum/ntot), or 0 when ntot is 0.
 */
static real rms_force(t_commrec *cr, rvec f[], int ns, t_shell s[],
                      int ndir, real *sf_dir, real *Epot)
{
    int    i, shell, ntot;
    double buf[4];

    /* buf[0] accumulates the squared force norms, starting from *sf_dir */
    buf[0] = *sf_dir;
    for (i = 0; i < ns; i++)
    {
        shell    = s[i].shell;
        buf[0]  += norm2(f[shell]);
    }
    ntot = ns;

    if (PAR(cr))
    {
        /* Pack everything into one buffer so a single reduction suffices */
        buf[1] = ntot;
        buf[2] = *sf_dir;
        buf[3] = *Epot;
        gmx_sumd(4, buf, cr);
        /* The count travelled as a double: round it back to an integer */
        ntot    = (int)(buf[1] + 0.5);
        *sf_dir = buf[2];
        *Epot   = buf[3];
    }
    ntot += ndir;

    return (ntot ? sqrt(buf[0]/ntot) : 0);
}
/* Sums the contents of bin b over the ranks of cr.
 *
 * The entries from b->nreal up to b->maxreal are set to zero first, so
 * that the whole buffer of b->maxreal doubles can be passed to gmx_sumd
 * without unused entries contributing to the result.
 */
void sum_bin(t_bin *b, t_commrec *cr)
{
    int i;

    for (i = b->nreal; (i < b->maxreal); i++)
    {
        b->rbuf[i] = 0;
    }
    gmx_sumd(b->maxreal, b->rbuf, cr);
}
/* Get the center from local positions that already have the correct
 * PBC representation.
 *
 * The local weighted position sum and weight sum are obtained from
 * get_sum_of_positions(). When PAR(cr) is true, both are added up over
 * the ranks in a single gmx_sumd call. The summed vector is then divided
 * by the total weight when weight_loc is given, or by nr_group when
 * weight_loc is NULL, and the result is stored in center.
 *
 * No check is made for a zero total weight or nr_group == 0.
 */
extern void get_center_comm(
        const t_commrec *cr,
        rvec             x_loc[],      /* Local positions */
        real             weight_loc[], /* Local masses or other weights */
        int              nr_loc,       /* Local number of atoms */
        int              nr_group,     /* Total number of atoms of the group */
        rvec             center)       /* Weighted center */
{
    double weight_sum, denom;
    dvec   dsumvec;
    double buf[4];

    weight_sum = get_sum_of_positions(x_loc, weight_loc, nr_loc, dsumvec);

    /* Add the local contributions from all nodes. Put the sum vector and the
     * weight in a buffer array so that we get along with a single communication
     * call. */
    if (PAR(cr))
    {
        buf[0] = dsumvec[XX];
        buf[1] = dsumvec[YY];
        buf[2] = dsumvec[ZZ];
        buf[3] = weight_sum;

        /* Communicate buffer */
        gmx_sumd(4, buf, cr);

        dsumvec[XX] = buf[0];
        dsumvec[YY] = buf[1];
        dsumvec[ZZ] = buf[2];
        weight_sum  = buf[3];
    }

    /* The two normalizations are alternatives: exactly one may be applied */
    if (weight_loc != NULL)
    {
        denom = 1.0/weight_sum; /* Divide by the sum of weight to get center of mass e.g. */
    }
    else
    {
        denom = 1.0/nr_group;   /* Divide by the number of atoms to get the geometrical center */
    }

    center[XX] = dsumvec[XX]*denom;
    center[YY] = dsumvec[YY]*denom;
    center[ZZ] = dsumvec[ZZ]*denom;
}
/* Sums the n doubles in data over ranks, in place.
 *
 * Nothing is done unless cr is non-NULL and PAR(cr) is true. When
 * comm->bParticipateAll is set the reduction goes through gmx_sumd;
 * the remaining statements use MPI_Allreduce directly, either in place
 * or through a temporary buffer that is copied back into data.
 *
 * NOTE(review): in this view the block delimiters, any else/preprocessor
 * branches that select between the statements below, and the tail of both
 * MPI_Allreduce argument lists are not visible. Confirm the branch
 * structure and the communicator argument against the complete file.
 */
static void pull_reduce_double(t_commrec   *cr,
                               pull_comm_t *comm,
                               int          n,
                               double      *data)
    if (cr != NULL && PAR(cr))
        if (comm->bParticipateAll)
            /* Sum the contributions over all DD ranks */
            gmx_sumd(n, data, cr);
            /* NOTE(review): presumably the alternative for ranks that do not
             * all participate; the call is cut off after MPI_SUM here. */
            MPI_Allreduce(MPI_IN_PLACE, data, n, MPI_DOUBLE, MPI_SUM,
            /* Out-of-place variant: reduce into buf, then copy back.
             * NOTE(review): no release of buf is visible in this view. */
            double *buf;

            snew(buf, n);

            MPI_Allreduce(data, buf, n, MPI_DOUBLE, MPI_SUM,

            /* Copy the result from the buffer to the input/output data */
            for (int i = 0; i < n; i++)
                data[i] = buf[i];
            /* Reports an internal inconsistency; per the message this is the
             * case of partial participation without MPI support. */
            gmx_incons("comm->bParticipateAll=FALSE without GMX_MPI");
/* Calculates the center of mass of every pull group from the coordinates x
 * and, when xp is non-NULL, also from the second coordinate set xp.
 *
 * Outline, as far as visible here:
 *  1. pull->rbuf (ngroup entries) and pull->dbuf (3*ngroup entries) are
 *     allocated on first use.
 *  2. When pull->bRefAt is set, pull_set_pbcatoms() fills pull->rbuf.
 *  3. Per group, local sums are accumulated over the nat_loc local atoms
 *     and stored in three consecutive dbuf slots (g*3, g*3+1, g*3+2).
 *  4. When cr is non-NULL and PAR(cr) is true, all of dbuf is summed over
 *     the ranks in one gmx_sumd call.
 *  5. Per group, the global sums are turned into pgrp->x, pgrp->xp,
 *     pgrp->wscale and pgrp->invtm.
 *  6. When PULL_CYL(pull) is true, make_cyl_refgrps() is called; t is only
 *     forwarded to that call.
 *
 * NOTE(review): this view lacks block delimiters and several lines appear
 * to be missing (see the notes below), so the exact branch structure has to
 * be confirmed against the complete file.
 */
void pull_calc_coms(t_commrec *cr,
                    t_pull *pull, t_mdatoms *md, t_pbc *pbc, double t,
                    rvec x[], rvec *xp)
    int           g, i, ii, m;
    real          mass, w, wm, twopi_box = 0;
    double        wmass, wwmass, invwmass;
    dvec          com, comp;
    double        cm, sm, cmp, smp, ccm, csm, ssm, csw, snw;
    /* NOTE(review): xx is not referenced anywhere in the visible body */
    rvec         *xx[2], x_pbc = {0, 0, 0}, dx;
    t_pull_group *pgrp;

    /* Lazy allocation of the communication buffers */
    if (pull->rbuf == NULL)
        snew(pull->rbuf, pull->ngroup);
    if (pull->dbuf == NULL)
        snew(pull->dbuf, 3*pull->ngroup);

    if (pull->bRefAt)
        pull_set_pbcatoms(cr, pull, md, x, pull->rbuf);

    /* Cosine weighting is rejected when the box has a non-zero
     * off-diagonal element box[m][cosdim] for m > cosdim. */
    if (pull->cosdim >= 0)
        for (m = pull->cosdim+1; m < pull->npbcdim; m++)
            if (pbc->box[m][pull->cosdim] != 0)
                gmx_fatal(FARGS, "Can not do cosine weighting for trilinic dimensions");
        twopi_box = 2.0*M_PI/pbc->box[pull->cosdim][pull->cosdim];

    /* First pass: local sums per group */
    for (g = 0; g < pull->ngroup; g++)
        pgrp = &pull->group[g];
        /* NOTE(review): the scalar sums are reset here, but no reset of the
         * vector sums com and comp is visible although both are only
         * updated with += below; confirm they are cleared per group. */
        wmass  = 0;
        wwmass = 0;
        cm     = 0;
        sm     = 0;
        cmp    = 0;
        smp    = 0;
        ccm    = 0;
        csm    = 0;
        ssm    = 0;
        /* Group 0 is not summed here when PULL_CYL(pull) is true */
        if (!(g == 0 && PULL_CYL(pull)))
            if (pgrp->epgrppbc == epgrppbcREFAT)
                /* Set the pbc atom */
                copy_rvec(pull->rbuf[g], x_pbc);
            w = 1;
            for (i = 0; i < pgrp->nat_loc; i++)
                ii   = pgrp->ind_loc[i];
                mass = md->massT[ii];
                if (pgrp->epgrppbc != epgrppbcCOS)
                    if (pgrp->weight_loc)
                        w = pgrp->weight_loc[i];
                    /* wmass sums w*m, wwmass sums w*w*m */
                    wm      = w*mass;
                    wmass  += wm;
                    wwmass += wm*w;
                    if (pgrp->epgrppbc == epgrppbcNONE)
                        /* Plain COM: sum the coordinates */
                        for (m = 0; m < DIM; m++)
                            com[m]    += wm*x[ii][m];
                        if (xp)
                            for (m = 0; m < DIM; m++)
                                comp[m] += wm*xp[ii][m];
                        /* Sum the difference with the reference atom */
                        pbc_dx(pbc, x[ii], x_pbc, dx);
                        for (m = 0; m < DIM; m++)
                            com[m]    += wm*dx[m];
                        /* NOTE(review): the comment opened two lines below
                         * has no terminator in this view, so lexically it
                         * runs on to the end of the "Determine cos and sin
                         * sums" comment and swallows the comp[] update. */
                        if (xp)
                            /* For xp add the difference between xp and x to dx,
                             * such that we use the same periodic image,
                             * also when xp has a large displacement.
                            for (m = 0; m < DIM; m++)
                                comp[m] += wm*(dx[m] + xp[ii][m] - x[ii][m]);
                    /* Determine cos and sin sums */
                    csw  = cos(x[ii][pull->cosdim]*twopi_box);
                    snw  = sin(x[ii][pull->cosdim]*twopi_box);
                    cm  += csw*mass;
                    sm  += snw*mass;
                    ccm += csw*csw*mass;
                    csm += csw*snw*mass;
                    ssm += snw*snw*mass;

                    if (xp)
                        csw  = cos(xp[ii][pull->cosdim]*twopi_box);
                        snw  = sin(xp[ii][pull->cosdim]*twopi_box);
                        cmp += csw*mass;
                        smp += snw*mass;

        /* Copy local sums to a buffer for global summing */
        /* Slot layout for group g:
         *   NONE/REFAT: [g*3] = com, [g*3+1] = comp, [g*3+2] = {wmass, wwmass, 0}
         *   COS:        [g*3] = {cm, sm, 0}, [g*3+1] = {ccm, csm, ssm},
         *               [g*3+2] = {cmp, smp, 0}
         * NOTE(review): no break statements are visible between the cases in
         * this view; confirm that the first case does not fall through. */
        switch (pgrp->epgrppbc)
            case epgrppbcNONE:
            case epgrppbcREFAT:
                copy_dvec(com, pull->dbuf[g*3]);
                copy_dvec(comp, pull->dbuf[g*3+1]);
                pull->dbuf[g*3+2][0] = wmass;
                pull->dbuf[g*3+2][1] = wwmass;
                pull->dbuf[g*3+2][2] = 0;
            case epgrppbcCOS:
                pull->dbuf[g*3  ][0] = cm;
                pull->dbuf[g*3  ][1] = sm;
                pull->dbuf[g*3  ][2] = 0;
                pull->dbuf[g*3+1][0] = ccm;
                pull->dbuf[g*3+1][1] = csm;
                pull->dbuf[g*3+1][2] = ssm;
                pull->dbuf[g*3+2][0] = cmp;
                pull->dbuf[g*3+2][1] = smp;
                pull->dbuf[g*3+2][2] = 0;

    if (cr && PAR(cr))
        /* Sum the contributions over the nodes */
        gmx_sumd(pull->ngroup*3*DIM, pull->dbuf[0], cr);

    /* Second pass: turn the global sums into per-group results */
    for (g = 0; g < pull->ngroup; g++)
        pgrp = &pull->group[g];
        if (pgrp->nat > 0 && !(g == 0 && PULL_CYL(pull)))
            if (pgrp->epgrppbc != epgrppbcCOS)
                /* Determine the inverse mass */
                wmass    = pull->dbuf[g*3+2][0];
                wwmass   = pull->dbuf[g*3+2][1];
                invwmass = 1/wmass;
                /* invtm==0 signals a frozen group, so then we should keep it zero */
                if (pgrp->invtm > 0)
                    pgrp->wscale = wmass/wwmass;
                    pgrp->invtm  = 1.0/(pgrp->wscale*wmass);
                /* Divide by the total mass */
                for (m = 0; m < DIM; m++)
                    pgrp->x[m]    = pull->dbuf[g*3  ][m]*invwmass;
                    if (xp)
                        pgrp->xp[m] = pull->dbuf[g*3+1][m]*invwmass;
                    /* For REFAT the sums were taken relative to the reference
                     * position in rbuf[g], so that position is added back. */
                    if (pgrp->epgrppbc == epgrppbcREFAT)
                        pgrp->x[m]    += pull->rbuf[g][m];
                        if (xp)
                            pgrp->xp[m] += pull->rbuf[g][m];
                /* Determine the optimal location of the cosine weight */
                csw                   = pull->dbuf[g*3][0];
                snw                   = pull->dbuf[g*3][1];
                pgrp->x[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box;
                /* Set the weights for the local atoms */
                wmass  = sqrt(csw*csw + snw*snw);
                /* NOTE(review): the expression below ends on a '+' with no
                 * closing parenthesis in this view; its last term is missing. */
                wwmass = (pull->dbuf[g*3+1][0]*csw*csw +
                          pull->dbuf[g*3+1][1]*csw*snw +
                pgrp->wscale = wmass/wwmass;
                pgrp->invtm  = 1.0/(pgrp->wscale*wmass);
                /* Set the weights for the local atoms */
                csw *= pgrp->invtm;
                snw *= pgrp->invtm;
                for (i = 0; i < pgrp->nat_loc; i++)
                    ii                  = pgrp->ind_loc[i];
                    /* NOTE(review): this assignment also ends on a '+';
                     * the second term is not visible here. */
                    pgrp->weight_loc[i] = csw*cos(twopi_box*x[ii][pull->cosdim]) +
                if (xp)
                    csw                    = pull->dbuf[g*3+2][0];
                    snw                    = pull->dbuf[g*3+2][1];
                    pgrp->xp[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box;
            if (debug)
                fprintf(debug, "Pull group %d wmass %f wwmass %f invtm %f\n",
                        g, wmass, wwmass, pgrp->invtm);

    if (PULL_CYL(pull))
        /* Calculate the COMs for the cylinder reference groups */
        make_cyl_refgrps(cr, pull, md, pbc, t, x, xp);
/* Builds, for every pull coordinate, a dynamic reference group from the
 * local atoms of the main reference group that lie inside a cylinder.
 *
 * For coordinate c the cylinder axis runs along pcrd->vec through
 * g_x = pgrp->x - vec*(init + rate*t), with pgrp the second group of the
 * coordinate. A home atom is selected when the squared length of dx minus
 * its projection on the axis direction is smaller than cyl_r0 squared.
 * Selected atoms get their local index and a weight from get_weight()
 * appended to pdyna->ind_loc / pdyna->weight_loc.
 *
 * Per coordinate four doubles are stored in pull->dbuf_cyl:
 *   [c*4+0] sum of mass*weight          (wmass)
 *   [c*4+1] sum of mass*weight^2        (wwmass)
 *   [c*4+2] sum of mass*weight*inp      (along-axis sum for x)
 *   [c*4+3] the same for xp, only accumulated when xp is non-NULL
 * These are summed over the ranks when cr is non-NULL and PAR(cr) is true,
 * after which pdyna->wscale, pdyna->invtm, pdyna->x and (with xp)
 * pdyna->xp are set.
 *
 * No check is made for wmass == 0 (no atom inside the cylinder).
 */
static void make_cyl_refgrps(t_commrec *cr, t_pull *pull, t_mdatoms *md,
                             t_pbc *pbc, double t, rvec *x, rvec *xp)
{
    int           c, i, ii, m, start, end;
    rvec          g_x, dx, dir;
    double        r0_2, sum_a, sum_ap, dr2, mass, weight, wmass, wwmass, inp;
    t_pull_coord *pcrd;
    t_pull_group *pref, *pgrp, *pdyna;
    gmx_ga2la_t   ga2la = NULL;

    if (pull->dbuf_cyl == NULL)
    {
        snew(pull->dbuf_cyl, pull->ncoord*4);
    }

    /* With domain decomposition global indices must be mapped to local ones */
    if (cr && DOMAINDECOMP(cr))
    {
        ga2la = cr->dd->ga2la;
    }

    start = 0;
    end   = md->homenr;

    r0_2 = dsqr(pull->cyl_r0);

    /* loop over all groups to make a reference group for each*/
    for (c = 0; c < pull->ncoord; c++)
    {
        pcrd  = &pull->coord[c];

        /* pref will be the same group for all pull coordinates */
        pref  = &pull->group[pcrd->group[0]];
        pgrp  = &pull->group[pcrd->group[1]];
        pdyna = &pull->dyna[c];
        copy_rvec(pcrd->vec, dir);
        sum_a          = 0;
        sum_ap         = 0;
        wmass          = 0;
        wwmass         = 0;
        pdyna->nat_loc = 0;

        for (m = 0; m < DIM; m++)
        {
            g_x[m] = pgrp->x[m] - pcrd->vec[m]*(pcrd->init + pcrd->rate*t);
        }

        /* loop over all atoms in the main ref group */
        for (i = 0; i < pref->nat; i++)
        {
            ii = pref->ind[i];
            if (ga2la)
            {
                /* Atoms that are not home on this rank are marked with -1
                 * so that the range check below rejects them. */
                if (!ga2la_get_home(ga2la, pref->ind[i], &ii))
                {
                    ii = -1;
                }
            }
            if (ii >= start && ii < end)
            {
                pbc_dx_aiuc(pbc, x[ii], g_x, dx);
                inp = iprod(dir, dx);
                dr2 = 0;
                for (m = 0; m < DIM; m++)
                {
                    dr2 += dsqr(dx[m] - inp*dir[m]);
                }

                if (dr2 < r0_2)
                {
                    /* add to index, to sum of COM, to weight array */
                    if (pdyna->nat_loc >= pdyna->nalloc_loc)
                    {
                        pdyna->nalloc_loc = over_alloc_large(pdyna->nat_loc+1);
                        srenew(pdyna->ind_loc, pdyna->nalloc_loc);
                        srenew(pdyna->weight_loc, pdyna->nalloc_loc);
                    }
                    pdyna->ind_loc[pdyna->nat_loc] = ii;
                    mass   = md->massT[ii];
                    weight = get_weight(sqrt(dr2), pull->cyl_r1, pull->cyl_r0);
                    pdyna->weight_loc[pdyna->nat_loc] = weight;
                    sum_a += mass*weight*inp;
                    if (xp)
                    {
                        pbc_dx_aiuc(pbc, xp[ii], g_x, dx);
                        inp     = iprod(dir, dx);
                        sum_ap += mass*weight*inp;
                    }
                    wmass  += mass*weight;
                    wwmass += mass*sqr(weight);
                    /* Advance the local atom count; without this every
                     * selected atom would overwrite entry 0 and the group
                     * would appear empty. */
                    pdyna->nat_loc++;
                }
            }
        }
        pull->dbuf_cyl[c*4+0] = wmass;
        pull->dbuf_cyl[c*4+1] = wwmass;
        pull->dbuf_cyl[c*4+2] = sum_a;
        pull->dbuf_cyl[c*4+3] = sum_ap;
    }

    if (cr && PAR(cr))
    {
        /* Sum the contributions over the nodes */
        gmx_sumd(pull->ncoord*4, pull->dbuf_cyl, cr);
    }

    for (c = 0; c < pull->ncoord; c++)
    {
        pcrd  = &pull->coord[c];

        pdyna = &pull->dyna[c];
        pgrp  = &pull->group[pcrd->group[1]];

        wmass         = pull->dbuf_cyl[c*4+0];
        wwmass        = pull->dbuf_cyl[c*4+1];
        pdyna->wscale = wmass/wwmass;
        pdyna->invtm  = 1.0/(pdyna->wscale*wmass);

        for (m = 0; m < DIM; m++)
        {
            g_x[m]      = pgrp->x[m] - pcrd->vec[m]*(pcrd->init + pcrd->rate*t);
            pdyna->x[m] = g_x[m] + pcrd->vec[m]*pull->dbuf_cyl[c*4+2]/wmass;
            if (xp)
            {
                pdyna->xp[m] = g_x[m] + pcrd->vec[m]*pull->dbuf_cyl[c*4+3]/wmass;
            }
        }

        if (debug)
        {
            fprintf(debug, "Pull cylinder group %d:%8.3f%8.3f%8.3f m:%8.3f\n",
                    c, pdyna->x[0], pdyna->x[1],
                    pdyna->x[2], 1.0/pdyna->invtm);
        }
    }
}
                          t_commrec                 *cr,
                          FILE                      *fp_err,
                          FILE                      *fp_log,
                          t_inputrec                *ir,
                          t_state                   *state,
                          double                     cycles,
                          interaction_const_t       *ic,
                          struct nonbonded_verlet_t *nbv,
                          struct gmx_pme_t **        pmedata,
                          gmx_int64_t                step)
    gmx_bool     OK;
    pme_setup_t *set;
    double       cycles_fast;
    char         buf[STRLEN], sbuf[22];
    real         rtab;
    gmx_bool     bUsesSimpleTables = TRUE;

    if (pme_lb->stage == pme_lb->nstage)
        return FALSE;

    if (PAR(cr))
        gmx_sumd(1, &cycles, cr);
        cycles /= cr->nnodes;

    set = &pme_lb->setup[pme_lb->cur];

    rtab = ir->rlistlong + ir->tabext;

    if (set->count % 2 == 1)
        /* Skip the first cycle, because the first step after a switch
         * is much slower due to allocation and/or caching effects.
        return TRUE;

    sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf));
    print_grid(fp_err, fp_log, buf, "timed with", set, cycles);

    if (set->count <= 2)
        set->cycles = cycles;
        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
            pme_lb->stage == pme_lb->nstage - 1)
            /* The performance went up a lot (due to e.g. DD load balancing).
             * Add a stage, keep the minima, but rescan all setups.

            if (debug)
                fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
                        "Increased the number stages to %d"
                        " and ignoring the previous performance\n",
                        set->grid[XX], set->grid[YY], set->grid[ZZ],
                        cycles*1e-6, set->cycles*1e-6, PME_LB_ACCEL_TOL,
        set->cycles = min(set->cycles, cycles);

    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
        pme_lb->fastest = pme_lb->cur;

        if (DOMAINDECOMP(cr))
            /* We found a new fastest setting, ensure that with subsequent
             * shorter cut-off's the dynamic load balancing does not make
             * the use of the current cut-off impossible. This solution is
             * a trade-off, as the PME load balancing and DD domain size
             * load balancing can interact in complex ways.
             * With the Verlet kernels, DD load imbalance will usually be
             * mainly due to bonded interaction imbalance, which will often
             * quickly push the domain boundaries beyond the limit for the
             * optimal, PME load balanced, cut-off. But it could be that
             * better overall performance can be obtained with a slightly
             * shorter cut-off and better DD load balancing.
    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;

    /* Check in stage 0 if we should stop scanning grids.
     * Stop when the time is more than SLOW_FAC longer than the fastest.
    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
        pme_lb->n = pme_lb->cur + 1;
        /* Done with scanning, go to stage 1 */

    if (pme_lb->stage == 0)
        int gridsize_start;

        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];

            if (pme_lb->cur+1 < pme_lb->n)
                /* We had already generated the next setup */
                OK = TRUE;
                /* Find the next setup */
                OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order, cr->dd);

                if (!OK)
                    pme_lb->elimited = epmelblimPMEGRID;

            if (OK && ir->ePBC != epbcNONE)
                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
                      <= max_cutoff2(ir->ePBC, state->box));
                if (!OK)
                    pme_lb->elimited = epmelblimBOX;

            if (OK)

                if (DOMAINDECOMP(cr))
                    OK = change_dd_cutoff(cr, state, ir,
                    if (!OK)
                        /* Failed: do not use this setup */
                        pme_lb->elimited = epmelblimDD;
            if (!OK)
                /* We hit the upper limit for the cut-off,
                 * the setup should not go further than cur.
                pme_lb->n = pme_lb->cur + 1;
                print_loadbal_limited(fp_err, fp_log, step, pme_lb);
                /* Switch to the next stage */
        while (OK &&
                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
                 pme_lb->setup[pme_lb->cur].grid_efficiency <

    if (pme_lb->stage > 0 && pme_lb->end == 1)
        pme_lb->cur   = 0;
        pme_lb->stage = pme_lb->nstage;
    else if (pme_lb->stage > 0 && pme_lb->end > 1)
        /* If stage = nstage-1:
         *   scan over all setups, rerunning only those setups
         *   which are not much slower than the fastest
         * else:
         *   use the next setup
            if (pme_lb->cur == pme_lb->end)
                pme_lb->cur = pme_lb->start;
        while (pme_lb->stage == pme_lb->nstage - 1 &&
               pme_lb->setup[pme_lb->cur].count > 0 &&
               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);

        if (pme_lb->stage == pme_lb->nstage)
            /* We are done optimizing, use the fastest setup we found */
            pme_lb->cur = pme_lb->fastest;

    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong);
        if (!OK)
            /* Failsafe solution */
            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
            pme_lb->fastest  = 0;
            pme_lb->start    = 0;
            pme_lb->end      = pme_lb->cur;
            pme_lb->cur      = pme_lb->start;
            pme_lb->elimited = epmelblimDD;
            print_loadbal_limited(fp_err, fp_log, step, pme_lb);

    /* Change the Coulomb cut-off and the PME grid */

    set = &pme_lb->setup[pme_lb->cur];

    ic->rcoulomb     = set->rcut_coulomb;
    ic->rlist        = set->rlist;
    ic->rlistlong    = set->rlistlong;
    ir->nstcalclr    = set->nstcalclr;
    ic->ewaldcoeff_q = set->ewaldcoeff_q;
    /* TODO: centralize the code that sets the potentials shifts */
    if (ic->coulomb_modifier == eintmodPOTSHIFT)
        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
    if (EVDW_PME(ic->vdwtype))
        /* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */
        ic->rvdw            = set->rcut_coulomb;
        ic->ewaldcoeff_lj   = set->ewaldcoeff_lj;
        if (ic->vdw_modifier == eintmodPOTSHIFT)
            real crc2;

            ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
            ic->repulsion_shift.cpot  = -pow(ic->rvdw, -12.0);
            ic->sh_invrc6             = -ic->dispersion_shift.cpot;
            crc2                      = sqr(ic->ewaldcoeff_lj*ic->rvdw);
            ic->sh_lj_ewald           = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);

    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
    nbnxn_gpu_pme_loadbal_update_param(nbv, ic);

    /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
     * also sharing texture references. To keep the code simple, we don't
     * treat texture references as shared resources, but this means that
     * the coulomb_tab texture ref will get updated by multiple threads.
     * Hence, to ensure that the non-bonded kernels don't start before all
     * texture binding operations are finished, we need to wait for all ranks
     * to arrive here before continuing.
     * Note that we could omit this barrier if GPUs are not shared (or
     * texture objects are used), but as this is initialization code, there
     * is no point in complicating things.
    if (PAR(cr) && use_GPU(nbv))
#endif  /* GMX_THREAD_MPI */

    /* Usually we won't need the simple tables with GPUs.
     * But we do with hybrid acceleration and with free energy.
     * To avoid bugs, we always re-initialize the simple tables here.
    init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);

    if (cr->duty & DUTY_PME)
        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
            /* Generate a new PME data structure,
             * copying part of the old pointers.
                           cr, pme_lb->setup[0].pmedata, ir,
        *pmedata = set->pmedata;
        /* Tell our PME-only node to switch grid */
        gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj);

    if (debug)
        print_grid(NULL, debug, "", "switched to", set, -1);

    if (pme_lb->stage == pme_lb->nstage)
        print_grid(fp_err, fp_log, "", "optimal", set, -1);

    return TRUE;
/* Computes the average and standard deviation, per dimension, of the
 * centers of the first n charge groups.
 *
 * cgs     charge-group block; cgs->index[cg] .. cgs->index[cg+1]-1 are the
 *         atoms of charge group cg
 * n       number of local charge groups
 * x       atom coordinates
 * av      out: mean of the charge-group centers
 * stddev  out: sqrt(<c^2> - <c>^2) of the charge-group centers
 * cr_sum  when non-NULL, the sums and the count n are added up over the
 *         ranks with gmx_sumd before the statistics are formed
 *
 * The center of a charge group is the unweighted mean of its atom
 * coordinates; a single-atom group uses the atom position directly.
 * No check is made for a total count of zero.
 */
static void calc_cgcm_av_stddev(t_block *cgs, int n, rvec *x, rvec av, rvec stddev,
                                t_commrec *cr_sum)
{
    int   *cgindex;
    dvec   s1, s2;
    double buf[7];
    int    cg, d, k0, k1, k, nrcg;
    real   inv_ncg;
    rvec   cg_cm;

    /* The accumulators are only ever incremented, so start them at zero */
    for (d = 0; d < DIM; d++)
    {
        s1[d] = 0;
        s2[d] = 0;
    }

    cgindex = cgs->index;
    for (cg = 0; cg < n; cg++)
    {
        k0      = cgindex[cg];
        k1      = cgindex[cg+1];
        nrcg    = k1 - k0;
        if (nrcg == 1)
        {
            copy_rvec(x[k0], cg_cm);
        }
        else
        {
            inv_ncg = 1.0/nrcg;

            /* Reset the center before summing this group's atoms */
            for (d = 0; d < DIM; d++)
            {
                cg_cm[d] = 0;
            }
            for (k = k0; (k < k1); k++)
            {
                rvec_inc(cg_cm, x[k]);
            }
            for (d = 0; (d < DIM); d++)
            {
                cg_cm[d] *= inv_ncg;
            }
        }
        for (d = 0; d < DIM; d++)
        {
            s1[d] += cg_cm[d];
            s2[d] += cg_cm[d]*cg_cm[d];
        }
    }

    if (cr_sum != NULL)
    {
        /* Pack both sums and the count into one buffer for a single
         * reduction: buf[0..DIM-1] = s1, buf[DIM..2*DIM-1] = s2, buf[6] = n */
        for (d = 0; d < DIM; d++)
        {
            buf[d]     = s1[d];
            buf[DIM+d] = s2[d];
        }
        buf[6] = n;
        gmx_sumd(7, buf, cr_sum);
        for (d = 0; d < DIM; d++)
        {
            s1[d] = buf[d];
            s2[d] = buf[DIM+d];
        }
        /* The count travelled as a double: round it back to an integer */
        n = (int)(buf[6] + 0.5);
    }

    dsvmul(1.0/n, s1, s1);
    dsvmul(1.0/n, s2, s2);

    for (d = 0; d < DIM; d++)
    {
        av[d]     = s1[d];
        stddev[d] = sqrt(s2[d] - s1[d]*s1[d]);
    }
}
double do_tpi(FILE *fplog, t_commrec *cr,
              int nfile, const t_filenm fnm[],
              const output_env_t oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact,
              int gmx_unused nstglobalcomm,
              gmx_vsite_t gmx_unused *vsite, gmx_constr_t gmx_unused constr,
              int gmx_unused stepout,
              t_inputrec *inputrec,
              gmx_mtop_t *top_global, t_fcdata *fcd,
              t_state *state,
              t_mdatoms *mdatoms,
              t_nrnb *nrnb, gmx_wallcycle_t wcycle,
              gmx_edsam_t gmx_unused ed,
              t_forcerec *fr,
              int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
              gmx_membed_t gmx_unused membed,
              real gmx_unused cpt_period, real gmx_unused max_hours,
              const char gmx_unused *deviceOptions,
              int gmx_unused imdport,
              unsigned long gmx_unused Flags,
              gmx_walltime_accounting_t walltime_accounting)
    const char     *TPI = "Test Particle Insertion";
    gmx_localtop_t *top;
    gmx_groups_t   *groups;
    gmx_enerdata_t *enerd;
    rvec           *f;
    real            lambda, t, temp, beta, drmax, epot;
    double          embU, sum_embU, *sum_UgembU, V, V_all, VembU_all;
    t_trxstatus    *status;
    t_trxframe      rerun_fr;
    gmx_bool        bDispCorr, bCharge, bRFExcl, bNotLastFrame, bStateChanged, bNS;
    tensor          force_vir, shake_vir, vir, pres;
    int             cg_tp, a_tp0, a_tp1, ngid, gid_tp, nener, e;
    rvec           *x_mol;
    rvec            mu_tot, x_init, dx, x_tp;
    int             nnodes, frame;
    gmx_int64_t     frame_step_prev, frame_step;
    gmx_int64_t     nsteps, stepblocksize = 0, step;
    gmx_int64_t     rnd_count_stride, rnd_count;
    gmx_int64_t     seed;
    double          rnd[4];
    int             i, start, end;
    FILE           *fp_tpi = NULL;
    char           *ptr, *dump_pdb, **leg, str[STRLEN], str2[STRLEN];
    double          dbl, dump_ener;
    gmx_bool        bCavity;
    int             nat_cavity  = 0, d;
    real           *mass_cavity = NULL, mass_tot;
    int             nbin;
    double          invbinw, *bin, refvolshift, logV, bUlogV;
    real            dvdl, prescorr, enercorr, dvdlcorr;
    gmx_bool        bEnergyOutOfBounds;
    const char     *tpid_leg[2] = {"direct", "reweighted"};

    /* Since there is no upper limit to the insertion energies,
     * we need to set an upper limit for the distribution output.
    real bU_bin_limit      = 50;
    real bU_logV_bin_limit = bU_bin_limit + 10;

    nnodes = cr->nnodes;

    top = gmx_mtop_generate_local_top(top_global, inputrec);

    groups = &top_global->groups;

    bCavity = (inputrec->eI == eiTPIC);
    if (bCavity)
        ptr = getenv("GMX_TPIC_MASSES");
        if (ptr == NULL)
            nat_cavity = 1;
            /* Read (multiple) masses from env var GMX_TPIC_MASSES,
             * The center of mass of the last atoms is then used for TPIC.
            nat_cavity = 0;
            while (sscanf(ptr, "%lf%n", &dbl, &i) > 0)
                srenew(mass_cavity, nat_cavity+1);
                mass_cavity[nat_cavity] = dbl;
                fprintf(fplog, "mass[%d] = %f\n",
                        nat_cavity+1, mass_cavity[nat_cavity]);
                ptr += i;
            if (nat_cavity == 0)
                gmx_fatal(FARGS, "Found %d masses in GMX_TPIC_MASSES", nat_cavity);

    /* We never need full pbc for TPI */
    fr->ePBC = epbcXYZ;
    /* Determine the temperature for the Boltzmann weighting */
    temp = inputrec->opts.ref_t[0];
    if (fplog)
        for (i = 1; (i < inputrec->opts.ngtc); i++)
            if (inputrec->opts.ref_t[i] != temp)
                fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
                fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
                "\n  The temperature for test particle insertion is %.3f K\n\n",
    beta = 1.0/(BOLTZ*temp);

    /* Number of insertions per frame */
    nsteps = inputrec->nsteps;

    /* Use the same neighborlist with more insertions points
     * in a sphere of radius drmax around the initial point
    /* This should be a proper mdp parameter */
    drmax = inputrec->rtpi;

    /* An environment variable can be set to dump all configurations
     * to pdb with an insertion energy <= this value.
    dump_pdb  = getenv("GMX_TPI_DUMP");
    dump_ener = 0;
    if (dump_pdb)
        sscanf(dump_pdb, "%lf", &dump_ener);

    atoms2md(top_global, inputrec, 0, NULL, top_global->natoms, mdatoms);
    update_mdatoms(mdatoms, inputrec->fepvals->init_lambda);

    snew(enerd, 1);
    init_enerdata(groups->grps[egcENER].nr, inputrec->fepvals->n_lambda, enerd);
    snew(f, top_global->natoms);

    /* Print to log file  */
    wallcycle_start(wcycle, ewcRUN);
    print_start(fplog, cr, walltime_accounting, "Test Particle Insertion");

    /* The last charge group is the group to be inserted */
    cg_tp = top->cgs.nr - 1;
    a_tp0 = top->cgs.index[cg_tp];
    a_tp1 = top->cgs.index[cg_tp+1];
    if (debug)
        fprintf(debug, "TPI cg %d, atoms %d-%d\n", cg_tp, a_tp0, a_tp1);
    if (a_tp1 - a_tp0 > 1 &&
        (inputrec->rlist < inputrec->rcoulomb ||
         inputrec->rlist < inputrec->rvdw))
        gmx_fatal(FARGS, "Can not do TPI for multi-atom molecule with a twin-range cut-off");
    snew(x_mol, a_tp1-a_tp0);

    bDispCorr = (inputrec->eDispCorr != edispcNO);
    bCharge   = FALSE;
    for (i = a_tp0; i < a_tp1; i++)
        /* Copy the coordinates of the molecule to be inserted */
        copy_rvec(state->x[i], x_mol[i-a_tp0]);
        /* Check if we need to print electrostatic energies */
        bCharge |= (mdatoms->chargeA[i] != 0 ||
                    (mdatoms->chargeB && mdatoms->chargeB[i] != 0));
    bRFExcl = (bCharge && EEL_RF(fr->eeltype) && fr->eeltype != eelRF_NEC);

    calc_cgcm(fplog, cg_tp, cg_tp+1, &(top->cgs), state->x, fr->cg_cm);
    if (bCavity)
        if (norm(fr->cg_cm[cg_tp]) > 0.5*inputrec->rlist && fplog)
            fprintf(fplog, "WARNING: Your TPI molecule is not centered at 0,0,0\n");
            fprintf(stderr, "WARNING: Your TPI molecule is not centered at 0,0,0\n");
        /* Center the molecule to be inserted at zero */
        for (i = 0; i < a_tp1-a_tp0; i++)
            rvec_dec(x_mol[i], fr->cg_cm[cg_tp]);

    if (fplog)
        fprintf(fplog, "\nWill insert %d atoms %s partial charges\n",
                a_tp1-a_tp0, bCharge ? "with" : "without");

        fprintf(fplog, "\nWill insert %d times in each frame of %s\n",
                (int)nsteps, opt2fn("-rerun", nfile, fnm));

    if (!bCavity)
        if (inputrec->nstlist > 1)
            if (drmax == 0 && a_tp1-a_tp0 == 1)
                gmx_fatal(FARGS, "Re-using the neighborlist %d times for insertions of a single atom in a sphere of radius %f does not make sense", inputrec->nstlist, drmax);
            if (fplog)
                fprintf(fplog, "Will use the same neighborlist for %d insertions in a sphere of radius %f\n", inputrec->nstlist, drmax);
        if (fplog)
            fprintf(fplog, "Will insert randomly in a sphere of radius %f around the center of the cavity\n", drmax);

    ngid   = groups->grps[egcENER].nr;
    gid_tp = GET_CGINFO_GID(fr->cginfo[cg_tp]);
    nener  = 1 + ngid;
    if (bDispCorr)
        nener += 1;
    if (bCharge)
        nener += ngid;
        if (bRFExcl)
            nener += 1;
        if (EEL_FULL(fr->eeltype))
            nener += 1;
    snew(sum_UgembU, nener);

    /* Copy the random seed set by the user */
    seed = inputrec->ld_seed;
    /* We use the frame step number as one random counter.
     * The second counter use the insertion (step) count. But we
     * need multiple random numbers per insertion. This number is
     * not fixed, since we generate random locations in a sphere
     * by putting locations in a cube and some of these fail.
     * A count of 20 is already extremely unlikely, so 10000 is
     * a safe margin for random numbers per insertion.
    rnd_count_stride = 10000;

    if (MASTER(cr))
        fp_tpi = xvgropen(opt2fn("-tpi", nfile, fnm),
                          "TPI energies", "Time (ps)",
                          "(kJ mol\\S-1\\N) / (nm\\S3\\N)", oenv);
        xvgr_subtitle(fp_tpi, "f. are averages over one frame", oenv);
        snew(leg, 4+nener);
        e = 0;
        sprintf(str, "-kT log(<Ve\\S-\\betaU\\N>/<V>)");
        leg[e++] = strdup(str);
        sprintf(str, "f. -kT log<e\\S-\\betaU\\N>");
        leg[e++] = strdup(str);
        sprintf(str, "f. <e\\S-\\betaU\\N>");
        leg[e++] = strdup(str);
        sprintf(str, "f. V");
        leg[e++] = strdup(str);
        sprintf(str, "f. <Ue\\S-\\betaU\\N>");
        leg[e++] = strdup(str);
        for (i = 0; i < ngid; i++)
            sprintf(str, "f. <U\\sVdW %s\\Ne\\S-\\betaU\\N>",
            leg[e++] = strdup(str);
        if (bDispCorr)
            sprintf(str, "f. <U\\sdisp c\\Ne\\S-\\betaU\\N>");
            leg[e++] = strdup(str);
        if (bCharge)
            for (i = 0; i < ngid; i++)
                sprintf(str, "f. <U\\sCoul %s\\Ne\\S-\\betaU\\N>",
                leg[e++] = strdup(str);
            if (bRFExcl)
                sprintf(str, "f. <U\\sRF excl\\Ne\\S-\\betaU\\N>");
                leg[e++] = strdup(str);
            if (EEL_FULL(fr->eeltype))
                sprintf(str, "f. <U\\sCoul recip\\Ne\\S-\\betaU\\N>");
                leg[e++] = strdup(str);
        xvgr_legend(fp_tpi, 4+nener, (const char**)leg, oenv);
        for (i = 0; i < 4+nener; i++)
    V_all     = 0;
    VembU_all = 0;

    invbinw = 10;
    nbin    = 10;
    snew(bin, nbin);

    /* Avoid frame step numbers <= -1 */
    frame_step_prev = -1;

    bNotLastFrame = read_first_frame(oenv, &status, opt2fn("-rerun", nfile, fnm),
                                     &rerun_fr, TRX_NEED_X);
    frame = 0;

    if (rerun_fr.natoms - (bCavity ? nat_cavity : 0) !=
        mdatoms->nr - (a_tp1 - a_tp0))
        gmx_fatal(FARGS, "Number of atoms in trajectory (%d)%s "
                  "is not equal the number in the run input file (%d) "
                  "minus the number of atoms to insert (%d)\n",
                  rerun_fr.natoms, bCavity ? " minus one" : "",
                  mdatoms->nr, a_tp1-a_tp0);

    refvolshift = log(det(rerun_fr.box));

    switch (inputrec->eI)
        case eiTPI:
            stepblocksize = inputrec->nstlist;
        case eiTPIC:
            stepblocksize = 1;
            gmx_fatal(FARGS, "Unknown integrator %s", ei_names[inputrec->eI]);

#ifdef GMX_SIMD
    /* Make sure we don't detect SIMD overflow generated before this point */

    while (bNotLastFrame)
        frame_step      = rerun_fr.step;
        if (frame_step <= frame_step_prev)
            /* We don't have step number in the trajectory file,
             * or we have constant or decreasing step numbers.
             * Ensure we have increasing step numbers, since we use
             * the step numbers as a counter for random numbers.
            frame_step  = frame_step_prev + 1;
        frame_step_prev = frame_step;

        lambda = rerun_fr.lambda;
        t      = rerun_fr.time;

        sum_embU = 0;
        for (e = 0; e < nener; e++)
            sum_UgembU[e] = 0;

        /* Copy the coordinates from the input trajectory */
        for (i = 0; i < rerun_fr.natoms; i++)
            copy_rvec(rerun_fr.x[i], state->x[i]);
        copy_mat(rerun_fr.box, state->box);

        V    = det(state->box);
        logV = log(V);

        bStateChanged = TRUE;
        bNS           = TRUE;

        step = cr->nodeid*stepblocksize;
        while (step < nsteps)
            /* Initialize the second counter for random numbers using
             * the insertion step index. This ensures that we get
             * the same random numbers independently of how many
             * MPI ranks we use. Also for the same seed, we get
             * the same initial random sequence for different nsteps.
            rnd_count = step*rnd_count_stride;

            if (!bCavity)
                /* Random insertion in the whole volume */
                bNS = (step % inputrec->nstlist == 0);
                if (bNS)
                    /* Generate a random position in the box */
                    gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
                    gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
                    for (d = 0; d < DIM; d++)
                        x_init[d] = rnd[d]*state->box[d][d];
                if (inputrec->nstlist == 1)
                    copy_rvec(x_init, x_tp);
                    /* Generate coordinates within |dx|=drmax of x_init */
                        gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
                        gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
                        for (d = 0; d < DIM; d++)
                            dx[d] = (2*rnd[d] - 1)*drmax;
                    while (norm2(dx) > drmax*drmax);
                    rvec_add(x_init, dx, x_tp);
                /* Random insertion around a cavity location
                 * given by the last coordinate of the trajectory.
                if (step == 0)
                    if (nat_cavity == 1)
                        /* Copy the location of the cavity */
                        copy_rvec(rerun_fr.x[rerun_fr.natoms-1], x_init);
                        /* Determine the center of mass of the last molecule */
                        mass_tot = 0;
                        for (i = 0; i < nat_cavity; i++)
                            for (d = 0; d < DIM; d++)
                                x_init[d] +=
                            mass_tot += mass_cavity[i];
                        for (d = 0; d < DIM; d++)
                            x_init[d] /= mass_tot;
                /* Generate coordinates within |dx|=drmax of x_init */
                    gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
                    gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
                    for (d = 0; d < DIM; d++)
                        dx[d] = (2*rnd[d] - 1)*drmax;
                while (norm2(dx) > drmax*drmax);
                rvec_add(x_init, dx, x_tp);

            if (a_tp1 - a_tp0 == 1)
                /* Insert a single atom, just copy the insertion location */
                copy_rvec(x_tp, state->x[a_tp0]);
                /* Copy the coordinates from the top file */
                for (i = a_tp0; i < a_tp1; i++)
                    copy_rvec(x_mol[i-a_tp0], state->x[i]);
                /* Rotate the molecule randomly */
                gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd);
                gmx_rng_cycle_2uniform(frame_step, rnd_count++, seed, RND_SEED_TPI, rnd+2);
                rotate_conf(a_tp1-a_tp0, state->x+a_tp0, NULL,
                /* Shift to the insertion location */
                for (i = a_tp0; i < a_tp1; i++)
                    rvec_inc(state->x[i], x_tp);

            /* Clear some matrix variables  */

            /* Set the charge group center of mass of the test particle */
            copy_rvec(x_init, fr->cg_cm[top->cgs.nr-1]);

            /* Calc energy (no forces) on new positions.
             * Since we only need the intermolecular energy
             * and the RF exclusion terms of the inserted molecule occur
             * within a single charge group we can pass NULL for the graph.
             * This also avoids shifts that would move charge groups
             * out of the box.
             * Some checks above ensure than we can not have
             * twin-range interactions together with nstlist > 1,
             * therefore we do not need to remember the LR energies.
            /* Make do_force do a single node force calculation */
            cr->nnodes = 1;
            do_force(fplog, cr, inputrec,
                     step, nrnb, wcycle, top, &top_global->groups,
                     state->box, state->x, &state->hist,
                     f, force_vir, mdatoms, enerd, fcd,
                     NULL, fr, NULL, mu_tot, t, NULL, NULL, FALSE,
                     (bNS ? GMX_FORCE_DYNAMICBOX | GMX_FORCE_NS | GMX_FORCE_DO_LR : 0) |
                     (bStateChanged ? GMX_FORCE_STATECHANGED : 0));
            cr->nnodes    = nnodes;
            bStateChanged = FALSE;
            bNS           = FALSE;

            /* Calculate long range corrections to pressure and energy */
            calc_dispcorr(fplog, inputrec, fr, step, top_global->natoms, state->box,
                          lambda, pres, vir, &prescorr, &enercorr, &dvdlcorr);
            /* figure out how to rearrange the next 4 lines MRS 8/4/2009 */
            enerd->term[F_DISPCORR]  = enercorr;
            enerd->term[F_EPOT]     += enercorr;
            enerd->term[F_PRES]     += prescorr;
            enerd->term[F_DVDL_VDW] += dvdlcorr;

            epot               = enerd->term[F_EPOT];
            bEnergyOutOfBounds = FALSE;
            /* With SSE the energy can overflow, check for this */
            if (gmx_mm_check_and_reset_overflow())
                if (debug)
                    fprintf(debug, "Found an SSE overflow, assuming the energy is out of bounds\n");
                bEnergyOutOfBounds = TRUE;
            /* If the compiler doesn't optimize this check away
             * we catch the NAN energies.
             * The epot>GMX_REAL_MAX check catches inf values,
             * which should nicely result in embU=0 through the exp below,
             * but it does not hurt to check anyhow.
            /* Non-bonded Interaction usually diverge at r=0.
             * With tabulated interaction functions the first few entries
             * should be capped in a consistent fashion between
             * repulsion, dispersion and Coulomb to avoid accidental
             * negative values in the total energy.
             * The table generation code in tables.c does this.
             * With user tbales the user should take care of this.
            if (epot != epot || epot > GMX_REAL_MAX)
                bEnergyOutOfBounds = TRUE;
            if (bEnergyOutOfBounds)
                if (debug)
                    fprintf(debug, "\n  time %.3f, step %d: non-finite energy %f, using exp(-bU)=0\n", t, (int)step, epot);
                embU = 0;
                embU      = exp(-beta*epot);
                sum_embU += embU;
                /* Determine the weighted energy contributions of each energy group */
                e                = 0;
                sum_UgembU[e++] += epot*embU;
                if (fr->bBHAM)
                    for (i = 0; i < ngid; i++)
                        sum_UgembU[e++] +=
                            (enerd->grpp.ener[egBHAMSR][GID(i, gid_tp, ngid)] +
                             enerd->grpp.ener[egBHAMLR][GID(i, gid_tp, ngid)])*embU;
                    for (i = 0; i < ngid; i++)
                        sum_UgembU[e++] +=
                            (enerd->grpp.ener[egLJSR][GID(i, gid_tp, ngid)] +
                             enerd->grpp.ener[egLJLR][GID(i, gid_tp, ngid)])*embU;
                if (bDispCorr)
                    sum_UgembU[e++] += enerd->term[F_DISPCORR]*embU;
                if (bCharge)
                    for (i = 0; i < ngid; i++)
                        sum_UgembU[e++] +=
                            (enerd->grpp.ener[egCOULSR][GID(i, gid_tp, ngid)] +
                             enerd->grpp.ener[egCOULLR][GID(i, gid_tp, ngid)])*embU;
                    if (bRFExcl)
                        sum_UgembU[e++] += enerd->term[F_RF_EXCL]*embU;
                    if (EEL_FULL(fr->eeltype))
                        sum_UgembU[e++] += enerd->term[F_COUL_RECIP]*embU;

            if (embU == 0 || beta*epot > bU_bin_limit)
                i = (int)((bU_logV_bin_limit
                           - (beta*epot - logV + refvolshift))*invbinw
                          + 0.5);
                if (i < 0)
                    i = 0;
                if (i >= nbin)
                    realloc_bins(&bin, &nbin, i+10);

            if (debug)
                fprintf(debug, "TPI %7d %12.5e %12.5f %12.5f %12.5f\n",
                        (int)step, epot, x_tp[XX], x_tp[YY], x_tp[ZZ]);

            if (dump_pdb && epot <= dump_ener)
                sprintf(str, "t%g_step%d.pdb", t, (int)step);
                sprintf(str2, "t: %f step %d ener: %f", t, (int)step, epot);
                write_sto_conf_mtop(str, str2, top_global, state->x, state->v,
                                    inputrec->ePBC, state->box);

            if ((step/stepblocksize) % cr->nnodes != cr->nodeid)
                /* Skip all steps assigned to the other MPI ranks */
                step += (cr->nnodes - 1)*stepblocksize;

        if (PAR(cr))
            /* When running in parallel sum the energies over the processes */
            gmx_sumd(1,    &sum_embU, cr);
            gmx_sumd(nener, sum_UgembU, cr);

        V_all     += V;
        VembU_all += V*sum_embU/nsteps;

        if (fp_tpi)
            if (bVerbose || frame%10 == 0 || frame < 10)
                fprintf(stderr, "mu %10.3e <mu> %10.3e\n",
                        -log(sum_embU/nsteps)/beta, -log(VembU_all/V_all)/beta);

            fprintf(fp_tpi, "%10.3f %12.5e %12.5e %12.5e %12.5e",
                    VembU_all == 0 ? 20/beta : -log(VembU_all/V_all)/beta,
                    sum_embU == 0  ? 20/beta : -log(sum_embU/nsteps)/beta,
                    sum_embU/nsteps, V);
            for (e = 0; e < nener; e++)
                fprintf(fp_tpi, " %12.5e", sum_UgembU[e]/nsteps);
            fprintf(fp_tpi, "\n");

        bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
    } /* End of the loop  */


    if (fp_tpi != NULL)

    if (fplog != NULL)
        fprintf(fplog, "\n");
        fprintf(fplog, "  <V>  = %12.5e nm^3\n", V_all/frame);
        fprintf(fplog, "  <mu> = %12.5e kJ/mol\n", -log(VembU_all/V_all)/beta);

    /* Write the Boltzmann factor histogram */
    if (PAR(cr))
        /* When running in parallel sum the bins over the processes */
        i = nbin;
        global_max(cr, &i);
        realloc_bins(&bin, &nbin, i);
        gmx_sumd(nbin, bin, cr);
    if (MASTER(cr))
        fp_tpi = xvgropen(opt2fn("-tpid", nfile, fnm),
                          "TPI energy distribution",
                          "\\betaU - log(V/<V>)", "count", oenv);
        sprintf(str, "number \\betaU > %g: %9.3e", bU_bin_limit, bin[0]);
        xvgr_subtitle(fp_tpi, str, oenv);
        xvgr_legend(fp_tpi, 2, (const char **)tpid_leg, oenv);
        for (i = nbin-1; i > 0; i--)
            bUlogV = -i/invbinw + bU_logV_bin_limit - refvolshift + log(V_all/frame);
            fprintf(fp_tpi, "%6.2f %10d %12.5e\n",


    walltime_accounting_set_nsteps_done(walltime_accounting, frame*inputrec->nsteps);

    return 0;
/* Digest one cycle-count measurement for the currently active PME
 * load-balancing setup and select the setup to use next.
 *
 * pme_lb   load-balancing state; stage, cur, fastest, n, start, end,
 *          elimited and the setup[] entries are read and updated here
 * cr       communication record, consulted through PAR(), DOMAINDECOMP(),
 *          MASTER-independent fields nnodes and duty
 * fp_err   passed on to print_grid() and print_loadbal_limited()
 * fp_log   passed on to print_grid() and print_loadbal_limited()
 * ir       input record; rlistlong, tabext, pme_order, ePBC and
 *          cutoff_scheme are read, nstcalclr is overwritten
 * state    state->box is read for the cut-off limit; state is also handed
 *          to change_dd_cutoff()
 * cycles   measured cycle count for the current setup
 * ic       interaction constants; rcoulomb, rlist, rlistlong and ewaldcoeff
 *          are overwritten with the values of the selected setup
 * nbv      Verlet non-bonded data; grp[0].kernel_type and ngrp are read
 * pmedata  receives set->pmedata of the selected setup
 * step     only used to build the "step ...: " message prefix
 *
 * Returns FALSE when pme_lb->stage already equals pme_lb->nstage on entry;
 * every other return statement visible here returns TRUE.
 *
 * NOTE(review): this listing appears to have lost lines (brace lines and
 * the tails of several calls); the comments below describe only what is
 * visible and should be re-checked against the complete source.
 */
gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
                          t_commrec           *cr,
                          FILE                *fp_err,
                          FILE                *fp_log,
                          t_inputrec          *ir,
                          t_state             *state,
                          double               cycles,
                          interaction_const_t *ic,
                          nonbonded_verlet_t  *nbv,
                          gmx_pme_t           *pmedata,
                          gmx_large_int_t      step)
    gmx_bool     OK;
    pme_setup_t *set;
    double       cycles_fast;
    char         buf[STRLEN], sbuf[22];
    real         rtab;
    gmx_bool     bUsesSimpleTables = TRUE;

    /* The stage counter has reached nstage: nothing is left to tune */
    if (pme_lb->stage == pme_lb->nstage)
        return FALSE;

    /* Sum the cycle count over the ranks and take the average */
    if (PAR(cr))
        gmx_sumd(1, &cycles, cr);
        cycles /= cr->nnodes;

    /* The setup the cycle count belongs to */
    set = &pme_lb->setup[pme_lb->cur];

    rtab = ir->rlistlong + ir->tabext;

    if (set->count % 2 == 1)
        /* Skip the first cycle, because the first step after a switch
         * is much slower due to allocation and/or caching effects.
        return TRUE;

    sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf));
    print_grid(fp_err, fp_log, buf, "timed with", set, cycles);

    if (set->count <= 2)
        set->cycles = cycles;
        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
            pme_lb->stage == pme_lb->nstage - 1)
            /* The performance went up a lot (due to e.g. DD load balancing).
             * Add a stage, keep the minima, but rescan all setups.

            if (debug)
                fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
                        "Increased the number stages to %d"
                        " and ignoring the previous performance\n",
                        set->grid[XX], set->grid[YY], set->grid[ZZ],
                        cycles*1e-6, set->cycles*1e-6, PME_LB_ACCEL_TOL,
        set->cycles = min(set->cycles, cycles);

    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
        pme_lb->fastest = pme_lb->cur;

        if (DOMAINDECOMP(cr))
            /* We found a new fastest setting, ensure that with subsequent
             * shorter cut-off's the dynamic load balancing does not make
             * the use of the current cut-off impossible. This solution is
             * a trade-off, as the PME load balancing and DD domain size
             * load balancing can interact in complex ways.
             * With the Verlet kernels, DD load imbalance will usually be
             * mainly due to bonded interaction imbalance, which will often
             * quickly push the domain boundaries beyond the limit for the
             * optimal, PME load balanced, cut-off. But it could be that
             * better overall performance can be obtained with a slightly
             * shorter cut-off and better DD load balancing.
    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;

    /* Check in stage 0 if we should stop scanning grids.
     * Stop when the time is more than SLOW_FAC longer than the fastest.
    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
        pme_lb->n = pme_lb->cur + 1;
        /* Done with scanning, go to stage 1 */

    if (pme_lb->stage == 0)
        int gridsize_start;

        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];

            if (pme_lb->cur+1 < pme_lb->n)
                /* We had already generated the next setup */
                OK = TRUE;
                /* Find the next setup */
                OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order);

            /* With periodic boundaries the squared rlistlong of the next
             * setup may not exceed max_cutoff2() for the current box.
             */
            if (OK && ir->ePBC != epbcNONE)
                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
                      <= max_cutoff2(ir->ePBC, state->box));
                if (!OK)
                    pme_lb->elimited = epmelblimBOX;

            if (OK)

                if (DOMAINDECOMP(cr))
                    OK = change_dd_cutoff(cr, state, ir,
                    if (!OK)
                        /* Failed: do not use this setup */
                        pme_lb->elimited = epmelblimDD;
            if (!OK)
                /* We hit the upper limit for the cut-off,
                 * the setup should not go further than cur.
                pme_lb->n = pme_lb->cur + 1;
                print_loadbal_limited(fp_err, fp_log, step, pme_lb);
                /* Switch to the next stage */
        while (OK &&
                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
                 pme_lb->setup[pme_lb->cur].grid_efficiency <

    if (pme_lb->stage > 0 && pme_lb->end == 1)
        pme_lb->cur   = 0;
        pme_lb->stage = pme_lb->nstage;
    else if (pme_lb->stage > 0 && pme_lb->end > 1)
        /* If stage = nstage-1:
         *   scan over all setups, rerunning only those setups
         *   which are not much slower than the fastest
         * else:
         *   use the next setup
            if (pme_lb->cur == pme_lb->end)
                pme_lb->cur = pme_lb->start;
        while (pme_lb->stage == pme_lb->nstage - 1 &&
               pme_lb->setup[pme_lb->cur].count > 0 &&
               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);

        if (pme_lb->stage == pme_lb->nstage)
            /* We are done optimizing, use the fastest setup we found */
            pme_lb->cur = pme_lb->fastest;

    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong);
        if (!OK)
            /* Failsafe solution */
            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
            pme_lb->fastest  = 0;
            pme_lb->start    = 0;
            pme_lb->end      = pme_lb->cur;
            pme_lb->cur      = pme_lb->start;
            pme_lb->elimited = epmelblimDD;
            print_loadbal_limited(fp_err, fp_log, step, pme_lb);

    /* Change the Coulomb cut-off and the PME grid */

    /* pme_lb->cur may have been changed above, so look the setup up again */
    set = &pme_lb->setup[pme_lb->cur];

    /* Copy the parameters of the selected setup into ic (and nstcalclr into ir) */
    ic->rcoulomb   = set->rcut_coulomb;
    ic->rlist      = set->rlist;
    ic->rlistlong  = set->rlistlong;
    ir->nstcalclr  = set->nstcalclr;
    ic->ewaldcoeff = set->ewaldcoeff;

    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
    if (pme_lb->cutoff_scheme == ecutsVERLET &&
        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
        init_interaction_const_tables(NULL, ic, bUsesSimpleTables,

    if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->ngrp > 1)
        init_interaction_const_tables(NULL, ic, bUsesSimpleTables,

    if (cr->duty & DUTY_PME)
        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
            /* Generate a new PME data structure,
             * copying part of the old pointers.
                           cr, pme_lb->setup[0].pmedata, ir,
        *pmedata = set->pmedata;
        /* Tell our PME-only node to switch grid */
        gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff);

    if (debug)
        print_grid(NULL, debug, "", "switched to", set, -1);

    /* Once the last stage is finished, report the selected setup as "optimal" */
    if (pme_lb->stage == pme_lb->nstage)
        print_grid(fp_err, fp_log, "", "optimal", set, -1);

    return TRUE;
		   t_inputrec *inputrec,t_mdatoms *md,
		   t_state *state,rvec buf[],rvec f[],
		   t_graph *graph,t_commrec *cr,t_nrnb *nrnb,
		   t_forcerec *fr,t_idef *idef)
  /* NOTE(review): the line carrying this function's name and leading
   * parameters is not present in this listing, and several body lines
   * (including the calls that do the actual constraining) are missing too.
   *
   * What is visible: the routine works on atoms [md->start, md->start +
   * md->homenr). For integrators matching EI_STATE_VELOCITY() it negates
   * state->v, stores state->x + dt*state->v (the position at t-dt) in buf,
   * later negates state->v back and accumulates mass-weighted velocities in
   * vcm[0..DIM-1] and the mass in vcm[3]. When inputrec->nstcomm != 0 the
   * resulting centre-of-mass velocity is subtracted from every state->v[i].
   *
   * fplog and debug, when non-NULL, receive progress messages.
   */
  int    i,m,start,end,step;
  double mass,tmass,vcm[4];
  real   dt=inputrec->delta_t;
  real   dvdlambda;

  /* Atom range [start, end) derived from md->start and md->homenr */
  start = md->start;
  end   = md->homenr + start;
  if (debug)
    fprintf(debug,"vcm: start=%d, homenr=%d, end=%d\n",
  /* Do a first SHAKE to reset particles... */
  step = inputrec->init_step;
  if (fplog)
    fprintf(fplog,"\nConstraining the starting coordinates (step %d)\n",step);
  dvdlambda = 0;

  if (EI_STATE_VELOCITY(inputrec->eI)) {
    for(i=start; (i<end); i++) {
      for(m=0; (m<DIM); m++) {
	/* Reverse the velocity */
	state->v[i][m] = -state->v[i][m];
	/* Store the position at t-dt in buf */
	buf[i][m] = state->x[i][m] + dt*state->v[i][m];
    /* Shake the positions at t=-dt with the positions at t=0
     * as reference coordinates.
    if (fplog)
      fprintf(fplog,"\nConstraining the coordinates at t0-dt (step %d)\n",
    dvdlambda = 0;
    for(m=0; (m<4); m++)
      vcm[m] = 0;
    for(i=start; i<end; i++) {
      mass = md->massT[i];
      for(m=0; m<DIM; m++) {
	/* Re-reverse the velocities */
	state->v[i][m] = -state->v[i][m];
	vcm[m] += state->v[i][m]*mass;
      vcm[3] += mass;
    if (inputrec->nstcomm != 0 || debug) {
      /* Compute the global sum of vcm */
      if (debug)
	fprintf(debug,"vcm: %8.3f  %8.3f  %8.3f,"
		" total mass = %12.5e\n",vcm[XX],vcm[YY],vcm[ZZ],vcm[3]);
      if (PAR(cr))
      tmass = vcm[3];
      /* Turn the mass-weighted sums into a velocity.
       * NOTE(review): no check for tmass == 0 is visible here.
       */
      for(m=0; (m<DIM); m++)
	vcm[m] /= tmass;
      if (debug) 
	fprintf(debug,"vcm: %8.3f  %8.3f  %8.3f,"
		" total mass = %12.5e\n",vcm[XX],vcm[YY],vcm[ZZ],tmass);
      if (inputrec->nstcomm != 0) {
	/* Now we have the velocity of center of mass, let's remove it */
	for(i=start; (i<end); i++) {
	  for(m=0; (m<DIM); m++)
	    state->v[i][m] -= vcm[m];
	      t_inputrec *inputrec,
	      int step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
	      gmx_localtop_t *top,
	      gmx_groups_t *groups,
	      matrix box,rvec x[],history_t *hist,
	      rvec f[],rvec buf[],
	      tensor vir_force,
	      t_mdatoms *mdatoms,
	      gmx_enerdata_t *enerd,t_fcdata *fcd,
	      real lambda,t_graph *graph,
	      t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot,
	      real t,FILE *field,gmx_edsam_t ed,
	      int flags)
  /* NOTE(review): the line carrying this function's name and leading
   * parameters is not present in this listing, and most statement bodies
   * are missing; only the skeleton of the control flow can be read.
   *
   * What is visible: the GMX_FORCE_* bits in flags decide whether the state
   * changed, whether neighbour searching is done and whether forces are
   * computed. mu_tot is filled from the two halves of the static mu array,
   * weighted with (1 - lambda) and lambda. Contributions are added to
   * enerd->term[] for F_DVDL, F_POSRES, F_COM_PULL and F_COUL_RECIP, and
   * enerd->dvdl_lr is set in the neighbour-search branch.
   */
  static rvec box_size;
  int    cg0,cg1,i,j;
  int    start,homenr;
  static double mu[2*DIM]; 
  rvec   mu_tot_AB[2];
  bool   bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS,bDoForces;
  matrix boxs;
  real   e,v,dvdl;
  t_pbc  pbc;
  float  cycles_ppdpme,cycles_pme,cycles_force;
  start  = mdatoms->start;
  homenr = mdatoms->homenr;

  /* Separate dV/dlambda logging only on steps selected by nstlog */
  bSepDVDL = (fr->bSepDVDL && do_per_step(step,inputrec->nstlog));

  if (PARTDECOMP(cr)) {
  } else {
    cg0 = 0;
    if (DOMAINDECOMP(cr))
      cg1 = cr->dd->ncg_tot;
      cg1 = top->cgs.nr;
    if (fr->n_tpi > 0)

  /* Decode the request bits passed in flags */
  bStateChanged = (flags & GMX_FORCE_STATECHANGED);
  bNS           = (flags & GMX_FORCE_NS);
  bFillGrid     = (bNS && bStateChanged);
  bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
  bDoForces     = (flags & GMX_FORCE_FORCES);

  if (bStateChanged) {
    /* Calculate total (local) dipole moment in a temporary common array. 
     * This makes it possible to sum them over nodes faster.
  if (fr->ePBC != epbcNONE) { 
    /* Compute shift vectors every step,
     * because of pressure coupling or box deformation!
    if (DYNAMIC_BOX(*inputrec) && bStateChanged)
    if (bCalcCGCM) { 
    else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
  else if (bCalcCGCM) {
  if (bCalcCGCM) {
    if (PAR(cr)) {
    if (gmx_debug_at)

#ifdef GMX_MPI
  if (!(cr->duty & DUTY_PME)) {
    /* Send particle coordinates to the pme nodes.
     * Since this is only implemented for domain decomposition
     * and domain decomposition does not use the graph,
     * we do not need to worry about shifting.


    bBS = (inputrec->nwall == 2);
    if (bBS) {

    gmx_pme_send_x(cr,bBS ? boxs : box,x,mdatoms->nChargePerturbed,lambda);

#endif /* GMX_MPI */

  /* Communicate coordinates and sum dipole if necessary */
  if (PAR(cr)) {
    if (DOMAINDECOMP(cr)) {
    } else {
    /* When we don't need the total dipole we sum it in global_stat */
    if (NEED_MUTOT(*inputrec))
  /* Unpack the flat mu array into the two rvecs of mu_tot_AB */
  for(i=0; i<2; i++)
      mu_tot_AB[i][j] = mu[i*DIM + j];
  if (fr->efep == efepNO)
    for(j=0; j<DIM; j++)
      mu_tot[j] = (1.0 - lambda)*mu_tot_AB[0][j] + lambda*mu_tot_AB[1][j];

  /* Reset energies */
  if (bNS) {
    if (graph && bStateChanged)
      /* Calculate intramolecular shift vectors to make molecules whole */

    /* Reset long range forces if necessary */
    if (fr->bTwinRange) {
    /* Do the actual neighbour searching and if twin range electrostatics
     * also do the calculation of long range forces and energies.
    dvdl = 0; 
    if (bSepDVDL)
      fprintf(fplog,sepdvdlformat,"LR non-bonded",0,dvdl);
    enerd->dvdl_lr       = dvdl;
    enerd->term[F_DVDL] += dvdl;

  if (DOMAINDECOMP(cr)) {
    if (!(cr->duty & DUTY_PME)) {
  /* Start the force cycle counter.
   * This counter is stopped in do_forcelow_level.
   * No parallel communication should occur while this counter is running,
   * since that will interfere with the dynamic load balancing.

  if (bDoForces) {
      /* Reset PME/Ewald forces if necessary */
    if (fr->bF_NoVirSum) 
      if (fr->bDomDec)
    /* Copy long range forces into normal buffers */
    if (fr->bTwinRange) {
      for(i=0; i<fr->f_twin_n; i++)
      for(i=0; i<SHIFTS; i++)
    else {
      if (DOMAINDECOMP(cr))
  if (inputrec->ePull == epullCONSTRAINT)

  /* update QMMMrec, if necessary */

  if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0) {
    /* Position restraints always require full pbc */
    v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
	       (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
	       inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda,&dvdl,
    if (bSepDVDL) {
    enerd->term[F_POSRES] += v;
    enerd->term[F_DVDL]   += dvdl;
  /* Compute the bonded and non-bonded forces */    

  if (ed) {
  if (DOMAINDECOMP(cr)) {
    if (wcycle)
  if (bDoForces) {
    /* Compute forces due to electric field */
    calc_f_el(MASTER(cr) ? field : NULL,
    /* When using PME/Ewald we compute the long range virial there.
     * otherwise we do it based on long range forces from twin range
     * cut-off based calculation (or not at all).
    /* Communicate the forces */
    if (PAR(cr)) {
      if (DOMAINDECOMP(cr)) {
	/* Position restraint do not introduce inter-cg forces */
	if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl)
      } else {

  if (bDoForces) {
    if (vsite) {
    /* Calculation of the virial must be done after vsites! */

  if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F) {
    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     * The virial contribution is calculated directly,
     * which is why we call pull_potential after calc_virial.
    dvdl = 0; 
    enerd->term[F_COM_PULL] =
    if (bSepDVDL)
      fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdl);
    enerd->term[F_DVDL] += dvdl;

  if (!(cr->duty & DUTY_PME)) {
    cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME);

#ifdef GMX_MPI
  if (PAR(cr) && !(cr->duty & DUTY_PME)) {
    /* In case of node-splitting, the PP nodes receive the long-range 
     * forces, virial and energy from the PME nodes here.
    dvdl = 0;
    if (bSepDVDL)
      fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdl);
    enerd->term[F_COUL_RECIP] += e;
    enerd->term[F_DVDL] += dvdl;
    if (wcycle)

  if (bDoForces && fr->bF_NoVirSum) {
    if (vsite) {
      /* Spread the mesh force on virtual sites to the other particles... 
       * This is parallellized. MPI communication is performed
       * if the constructing atoms aren't local.
    /* Now add the forces, this is local */
    if (fr->bDomDec) {
    } else {
    if (EEL_FULL(fr->eeltype)) {
      /* Add the mesh contribution to the virial */
    if (debug)

  /* Sum the potential energy terms from group contributions */

  if (fr->print_force >= 0 && bDoForces)
/* Constrain the starting configuration and remove the centre-of-mass
 * velocity from the velocities v.
 *
 * All work is skipped unless count_constraints(top,cr) is non-zero.
 * The atom range is [START(nsb), START(nsb) + HOMENR(nsb)).
 * Messages are written for two constraint passes, labelled step -2
 * (starting coordinates) and step -1 (coordinates at t0-dt).
 * Afterwards vcm[0..DIM-1] accumulates v[i]*md->massA[i] and vcm[3] the
 * mass; the sums are divided by the total mass and the result is
 * subtracted from every v[i] in the range.
 *
 * log    receives the progress messages
 * debug  (file-level) receives the vcm diagnostics when non-NULL
 *
 * NOTE(review): this listing appears to have lost lines (braces, the
 * constraint calls themselves and the statement under `if (PAR(cr))`);
 * the description covers only what is visible. No check for a zero total
 * mass is visible before the division.
 */
void do_shakefirst(FILE *log,bool bTYZ,real lambda,real ener[],
		   t_parm *parm,t_nsborder *nsb,t_mdatoms *md,
		   rvec x[],rvec vold[],rvec buf[],rvec f[],
		   rvec v[],t_graph *graph,t_commrec *cr,t_nrnb *nrnb,
		   t_groups *grps,t_forcerec *fr,t_topology *top,
		   t_edsamyn *edyn,t_pull *pulldata)
  int    i,m,start,homenr,end,step;
  tensor shake_vir;
  double mass,tmass,vcm[4];
  real   dt=parm->ir.delta_t;
  real   dt_1;

  if (count_constraints(top,cr)) {
    /* Atom range [start, end) taken from the nsb bookkeeping */
    start  = START(nsb);
    homenr = HOMENR(nsb);
    end    = start+homenr;
    if (debug)
      fprintf(debug,"vcm: start=%d, homenr=%d, end=%d\n",start,homenr,end);
    /* Do a first SHAKE to reset particles... */
    step = -2;
        fprintf(log,"\nConstraining the starting coordinates (step %d)\n",step);
    /* Compute coordinates at t=-dt, store them in buf */
    /* for(i=0; (i<nsb->natoms); i++) {*/
    for(i=start; (i<end); i++) {
      for(m=0; (m<DIM); m++) { 
    /* Shake the positions at t=-dt with the positions at t=0
     * as reference coordinates.
    step = -1;
        fprintf(log,"\nConstraining the coordinates at t0-dt (step %d)\n",step);
    /* Compute the velocities at t=-dt/2 using the coordinates at
     * t=-dt and t=0
     * Compute velocity of center of mass and total mass
    for(m=0; (m<4); m++)
      vcm[m] = 0;
    for(i=start; (i<end); i++) {
      /*for(i=0; (i<nsb->natoms); i++) {*/
      mass = md->massA[i];
      for(m=0; (m<DIM); m++) {
	vcm[m] += v[i][m]*mass;
      vcm[3] += mass;
    /* Compute the global sum of vcm */
    if (debug)
      fprintf(debug,"vcm: %8.3f  %8.3f  %8.3f,"
	      " total mass = %12.5e\n",vcm[XX],vcm[YY],vcm[ZZ],vcm[3]);
    if (PAR(cr))
    tmass = vcm[3];
    /* Turn the mass-weighted sums into the centre-of-mass velocity */
    for(m=0; (m<DIM); m++)
      vcm[m] /= tmass;
    if (debug) 
      fprintf(debug,"vcm: %8.3f  %8.3f  %8.3f,"
	      " total mass = %12.5e\n",vcm[XX],vcm[YY],vcm[ZZ],tmass);
    /* Now we have the velocity of center of mass, let's remove it */
    for(i=start; (i<end); i++) {
      for(m=0; (m<DIM); m++)
	v[i][m] -= vcm[m];
/* Calculates the center of mass (COM) of every pull group that has
 * bCalcCOM set, from the coordinates x and, when xp is non-NULL, from xp
 * as well.
 *
 * Stages, as visible in this block:
 *  1. pull->rbuf and pull->dbuf are allocated the first time through.
 *  2. Per group, local sums are stored in three dvec slots of pull->dbuf:
 *       groups that are not cosine weighted:
 *         dbuf[g*3]   = com  (sum of wm*x, or of wm*dx relative to x_pbc)
 *         dbuf[g*3+1] = comp (the corresponding sum for xp)
 *         dbuf[g*3+2] = { wmass, wwmass, 0 }
 *       cosine-weighted groups (epgrppbcCOS):
 *         dbuf[g*3]   = { cm, sm, 0 }
 *         dbuf[g*3+1] = { ccm, csm, ssm }
 *         dbuf[g*3+2] = { cmp, smp, 0 }
 *  3. When cr is non-NULL and PAR(cr) holds, the whole buffer is summed
 *     with one gmx_sumd() call.
 *  4. Per group, the buffer values are converted into pgrp->x, pgrp->xp,
 *     mwscale, wscale and invtm.
 *  5. When pull->bCylinder is set, make_cyl_refgrps() is called.
 *
 * cr    communication record; may be NULL as far as the checks in this
 *       function go
 * pull  pull data; its buffers and groups are updated in place
 * md    supplies the per-atom masses (massT)
 * pbc   periodic box information, used for pbc_dx() and cosine weighting
 * t     time, only passed on to make_cyl_refgrps()
 * x     coordinates
 * xp    optional second coordinate set; may be NULL
 *
 * NOTE(review): in this copy of the file the brace-only lines are absent,
 * several block comments lack their closing delimiter, and the wwmass and
 * weight_loc expressions in the cosine branch are cut off after a
 * trailing plus sign. Also, com and comp are accumulated with += while no
 * zeroing of them is visible here. Confirm all of this against the
 * upstream source before relying on the block.
 */
void pull_calc_coms(t_commrec *cr,
                    t_pull *pull, t_mdatoms *md, t_pbc *pbc, double t,
                    rvec x[], rvec *xp)
    int  g;
    real twopi_box = 0;

    /* Allocate the reduction buffers the first time through */
    if (pull->rbuf == NULL)
        snew(pull->rbuf, pull->ngroup);
    if (pull->dbuf == NULL)
        snew(pull->dbuf, 3*pull->ngroup);

    /* Presumably stores the PBC reference atom coordinates in pull->rbuf;
     * TODO confirm against pull_set_pbcatoms().
     */
    if (pull->bRefAt && pull->bSetPBCatoms)
        pull_set_pbcatoms(cr, pull, x, pull->rbuf);

        if (cr != NULL && DOMAINDECOMP(cr))
            /* We can keep these PBC reference coordinates fixed for nstlist
             * steps, since atoms won't jump over PBC.
             * This avoids a global reduction at the next nstlist-1 steps.
             * Note that the exact values of the pbc reference coordinates
             * are irrelevant, as long as all atoms in the group are within
             * half a box distance of the reference coordinate.
            pull->bSetPBCatoms = FALSE;

    if (pull->cosdim >= 0)
        int m;

        assert(pull->npbcdim <= DIM);

        for (m = pull->cosdim+1; m < pull->npbcdim; m++)
            if (pbc->box[m][pull->cosdim] != 0)
                gmx_fatal(FARGS, "Can not do cosine weighting for trilinic dimensions");
        twopi_box = 2.0*M_PI/pbc->box[pull->cosdim][pull->cosdim];

    for (g = 0; g < pull->ngroup; g++)
        t_pull_group *pgrp;

        pgrp = &pull->group[g];

        if (pgrp->bCalcCOM)
            if (pgrp->epgrppbc != epgrppbcCOS)
                dvec   com, comp;
                double wmass, wwmass;
                rvec   x_pbc = { 0, 0, 0 };
                int    i;

                wmass  = 0;
                wwmass = 0;

                if (pgrp->epgrppbc == epgrppbcREFAT)
                    /* Set the pbc atom */
                    copy_rvec(pull->rbuf[g], x_pbc);

                for (i = 0; i < pgrp->nat_loc; i++)
                    int  ii, m;
                    real mass, wm;

                    ii   = pgrp->ind_loc[i];
                    mass = md->massT[ii];
                    /* wm is the mass of this atom times its weight (if any);
                     * wwmass accumulates wm times the weight once more.
                     */
                    if (pgrp->weight_loc == NULL)
                        wm     = mass;
                        wmass += wm;
                        real w;

                        w       = pgrp->weight_loc[i];
                        wm      = w*mass;
                        wmass  += wm;
                        wwmass += wm*w;
                    if (pgrp->epgrppbc == epgrppbcNONE)
                        /* Plain COM: sum the coordinates */
                        for (m = 0; m < DIM; m++)
                            com[m]    += wm*x[ii][m];
                        if (xp)
                            for (m = 0; m < DIM; m++)
                                comp[m] += wm*xp[ii][m];
                        rvec dx;

                        /* Sum the difference with the reference atom */
                        pbc_dx(pbc, x[ii], x_pbc, dx);
                        for (m = 0; m < DIM; m++)
                            com[m]    += wm*dx[m];
                        if (xp)
                            /* For xp add the difference between xp and x to dx,
                             * such that we use the same periodic image,
                             * also when xp has a large displacement.
                            for (m = 0; m < DIM; m++)
                                comp[m] += wm*(dx[m] + xp[ii][m] - x[ii][m]);

                /* We do this check after the loop above to avoid more nesting.
                 * If we have a single-atom group the mass is irrelevant, so
                 * we can remove the mass factor to avoid division by zero.
                 * Note that with constraint pulling the mass does matter, but
                 * in that case a check group mass != 0 has been done before.
                if (pgrp->nat == 1 && pgrp->nat_loc == 1 && wmass == 0)
                    int m;

                    /* Copy the single atom coordinate */
                    for (m = 0; m < DIM; m++)
                        com[m] = x[pgrp->ind_loc[0]][m];
                    /* Set all mass factors to 1 to get the correct COM */
                    wmass  = 1;
                    wwmass = 1;

                /* Without per-atom weights wwmass was never accumulated,
                 * so it is set equal to wmass here.
                 */
                if (pgrp->weight_loc == NULL)
                    wwmass = wmass;

                /* Copy local sums to a buffer for global summing */
                copy_dvec(com, pull->dbuf[g*3]);
                copy_dvec(comp, pull->dbuf[g*3+1]);
                pull->dbuf[g*3+2][0] = wmass;
                pull->dbuf[g*3+2][1] = wwmass;
                pull->dbuf[g*3+2][2] = 0;
                /* Cosine weighting geometry */
                double cm, sm, cmp, smp, ccm, csm, ssm, csw, snw;
                int    i;

                cm  = 0;
                sm  = 0;
                cmp = 0;
                smp = 0;
                ccm = 0;
                csm = 0;
                ssm = 0;

                for (i = 0; i < pgrp->nat_loc; i++)
                    int  ii;
                    real mass;

                    ii   = pgrp->ind_loc[i];
                    mass = md->massT[ii];
                    /* Determine cos and sin sums */
                    csw  = cos(x[ii][pull->cosdim]*twopi_box);
                    snw  = sin(x[ii][pull->cosdim]*twopi_box);
                    cm  += csw*mass;
                    sm  += snw*mass;
                    ccm += csw*csw*mass;
                    csm += csw*snw*mass;
                    ssm += snw*snw*mass;

                    if (xp)
                        csw  = cos(xp[ii][pull->cosdim]*twopi_box);
                        snw  = sin(xp[ii][pull->cosdim]*twopi_box);
                        cmp += csw*mass;
                        smp += snw*mass;

                /* Copy local sums to a buffer for global summing */
                pull->dbuf[g*3  ][0] = cm;
                pull->dbuf[g*3  ][1] = sm;
                pull->dbuf[g*3  ][2] = 0;
                pull->dbuf[g*3+1][0] = ccm;
                pull->dbuf[g*3+1][1] = csm;
                pull->dbuf[g*3+1][2] = ssm;
                pull->dbuf[g*3+2][0] = cmp;
                pull->dbuf[g*3+2][1] = smp;
                pull->dbuf[g*3+2][2] = 0;

    if (cr && PAR(cr))
        /* Sum the contributions over the nodes */
        gmx_sumd(pull->ngroup*3*DIM, pull->dbuf[0], cr);

    /* Turn the buffer contents into COMs and scale factors per group */
    for (g = 0; g < pull->ngroup; g++)
        t_pull_group *pgrp;

        pgrp = &pull->group[g];
        if (pgrp->nat > 0 && pgrp->bCalcCOM)
            if (pgrp->epgrppbc != epgrppbcCOS)
                double wmass, wwmass;
                int    m;

                /* Determine the inverse mass */
                wmass             = pull->dbuf[g*3+2][0];
                wwmass            = pull->dbuf[g*3+2][1];
                pgrp->mwscale     = 1.0/wmass;
                /* invtm==0 signals a frozen group, so then we should keep it zero */
                if (pgrp->invtm != 0)
                    pgrp->wscale  = wmass/wwmass;
                    pgrp->invtm   = wwmass/(wmass*wmass);
                /* Divide by the total mass */
                for (m = 0; m < DIM; m++)
                    pgrp->x[m]    = pull->dbuf[g*3  ][m]*pgrp->mwscale;
                    if (xp)
                        pgrp->xp[m] = pull->dbuf[g*3+1][m]*pgrp->mwscale;
                    if (pgrp->epgrppbc == epgrppbcREFAT)
                        pgrp->x[m]    += pull->rbuf[g][m];
                        if (xp)
                            pgrp->xp[m] += pull->rbuf[g][m];
                /* Cosine weighting geometry */
                double csw, snw, wmass, wwmass;
                int    i, ii;

                /* Determine the optimal location of the cosine weight */
                csw                   = pull->dbuf[g*3][0];
                snw                   = pull->dbuf[g*3][1];
                pgrp->x[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box;
                /* Determine the mass sums from the summed cos and sin terms */
                wmass  = sqrt(csw*csw + snw*snw);
                wwmass = (pull->dbuf[g*3+1][0]*csw*csw +
                          pull->dbuf[g*3+1][1]*csw*snw +

                pgrp->mwscale = 1.0/wmass;
                pgrp->wscale  = wmass/wwmass;
                pgrp->invtm   = wwmass/(wmass*wmass);
                /* Set the weights for the local atoms */
                csw *= pgrp->invtm;
                snw *= pgrp->invtm;
                for (i = 0; i < pgrp->nat_loc; i++)
                    ii                  = pgrp->ind_loc[i];
                    pgrp->weight_loc[i] = csw*cos(twopi_box*x[ii][pull->cosdim]) +
                if (xp)
                    csw                    = pull->dbuf[g*3+2][0];
                    snw                    = pull->dbuf[g*3+2][1];
                    pgrp->xp[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box;
            if (debug)
                fprintf(debug, "Pull group %d wmass %f invtm %f\n",
                        g, 1.0/pgrp->mwscale, pgrp->invtm);

    if (pull->bCylinder)
        /* Calculate the COMs for the cylinder reference groups */
        make_cyl_refgrps(cr, pull, md, pbc, t, x);
/* Sets up the dynamic reference group pull->dyna[c] for every pull
 * coordinate c with cylinder geometry (epullgCYL).
 *
 * For such a coordinate, the atoms of the reference group
 * (pull->group[pcrd->group[0]]) whose local index lies in
 * [0, md->homenr) and whose relative squared radial distance dr2_rel
 * (squared radial distance divided by cylinder_r squared) is below 1 are
 * appended to pdyna->ind_loc. Each gets the radial weight
 * 1 - 2*dr2_rel + dr2_rel*dr2_rel.
 *
 * Per coordinate, stride (= 9) doubles are written to pull->dbuf_cyl:
 *   [0] wmass  [1] wwmass  [2] sum_a  [3..5] radf_fac0  [6..8] radf_fac1
 * When cr is non-NULL and PAR(cr) holds, the buffer is summed with one
 * gmx_sumd() call; afterwards mwscale, wscale, invtm, the group position
 * pdyna->x, pcrd->cyl_dev and pcrd->ffrad are derived from it.
 *
 * cr    communication record; may be NULL as far as the checks in this
 *       function go
 * pull  pull data; dbuf_cyl, dyna[] and coord[] are updated in place
 * md    supplies homenr and the per-atom masses (massT)
 * pbc   periodic box information, passed to pbc_dx_aiuc()
 * t     time, used in pcrd->init + pcrd->rate*t
 * x     coordinates
 *
 * NOTE(review): in this copy of the file the brace-only lines are absent,
 * several block comments lack their closing delimiter, and the ffrad
 * expression is cut off after a trailing plus sign. Also, radf_fac0 and
 * radf_fac1 are accumulated with += and copied into the buffer while no
 * zeroing of them is visible here. Confirm all of this against the
 * upstream source before relying on the block.
 */
static void make_cyl_refgrps(t_commrec *cr, t_pull *pull, t_mdatoms *md,
                             t_pbc *pbc, double t, rvec *x)
    /* The size and stride per coord for the reduction buffer */
    const int     stride = 9;
    int           c, i, ii, m, start, end;
    rvec          g_x, dx, dir;
    double        inv_cyl_r2;
    t_pull_coord *pcrd;
    t_pull_group *pref, *pgrp, *pdyna;
    gmx_ga2la_t   ga2la = NULL;

    /* Allocate the reduction buffer the first time through */
    if (pull->dbuf_cyl == NULL)
        snew(pull->dbuf_cyl, pull->ncoord*stride);

    /* With domain decomposition the atom indices go through a lookup */
    if (cr && DOMAINDECOMP(cr))
        ga2la = cr->dd->ga2la;

    /* Only atoms with an index in [start, end) are considered below */
    start = 0;
    end   = md->homenr;

    inv_cyl_r2 = 1/dsqr(pull->cylinder_r);

    /* loop over all groups to make a reference group for each*/
    for (c = 0; c < pull->ncoord; c++)
        double sum_a, wmass, wwmass;
        dvec   radf_fac0, radf_fac1;

        pcrd   = &pull->coord[c];

        sum_a  = 0;
        wmass  = 0;
        wwmass = 0;

        if (pcrd->eGeom == epullgCYL)
            /* pref will be the same group for all pull coordinates */
            pref  = &pull->group[pcrd->group[0]];
            pgrp  = &pull->group[pcrd->group[1]];
            pdyna = &pull->dyna[c];
            copy_rvec(pcrd->vec, dir);
            pdyna->nat_loc = 0;

            /* We calculate distances with respect to the reference location
             * of this cylinder group (g_x), which we already have now since
             * we reduced the other group COM over the ranks. This resolves
             * any PBC issues and we don't need to use a PBC-atom here.
            for (m = 0; m < DIM; m++)
                g_x[m] = pgrp->x[m] - pcrd->vec[m]*(pcrd->init + pcrd->rate*t);

            /* loop over all atoms in the main ref group */
            for (i = 0; i < pref->nat; i++)
                ii = pref->ind[i];
                /* Look up the index; ii becomes -1 when the lookup fails,
                 * which makes the range check below skip the atom.
                 */
                if (ga2la)
                    if (!ga2la_get_home(ga2la, pref->ind[i], &ii))
                        ii = -1;
                if (ii >= start && ii < end)
                    double dr2, dr2_rel, inp;
                    dvec   dr;

                    pbc_dx_aiuc(pbc, x[ii], g_x, dx);
                    inp = iprod(dir, dx);
                    dr2 = 0;
                    for (m = 0; m < DIM; m++)
                        /* Determine the radial components */
                        dr[m] = dx[m] - inp*dir[m];
                        dr2  += dr[m]*dr[m];
                    dr2_rel = dr2*inv_cyl_r2;

                    if (dr2_rel < 1)
                        double mass, weight, dweight_r;
                        dvec   mdw;

                        /* add to index, to sum of COM, to weight array */
                        if (pdyna->nat_loc >= pdyna->nalloc_loc)
                            pdyna->nalloc_loc = over_alloc_large(pdyna->nat_loc+1);
                            srenew(pdyna->ind_loc,    pdyna->nalloc_loc);
                            srenew(pdyna->weight_loc, pdyna->nalloc_loc);
                            srenew(pdyna->mdw,        pdyna->nalloc_loc);
                            srenew(pdyna->dv,         pdyna->nalloc_loc);
                        pdyna->ind_loc[pdyna->nat_loc] = ii;

                        mass      = md->massT[ii];
                        /* The radial weight function is 1-2x^2+x^4,
                         * where x=r/cylinder_r. Since this function depends
                         * on the radial component, we also get radial forces
                         * on both groups.
                        weight    = 1 + (-2 + dr2_rel)*dr2_rel;
                        dweight_r = (-4 + 4*dr2_rel)*inv_cyl_r2;
                        pdyna->weight_loc[pdyna->nat_loc] = weight;
                        sum_a    += mass*weight*inp;
                        wmass    += mass*weight;
                        wwmass   += mass*weight*weight;
                        dsvmul(mass*dweight_r, dr, mdw);
                        copy_dvec(mdw, pdyna->mdw[pdyna->nat_loc]);
                        /* Currently we only have the axial component of the
                         * distance (inp) up to an unknown offset. We add this
                         * offset after the reduction needed to determine the
                         * COM of the cylinder group.
                        pdyna->dv[pdyna->nat_loc] = inp;
                        for (m = 0; m < DIM; m++)
                            radf_fac0[m] += mdw[m];
                            radf_fac1[m] += mdw[m]*inp;
        pull->dbuf_cyl[c*stride+0] = wmass;
        pull->dbuf_cyl[c*stride+1] = wwmass;
        pull->dbuf_cyl[c*stride+2] = sum_a;
        pull->dbuf_cyl[c*stride+3] = radf_fac0[XX];
        pull->dbuf_cyl[c*stride+4] = radf_fac0[YY];
        pull->dbuf_cyl[c*stride+5] = radf_fac0[ZZ];
        pull->dbuf_cyl[c*stride+6] = radf_fac1[XX];
        pull->dbuf_cyl[c*stride+7] = radf_fac1[YY];
        pull->dbuf_cyl[c*stride+8] = radf_fac1[ZZ];

    if (cr != NULL && PAR(cr))
        /* Sum the contributions over the ranks */
        gmx_sumd(pull->ncoord*stride, pull->dbuf_cyl, cr);

    /* Turn the buffer contents into the cylinder group properties */
    for (c = 0; c < pull->ncoord; c++)
        pcrd  = &pull->coord[c];

        if (pcrd->eGeom == epullgCYL)
            double wmass, wwmass, inp, dist;

            pdyna = &pull->dyna[c];
            pgrp  = &pull->group[pcrd->group[1]];

            wmass          = pull->dbuf_cyl[c*stride+0];
            wwmass         = pull->dbuf_cyl[c*stride+1];
            pdyna->mwscale = 1.0/wmass;
            /* Cylinder pulling can't be used with constraints, but we set
             * wscale and invtm anyhow, in case someone would like to use them.
            pdyna->wscale  = wmass/wwmass;
            pdyna->invtm   = wwmass/(wmass*wmass);

            /* We store the deviation of the COM from the reference location
             * used above, since we need it when we apply the radial forces
             * to the atoms in the cylinder group.
            pcrd->cyl_dev  = 0;
            for (m = 0; m < DIM; m++)
                g_x[m]         = pgrp->x[m] - pcrd->vec[m]*(pcrd->init + pcrd->rate*t);
                dist           = -pcrd->vec[m]*pull->dbuf_cyl[c*stride+2]*pdyna->mwscale;
                pdyna->x[m]    = g_x[m] - dist;
                pcrd->cyl_dev += dist;
            /* Now we know the exact COM of the cylinder reference group,
             * we can determine the radial force factor (ffrad) that when
             * multiplied with the axial pull force will give the radial
             * force on the pulled (non-cylinder) group.
            for (m = 0; m < DIM; m++)
                pcrd->ffrad[m] = (pull->dbuf_cyl[c*stride+6+m] +

            if (debug)
                fprintf(debug, "Pull cylinder group %d:%8.3f%8.3f%8.3f m:%8.3f\n",
                        c, pdyna->x[0], pdyna->x[1],
                        pdyna->x[2], 1.0/pdyna->invtm);
                fprintf(debug, "ffrad %8.3f %8.3f %8.3f\n",
                        pcrd->ffrad[XX], pcrd->ffrad[YY], pcrd->ffrad[ZZ]);