Example #1
static void reduce_thread_forces(int n, rvec *f,
                                 tensor vir_q, tensor vir_lj,
                                 real *Vcorr_q, real *Vcorr_lj,
                                 real *dvdl_q, real *dvdl_lj,
                                 int nthreads, f_thread_t *f_t)
{
    int t, i;
    int nthreads_loop gmx_unused;

    // cppcheck-suppress unreadVariable
    nthreads_loop = gmx_omp_nthreads_get(emntBonded);
    /* This reduction can run over any number of threads */
#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static)
    for (i = 0; i < n; i++)
    {
        for (t = 1; t < nthreads; t++)
        {
            rvec_inc(f[i], f_t[t].f[i]);
        }
    }
    for (t = 1; t < nthreads; t++)
    {
        *Vcorr_q  += f_t[t].Vcorr_q;
        *Vcorr_lj += f_t[t].Vcorr_lj;
        *dvdl_q   += f_t[t].dvdl[efptCOUL];
        *dvdl_lj  += f_t[t].dvdl[efptVDW];
        m_add(vir_q, f_t[t].vir_q, vir_q);
        m_add(vir_lj, f_t[t].vir_lj, vir_lj);
    }
}
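The reduction above is deliberately transposed: the OpenMP loop parallelizes over atoms i while the serial inner loop walks the threads t, so each f[i] has exactly one writer and no atomics or critical sections are needed. Below is a minimal standalone sketch of the same pattern, with plain double arrays standing in for GROMACS's rvec and f_thread_t types (all names here are illustrative, not GROMACS API):

#include <stdio.h>

#define NATOMS  8
#define NTHREAD 4

/* Reduce per-thread force buffers f_t[t][i] into f[i]; as in the
 * example above, thread 0's contribution is assumed to be in f[]
 * already. Parallelizing over i (not t) gives every f[i] a single
 * writer. t is declared outside the parallel region and therefore
 * listed as private, just like in reduce_thread_forces(). */
static void reduce_forces(double f[NATOMS], double f_t[NTHREAD][NATOMS])
{
    int i, t;

#pragma omp parallel for private(t) schedule(static)
    for (i = 0; i < NATOMS; i++)
    {
        for (t = 1; t < NTHREAD; t++)
        {
            f[i] += f_t[t][i];
        }
    }
}

int main(void)
{
    double f[NATOMS] = {0};
    double f_t[NTHREAD][NATOMS];
    int    i, t;

    for (t = 0; t < NTHREAD; t++)
    {
        for (i = 0; i < NATOMS; i++)
        {
            f_t[t][i] = t + 0.1*i;   /* dummy contributions */
        }
    }
    reduce_forces(f, f_t);
    printf("f[3] = %g (expect 6.9)\n", f[3]);   /* (1+2+3) + 3*0.3 */
    return 0;
}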
Example #2
void init_ekindata(FILE gmx_unused *log, gmx_mtop_t *mtop, t_grpopts *opts,
                   gmx_ekindata_t *ekind)
{
    int i;
    int nthread, thread;
#ifdef DEBUG
    fprintf(log, "ngtc: %d, ngacc: %d, ngener: %d\n", opts->ngtc, opts->ngacc,
            opts->ngener);
#endif

    /* bNEMD tells if we should remove the COM velocity
     * from the velocities during velocity scaling in T-coupling.
     * Turn this on when we have multiple acceleration groups
     * or one accelerated group.
     */
    ekind->bNEMD = (opts->ngacc > 1 || norm(opts->acc[0]) > 0);

    ekind->ngtc = opts->ngtc;
    snew(ekind->tcstat, opts->ngtc);
    init_grptcstat(opts->ngtc, ekind->tcstat);
    /* Set Berendsen tcoupl lambda's to 1,
     * so runs without Berendsen coupling are not affected.
     */
    for (i = 0; i < opts->ngtc; i++)
    {
        ekind->tcstat[i].lambda         = 1.0;
        ekind->tcstat[i].vscale_nhc     = 1.0;
        ekind->tcstat[i].ekinscaleh_nhc = 1.0;
        ekind->tcstat[i].ekinscalef_nhc = 1.0;
    }

    nthread = gmx_omp_nthreads_get(emntUpdate);

    snew(ekind->ekin_work_alloc, nthread);
    snew(ekind->ekin_work, nthread);
    snew(ekind->dekindl_work, nthread);
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
#define EKIN_WORK_BUFFER_SIZE 2
        /* Allocate 2 extra elements on both sides, so in single
         * precision we have
         * EKIN_WORK_BUFFER_SIZE*DIM*DIM*sizeof(real) = 72/144 bytes
         * buffer on both sides to avoid cache pollution.
         */
        snew(ekind->ekin_work_alloc[thread], ekind->ngtc+2*EKIN_WORK_BUFFER_SIZE);
        ekind->ekin_work[thread] = ekind->ekin_work_alloc[thread] + EKIN_WORK_BUFFER_SIZE;
        /* Nasty hack so we can have the per-thread accumulation
         * variable for dekindl in the same thread-local cache lines
         * as the per-thread accumulation tensors for ekin[fh],
         * because they are accumulated in the same loop. */
        ekind->dekindl_work[thread] = &(ekind->ekin_work[thread][ekind->ngtc][0][0]);
#undef EKIN_WORK_BUFFER_SIZE
    }

    ekind->ngacc = opts->ngacc;
    snew(ekind->grpstat, opts->ngacc);
    init_grpstat(mtop, opts->ngacc, ekind->grpstat);
}
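The EKIN_WORK_BUFFER_SIZE padding above guards against false sharing: each thread's live tensors get two unused tensor-sized elements on either side, so writes by neighbouring threads cannot land in the same cache line. A minimal sketch of the same allocation trick using plain calloc; the helper and its names are hypothetical, not GROMACS API:

#include <stdlib.h>

#define PAD_ELEMS 2                   /* guard elements on each side */

typedef double tensor33[3][3];        /* 72 bytes in double precision */

/* Allocate ngtc live tensors with PAD_ELEMS guard tensors on both
 * sides and return a pointer past the leading pad. With 2*72 = 144
 * bytes of padding per side, per-thread buffers obtained this way
 * cannot share a cache line with a neighbouring thread's buffers.
 * The raw pointer is stored in *alloc_out so the caller can free() it. */
static tensor33 *alloc_padded_work(int ngtc, tensor33 **alloc_out)
{
    tensor33 *alloc = calloc(ngtc + 2*PAD_ELEMS, sizeof(tensor33));

    *alloc_out = alloc;
    return alloc ? alloc + PAD_ELEMS : NULL;
}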
Example #3
void init_ekindata(FILE *log, gmx_mtop_t *mtop, t_grpopts *opts,
                   gmx_ekindata_t *ekind)
{
    int i;
    int nthread, thread;
#ifdef DEBUG
    fprintf(log, "ngtc: %d, ngacc: %d, ngener: %d\n", opts->ngtc, opts->ngacc,
            opts->ngener);
#endif

    /* bNEMD tells if we should remove the COM velocity
     * from the velocities during velocity scaling in T-coupling.
     * Turn this on when we have multiple acceleration groups
     * or one accelerated group.
     */
    ekind->bNEMD = (opts->ngacc > 1 || norm(opts->acc[0]) > 0);

    ekind->ngtc = opts->ngtc;
    snew(ekind->tcstat, opts->ngtc);
    init_grptcstat(opts->ngtc, ekind->tcstat);
    /* Set Berendsen tcoupl lambda's to 1,
     * so runs without Berendsen coupling are not affected.
     */
    for (i = 0; i < opts->ngtc; i++)
    {
        ekind->tcstat[i].lambda         = 1.0;
        ekind->tcstat[i].vscale_nhc     = 1.0;
        ekind->tcstat[i].ekinscaleh_nhc = 1.0;
        ekind->tcstat[i].ekinscalef_nhc = 1.0;
    }

    nthread = gmx_omp_nthreads_get(emntUpdate);

    snew(ekind->ekin_work_alloc, nthread);
    snew(ekind->ekin_work, nthread);
    #pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        /* Allocate 2 elements extra on both sides,
         * so in single precision we have 2*3*3*4=72 bytes buffer
         * on both sides to avoid cache pollution.
         */
        snew(ekind->ekin_work_alloc[thread], ekind->ngtc+4);
        ekind->ekin_work[thread] = ekind->ekin_work_alloc[thread] + 2;
    }

    ekind->ngacc = opts->ngacc;
    snew(ekind->grpstat, opts->ngacc);
    init_grpstat(log, mtop, opts->ngacc, ekind->grpstat);
}
Example #4
void init_domdec_constraints(gmx_domdec_t *dd,
                             gmx_mtop_t *mtop,
                             gmx_constr_t constr)
{
    gmx_domdec_constraints_t *dc;
    gmx_molblock_t *molb;
    int mb, ncon, c, a;

    if (debug)
    {
        fprintf(debug, "Begin init_domdec_constraints\n");
    }

    snew(dd->constraints, 1);
    dc = dd->constraints;

    snew(dc->molb_con_offset, mtop->nmolblock);
    snew(dc->molb_ncon_mol, mtop->nmolblock);

    ncon = 0;
    for (mb = 0; mb < mtop->nmolblock; mb++)
    {
        molb = &mtop->molblock[mb];
        dc->molb_con_offset[mb] = ncon;
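        /* Each F_CONSTR/F_CONSTRNC ilist entry is 3 ints (interaction
         * type plus 2 atom indices), hence the division by 3 below to
         * count constraints per molecule. */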
        dc->molb_ncon_mol[mb] =
            mtop->moltype[molb->type].ilist[F_CONSTR].nr/3 +
            mtop->moltype[molb->type].ilist[F_CONSTRNC].nr/3;
        ncon += molb->nmol*dc->molb_ncon_mol[mb];
    }
    
    if (ncon > 0)
    {
        snew(dc->gc_req, ncon);
        for (c = 0; c < ncon; c++)
        {
            dc->gc_req[c] = 0;
        }
    }

    /* Use a hash table for the global to local index.
     * The number of keys is a rough estimate, it will be optimized later.
     */
    dc->ga2la = gmx_hash_init(min(mtop->natoms/20,
                                  mtop->natoms/(2*dd->nnodes)));

    dc->nthread = gmx_omp_nthreads_get(emntDomdec);
    snew(dc->ils, dc->nthread);

    dd->constraint_comm = specat_comm_init(dc->nthread);
}
Example #5
void put_atoms_in_box_omp(int ePBC, matrix box, int natoms, rvec x[])
{
    int t, nth;
    nth = gmx_omp_nthreads_get(emntDefault);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (t = 0; t < nth; t++)
    {
        int offset, len;

        offset = (natoms*t    )/nth;
        len    = (natoms*(t + 1))/nth - offset;
        put_atoms_in_box(ePBC, box, len, x + offset);
    }
}
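The offset/len arithmetic above is the standard balanced partition of n items over nth workers: thread t handles the half-open range [n*t/nth, n*(t+1)/nth). Successive offsets telescope, so the chunks are disjoint, cover all n items, and differ in length by at most one. The same computation isolated as a sketch (names hypothetical):

/* Bounds of thread t's chunk when n items are split over nth threads.
 * Each chunk ends exactly where the next one starts, so the whole
 * range [0, n) is covered exactly once, with no gaps or overlaps. */
static void chunk_bounds(int n, int nth, int t, int *offset, int *len)
{
    *offset = (n*t)/nth;
    *len    = (n*(t + 1))/nth - *offset;
}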
Example #6
/* Set CPU affinity. Can be important for performance.
   On some systems (e.g. Cray) CPU affinity is set by default.
   But the default assignment doesn't work (well) when only some
   ranks have threads, which causes very low performance.
   External tools have cumbersome syntax for setting affinity
   in the case that only some ranks have threads.
   Thus it is important that GROMACS sets the affinity internally
   if only PME is using threads.
 */
void
gmx_set_thread_affinity(FILE                *fplog,
                        const t_commrec     *cr,
                        gmx_hw_opt_t        *hw_opt,
                        int                  nthreads_pme,
                        const gmx_hw_info_t *hwinfo,
                        const t_inputrec    *inputrec)
{
    int        nth_affinity_set, thread_id_node,
               nthread_local, nthread_node, nthread_hw_max, nphyscore;
    int        offset;
    const int *locality_order;
    int        rc;

    if (hw_opt->thread_affinity == threadaffOFF)
    {
        /* Nothing to do */
        return;
    }

    /* If the tMPI thread affinity setting is not supported encourage the user
     * to report it as it's either a bug or an exotic platform which we might
     * want to support. */
    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
    {
        /* we know Mac OS doesn't support setting thread affinity, so there's
           no point in warning the user in that case. In any other case
           the user might be able to do something about it. */
#ifndef __APPLE__
        md_print_warn(NULL, fplog,
                      "Can not set thread affinities on the current platform. On NUMA systems this\n"
                      "can cause performance degradation. If you think your platform should support\n"
                      "setting affinities, contact the GROMACS developers.");
#endif  /* __APPLE__ */
        return;
    }

    /* threads on this MPI process or TMPI thread */
    if (cr->duty & DUTY_PP)
    {
        nthread_local = gmx_omp_nthreads_get(emntNonbonded);
    }
    else
    {
        nthread_local = gmx_omp_nthreads_get(emntPME);
    }

    /* map the current process to cores */
    thread_id_node = 0;
    nthread_node   = nthread_local;
#ifdef GMX_MPI
    if (PAR(cr) || MULTISIM(cr))
    {
        /* We need to determine a scan of the thread counts in this
         * compute node.
         */
        MPI_Comm comm_intra;

        MPI_Comm_split(MPI_COMM_WORLD, gmx_hostname_num(), cr->rank_intranode,
                       &comm_intra);
        MPI_Scan(&nthread_local, &thread_id_node, 1, MPI_INT, MPI_SUM, comm_intra);
        /* MPI_Scan is inclusive, but here we need exclusive */
        thread_id_node -= nthread_local;
        /* Get the total number of threads on this physical node */
        MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
        MPI_Comm_free(&comm_intra);
    }
#endif

    if (hw_opt->thread_affinity == threadaffAUTO &&
        nthread_node != hwinfo->nthreads_hw_avail)
    {
        if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
        {
            md_print_warn(cr, fplog,
                          "NOTE: The number of threads is not equal to the number of (logical) cores\n"
                          "      and the -pin option is set to auto: will not pin thread to cores.\n"
                          "      This can lead to significant performance degradation.\n"
                          "      Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
        }

        return;
    }

    offset = 0;
    if (hw_opt->core_pinning_offset != 0)
    {
        offset = hw_opt->core_pinning_offset;
        md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
    }

    rc = get_thread_affinity_layout(fplog, cr, hwinfo,
                                    nthread_node,
                                    offset, &hw_opt->core_pinning_stride,
                                    &locality_order);

    if (rc != 0)
    {
        /* Incompatible layout, don't pin, warning was already issued */
        return;
    }

    /* Set the per-thread affinity. In order to be able to check the success
     * of affinity settings, we will set nth_affinity_set to 1 on threads
     * where the affinity setting succeeded and to 0 where it failed.
     * Reducing these 0/1 values over the threads will give the total number
     * of threads on which we succeeded.
     */
    nth_affinity_set = 0;
#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
    reduction(+:nth_affinity_set)
    {
        /* thread_id must be private to each thread; declaring it inside
         * the parallel region avoids a data race on a shared variable. */
        int      thread_id, index, core;
        gmx_bool setaffinity_ret;

        thread_id       = gmx_omp_get_thread_num();
        thread_id_node += thread_id;
        index           = offset + thread_id_node*hw_opt->core_pinning_stride;
        if (locality_order != NULL)
        {
            core = locality_order[index];
        }
        else
        {
            core = index;
        }

        setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);

        /* store the per-thread success-values of the setaffinity */
        nth_affinity_set = (setaffinity_ret == 0);

        if (debug)
        {
            fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
                    cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
        }
    }

    if (nth_affinity_set > nthread_local)
    {
        char msg[STRLEN];

        sprintf(msg, "Looks like we have set affinity for more threads than "
                "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
        gmx_incons(msg);
    }
    else
    {
        /* check & warn if some threads failed to set their affinities */
        if (nth_affinity_set != nthread_local)
        {
            char sbuf1[STRLEN], sbuf2[STRLEN];

            /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
            sbuf1[0] = sbuf2[0] = '\0';
            /* Only add rank info if we have more than one rank. */
            if (cr->nnodes > 1)
            {
#ifdef GMX_MPI
#ifdef GMX_THREAD_MPI
                sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
#else           /* GMX_LIB_MPI */
                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
#endif
#endif          /* GMX_MPI */
            }

            if (nthread_local > 1)
            {
                sprintf(sbuf2, "for %d/%d thread%s ",
                        nthread_local - nth_affinity_set, nthread_local,
                        nthread_local > 1 ? "s" : "");
            }

            md_print_warn(NULL, fplog,
                          "WARNING: %sAffinity setting %sfailed.\n"
                          "         This can cause performance degradation! If you think your setting are\n"
                          "         correct, contact the GROMACS developers.",
                          sbuf1, sbuf2);
        }
    }
    return;
}
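A detail worth isolating from the function above: MPI_Scan computes an inclusive prefix sum, but the code needs the exclusive one (the number of threads on the lower-ranked processes of the node), so it subtracts the local contribution from the scan result. A minimal sketch of that idiom, assuming an initialized communicator:

#include <mpi.h>

/* Exclusive prefix sum of 'local' over 'comm': the sum of the values
 * on all lower-ranked processes. MPI_Scan is inclusive, so we simply
 * subtract our own contribution from its result. */
static int exclusive_scan_int(int local, MPI_Comm comm)
{
    int inclusive;

    MPI_Scan(&local, &inclusive, 1, MPI_INT, MPI_SUM, comm);
    return inclusive - local;
}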
Example #7
gmx_bool constrain(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
                   struct gmx_constr *constr,
                   t_idef *idef, t_inputrec *ir, gmx_ekindata_t *ekind,
                   t_commrec *cr,
                   gmx_int64_t step, int delta_step,
                   t_mdatoms *md,
                   rvec *x, rvec *xprime, rvec *min_proj,
                   gmx_bool bMolPBC, matrix box,
                   real lambda, real *dvdlambda,
                   rvec *v, tensor *vir,
                   t_nrnb *nrnb, int econq, gmx_bool bPscal,
                   real veta, real vetanew)
{
    gmx_bool    bOK, bDump;
    int         start, homenr, nrend;
    int         i, j, d;
    int         ncons, settle_error;
    tensor      vir_r_m_dr;
    rvec       *vstor;
    real        invdt, vir_fac, t;
    t_ilist    *settle;
    int         nsettle;
    t_pbc       pbc, *pbc_null;
    char        buf[22];
    t_vetavars  vetavar;
    int         nth, th;

    if (econq == econqForceDispl && !EI_ENERGY_MINIMIZATION(ir->eI))
    {
        gmx_incons("constrain called for forces displacements while not doing energy minimization, can not do this while the LINCS and SETTLE constraint connection matrices are mass weighted");
    }

    bOK   = TRUE;
    bDump = FALSE;

    start  = 0;
    homenr = md->homenr;
    nrend  = start+homenr;

    /* set constants for pressure control integration */
    init_vetavars(&vetavar, econq != econqCoord,
                  veta, vetanew, ir, ekind, bPscal);

    if (ir->delta_t == 0)
    {
        invdt = 0;
    }
    else
    {
        invdt  = 1/ir->delta_t;
    }

    if (ir->efep != efepNO && EI_DYNAMICS(ir->eI))
    {
        /* Set the constraint lengths for the step at which this configuration
         * is meant to be. The invmasses should not be changed.
         */
        lambda += delta_step*ir->fepvals->delta_lambda;
    }

    if (vir != NULL)
    {
        clear_mat(vir_r_m_dr);
    }

    where();

    settle  = &idef->il[F_SETTLE];
    nsettle = settle->nr/(1+NRAL(F_SETTLE));

    if (nsettle > 0)
    {
        nth = gmx_omp_nthreads_get(emntSETTLE);
    }
    else
    {
        nth = 1;
    }

    if (nth > 1 && constr->vir_r_m_dr_th == NULL)
    {
        snew(constr->vir_r_m_dr_th, nth);
        snew(constr->settle_error, nth);
    }

    settle_error = -1;

    /* We do not need full pbc when constraints do not cross charge groups,
     * i.e. when dd->constraint_comm==NULL.
     * Note that PBC for constraints is different from PBC for bondeds.
     * For constraints there is both forward and backward communication.
     */
    if (ir->ePBC != epbcNONE &&
        (cr->dd || bMolPBC) && !(cr->dd && cr->dd->constraint_comm == NULL))
    {
        /* With pbc=screw the screw has been changed to a shift
         * by the constraint coordinate communication routine,
         * so that here we can use normal pbc.
         */
        pbc_null = set_pbc_dd(&pbc, ir->ePBC, cr->dd, FALSE, box);
    }
    else
    {
        pbc_null = NULL;
    }

    /* Communicate the coordinates required for the non-local constraints
     * for LINCS and/or SETTLE.
     */
    if (cr->dd)
    {
        dd_move_x_constraints(cr->dd, box, x, xprime, econq == econqCoord);
    }

    if (constr->lincsd != NULL)
    {
        bOK = constrain_lincs(fplog, bLog, bEner, ir, step, constr->lincsd, md, cr,
                              x, xprime, min_proj,
                              box, pbc_null, lambda, dvdlambda,
                              invdt, v, vir != NULL, vir_r_m_dr,
                              econq, nrnb,
                              constr->maxwarn, &constr->warncount_lincs);
        if (!bOK && constr->maxwarn >= 0)
        {
            if (fplog != NULL)
            {
                fprintf(fplog, "Constraint error in algorithm %s at step %s\n",
                        econstr_names[econtLINCS], gmx_step_str(step, buf));
            }
            bDump = TRUE;
        }
    }

    if (constr->nblocks > 0)
    {
        switch (econq)
        {
            case (econqCoord):
                bOK = bshakef(fplog, constr->shaked,
                              md->invmass, constr->nblocks, constr->sblock,
                              idef, ir, x, xprime, nrnb,
                              constr->lagr, lambda, dvdlambda,
                              invdt, v, vir != NULL, vir_r_m_dr,
                              constr->maxwarn >= 0, econq, &vetavar);
                break;
            case (econqVeloc):
                bOK = bshakef(fplog, constr->shaked,
                              md->invmass, constr->nblocks, constr->sblock,
                              idef, ir, x, min_proj, nrnb,
                              constr->lagr, lambda, dvdlambda,
                              invdt, NULL, vir != NULL, vir_r_m_dr,
                              constr->maxwarn >= 0, econq, &vetavar);
                break;
            default:
                gmx_fatal(FARGS, "Internal error, SHAKE called for constraining something else than coordinates");
                break;
        }

        if (!bOK && constr->maxwarn >= 0)
        {
            if (fplog != NULL)
            {
                fprintf(fplog, "Constraint error in algorithm %s at step %s\n",
                        econstr_names[econtSHAKE], gmx_step_str(step, buf));
            }
            bDump = TRUE;
        }
    }

    if (nsettle > 0)
    {
        int calcvir_atom_end;

        if (vir == NULL)
        {
            calcvir_atom_end = 0;
        }
        else
        {
            calcvir_atom_end = md->homenr;
        }

        switch (econq)
        {
            case econqCoord:
#pragma omp parallel for num_threads(nth) schedule(static)
                for (th = 0; th < nth; th++)
                {
                    int start_th, end_th;

                    if (th > 0)
                    {
                        clear_mat(constr->vir_r_m_dr_th[th]);
                    }

                    start_th = (nsettle* th   )/nth;
                    end_th   = (nsettle*(th+1))/nth;
                    if (start_th >= 0 && end_th - start_th > 0)
                    {
                        csettle(constr->settled,
                                end_th-start_th,
                                settle->iatoms+start_th*(1+NRAL(F_SETTLE)),
                                pbc_null,
                                x[0], xprime[0],
                                invdt, v ? v[0] : NULL, calcvir_atom_end,
                                th == 0 ? vir_r_m_dr : constr->vir_r_m_dr_th[th],
                                th == 0 ? &settle_error : &constr->settle_error[th],
                                &vetavar);
                    }
                }
                inc_nrnb(nrnb, eNR_SETTLE, nsettle);
                if (v != NULL)
                {
                    inc_nrnb(nrnb, eNR_CONSTR_V, nsettle*3);
                }
                if (vir != NULL)
                {
                    inc_nrnb(nrnb, eNR_CONSTR_VIR, nsettle*3);
                }
                break;
            case econqVeloc:
            case econqDeriv:
            case econqForce:
            case econqForceDispl:
#pragma omp parallel for num_threads(nth) schedule(static)
                for (th = 0; th < nth; th++)
                {
                    int start_th, end_th;

                    if (th > 0)
                    {
                        clear_mat(constr->vir_r_m_dr_th[th]);
                    }

                    start_th = (nsettle* th   )/nth;
                    end_th   = (nsettle*(th+1))/nth;

                    if (start_th >= 0 && end_th - start_th > 0)
                    {
                        settle_proj(constr->settled, econq,
                                    end_th-start_th,
                                    settle->iatoms+start_th*(1+NRAL(F_SETTLE)),
                                    pbc_null,
                                    x,
                                    xprime, min_proj, calcvir_atom_end,
                                    th == 0 ? vir_r_m_dr : constr->vir_r_m_dr_th[th],
                                    &vetavar);
                    }
                }
                /* This is an overestimate */
                inc_nrnb(nrnb, eNR_SETTLE, nsettle);
                break;
            case econqDeriv_FlexCon:
                /* Nothing to do, since there are no flexible constraints in settles */
                break;
            default:
                gmx_incons("Unknown constraint quantity for settle");
        }
    }

    if (settle->nr > 0)
    {
        /* Combine virial and error info of the other threads */
        for (i = 1; i < nth; i++)
        {
            m_add(vir_r_m_dr, constr->vir_r_m_dr_th[i], vir_r_m_dr);
            settle_error = constr->settle_error[i];
        }

        if (econq == econqCoord && settle_error >= 0)
        {
            bOK = FALSE;
            if (constr->maxwarn >= 0)
            {
                char buf[256];
                sprintf(buf,
                        "\nstep " "%"GMX_PRId64 ": Water molecule starting at atom %d can not be "
                        "settled.\nCheck for bad contacts and/or reduce the timestep if appropriate.\n",
                        step, ddglatnr(cr->dd, settle->iatoms[settle_error*(1+NRAL(F_SETTLE))+1]));
                if (fplog)
                {
                    fprintf(fplog, "%s", buf);
                }
                fprintf(stderr, "%s", buf);
                constr->warncount_settle++;
                if (constr->warncount_settle > constr->maxwarn)
                {
                    too_many_constraint_warnings(-1, constr->warncount_settle);
                }
                bDump = TRUE;
            }
        }
    }

    free_vetavars(&vetavar);

    if (vir != NULL)
    {
        switch (econq)
        {
            case econqCoord:
                vir_fac = 0.5/(ir->delta_t*ir->delta_t);
                break;
            case econqVeloc:
                vir_fac = 0.5/ir->delta_t;
                break;
            case econqForce:
            case econqForceDispl:
                vir_fac = 0.5;
                break;
            default:
                vir_fac = 0;
                gmx_incons("Unsupported constraint quantity for virial");
        }

        if (EI_VV(ir->eI))
        {
            vir_fac *= 2;  /* only constraining over half the distance here */
        }
        for (i = 0; i < DIM; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                (*vir)[i][j] = vir_fac*vir_r_m_dr[i][j];
            }
        }
    }

    if (bDump)
    {
        dump_confs(fplog, step, constr->warn_mtop, start, homenr, cr, x, xprime, box);
    }

    if (econq == econqCoord)
    {
        if (ir->ePull == epullCONSTRAINT)
        {
            if (EI_DYNAMICS(ir->eI))
            {
                t = ir->init_t + (step + delta_step)*ir->delta_t;
            }
            else
            {
                t = ir->init_t;
            }
            set_pbc(&pbc, ir->ePBC, box);
            pull_constraint(ir->pull, md, &pbc, cr, ir->delta_t, t, x, xprime, v, *vir);
        }
        if (constr->ed && delta_step > 0)
        {
            /* apply the essential dynamics constraints here */
            do_edsam(ir, step, cr, xprime, v, box, constr->ed);
        }
    }

    return bOK;
}
Example #8
void
nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t      gmx_unused *nbl_list,
                       const nbnxn_atomdata_t    gmx_unused *nbat,
                       const interaction_const_t gmx_unused *ic,
                       int                       gmx_unused  ewald_excl,
                       rvec                      gmx_unused *shift_vec,
                       int                       gmx_unused  force_flags,
                       int                       gmx_unused  clearF,
                       real                      gmx_unused *fshift,
                       real                      gmx_unused *Vc,
                       real                      gmx_unused *Vvdw)
#ifdef GMX_NBNXN_SIMD_2XNN
{
    int                nnbl;
    nbnxn_pairlist_t **nbl;
    int                coulkt, vdwkt = 0;
    int                nb;
    int                nthreads gmx_unused;

    nnbl = nbl_list->nnbl;
    nbl  = nbl_list->nbl;

    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        coulkt = coulktRF;
    }
    else
    {
        if (ewald_excl == ewaldexclTable)
        {
            if (ic->rcoulomb == ic->rvdw)
            {
                coulkt = coulktTAB;
            }
            else
            {
                coulkt = coulktTAB_TWIN;
            }
        }
        else
        {
            if (ic->rcoulomb == ic->rvdw)
            {
                coulkt = coulktEWALD;
            }
            else
            {
                coulkt = coulktEWALD_TWIN;
            }
        }
    }

    if (ic->vdwtype == evdwCUT)
    {
        switch (ic->vdw_modifier)
        {
            case eintmodNONE:
            case eintmodPOTSHIFT:
                switch (nbat->comb_rule)
                {
                    case ljcrGEOM: vdwkt = vdwktLJCUT_COMBGEOM; break;
                    case ljcrLB:   vdwkt = vdwktLJCUT_COMBLB;   break;
                    case ljcrNONE: vdwkt = vdwktLJCUT_COMBNONE; break;
                    default:       gmx_incons("Unknown combination rule");
                }
                break;
            case eintmodFORCESWITCH:
                vdwkt = vdwktLJFORCESWITCH;
                break;
            case eintmodPOTSWITCH:
                vdwkt = vdwktLJPOTSWITCH;
                break;
            default:
                gmx_incons("Unsupported VdW interaction modifier");
        }
    }
    else if (ic->vdwtype == evdwPME)
    {
        if (ic->ljpme_comb_rule == eljpmeLB)
        {
            gmx_incons("The nbnxn SIMD kernels don't suport LJ-PME with LB");
        }
        vdwkt = vdwktLJEWALDCOMBGEOM;
    }
    else
    {
        gmx_incons("Unsupported VdW interaction type");
    }

    nthreads = gmx_omp_nthreads_get(emntNonbonded);
#pragma omp parallel for schedule(static) num_threads(nthreads)
    for (nb = 0; nb < nnbl; nb++)
    {
        nbnxn_atomdata_output_t *out;
        real                    *fshift_p;

        out = &nbat->out[nb];

        if (clearF == enbvClearFYes)
        {
            clear_f(nbat, nb, out->f);
        }

        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
        {
            fshift_p = fshift;
        }
        else
        {
            fshift_p = out->fshift;

            if (clearF == enbvClearFYes)
            {
                clear_fshift(fshift_p);
            }
        }

        if (!(force_flags & GMX_FORCE_ENERGY))
        {
            /* Don't calculate energies */
            p_nbk_noener[coulkt][vdwkt](nbl[nb], nbat,
                                        ic,
                                        shift_vec,
                                        out->f,
                                        fshift_p);
        }
        else if (out->nV == 1)
        {
            /* No energy groups */
            out->Vvdw[0] = 0;
            out->Vc[0]   = 0;

            p_nbk_ener[coulkt][vdwkt](nbl[nb], nbat,
                                      ic,
                                      shift_vec,
                                      out->f,
                                      fshift_p,
                                      out->Vvdw,
                                      out->Vc);
        }
        else
        {
            /* Calculate energy group contributions */
            int i;

            for (i = 0; i < out->nVS; i++)
            {
                out->VSvdw[i] = 0;
            }
            for (i = 0; i < out->nVS; i++)
            {
                out->VSc[i] = 0;
            }

            p_nbk_energrp[coulkt][vdwkt](nbl[nb], nbat,
                                         ic,
                                         shift_vec,
                                         out->f,
                                         fshift_p,
                                         out->VSvdw,
                                         out->VSc);

            reduce_group_energies(nbat->nenergrp, nbat->neg_2log,
                                  out->VSvdw, out->VSc,
                                  out->Vvdw, out->Vc);
        }
    }

    if (force_flags & GMX_FORCE_ENERGY)
    {
        reduce_energies_over_lists(nbat, nnbl, Vvdw, Vc);
    }
}
#else /* GMX_NBNXN_SIMD_2XNN */
{
    /* Without the matching #else/#endif the guarded function body above
     * would not compile when these SIMD kernels are disabled. */
    gmx_incons("nbnxn_kernel_simd_2xnn called when the corresponding SIMD "
               "kernels are not enabled");
}
#endif /* GMX_NBNXN_SIMD_2XNN */
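Like the reference kernel in the next example, this function first reduces the interaction settings to a pair of small indices (coulkt, vdwkt) and then dispatches every pair list through a table of kernel function pointers (p_nbk_noener[coulkt][vdwkt] and friends). A self-contained sketch of that dispatch pattern; all kernel names below are made up:

#include <stdio.h>

typedef void (*nb_kernel_t)(int npairs);

static void kernel_rf_cut(int n)  { printf("RF/cut:  %d pairs\n", n); }
static void kernel_rf_sw(int n)   { printf("RF/sw:   %d pairs\n", n); }
static void kernel_tab_cut(int n) { printf("tab/cut: %d pairs\n", n); }
static void kernel_tab_sw(int n)  { printf("tab/sw:  %d pairs\n", n); }

enum { coulRF, coulTAB, coulNR };
enum { vdwCUT, vdwSWITCH, vdwNR };

/* Classify once, dispatch many times: the nested switches run a single
 * time per call, and the hot loop is a direct indexed function call. */
static nb_kernel_t kernels[coulNR][vdwNR] = {
    { kernel_rf_cut,  kernel_rf_sw  },
    { kernel_tab_cut, kernel_tab_sw },
};

int main(void)
{
    int coulkt = coulTAB;     /* derived from the interaction constants */
    int vdwkt  = vdwCUT;
    int nb;

    for (nb = 0; nb < 3; nb++)
    {
        kernels[coulkt][vdwkt](128 + nb);
    }
    return 0;
}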
Example #9
void
nbnxn_kernel_ref(const nbnxn_pairlist_set_t *nbl_list,
                 const nbnxn_atomdata_t     *nbat,
                 const interaction_const_t  *ic,
                 rvec                       *shift_vec,
                 int                         force_flags,
                 int                         clearF,
                 real                       *fshift,
                 real                       *Vc,
                 real                       *Vvdw)
{
    int                nnbl;
    nbnxn_pairlist_t **nbl;
    int                coult;
    int                vdwt;
    int                nb;
    int                nthreads gmx_unused;

    nnbl = nbl_list->nnbl;
    nbl  = nbl_list->nbl;

    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        coult = coultRF;
    }
    else
    {
        if (ic->rcoulomb == ic->rvdw)
        {
            coult = coultTAB;
        }
        else
        {
            coult = coultTAB_TWIN;
        }
    }

    if (ic->vdwtype == evdwCUT)
    {
        switch (ic->vdw_modifier)
        {
            case eintmodPOTSHIFT:
            case eintmodNONE:
                vdwt = vdwtCUT;
                break;
            case eintmodFORCESWITCH:
                vdwt = vdwtFSWITCH;
                break;
            case eintmodPOTSWITCH:
                vdwt = vdwtPSWITCH;
                break;
            default:
                gmx_incons("Unsupported VdW modifier");
                break;
        }
    }
    else if (ic->vdwtype == evdwPME)
    {
        if (ic->ljpme_comb_rule == ljcrGEOM)
        {
            assert(nbat->comb_rule == ljcrGEOM);
            vdwt = vdwtEWALDGEOM;
        }
        else
        {
            assert(nbat->comb_rule == ljcrLB);
            vdwt = vdwtEWALDLB;
        }
    }
    else
    {
        gmx_incons("Unsupported vdwtype in nbnxn reference kernel");
    }

    // cppcheck-suppress unreadVariable
    nthreads = gmx_omp_nthreads_get(emntNonbonded);
#pragma omp parallel for schedule(static) num_threads(nthreads)
    for (nb = 0; nb < nnbl; nb++)
    {
        // Presently, the kernels do not call C++ code that can throw, so
        // no need for a try/catch pair in this OpenMP region.
        nbnxn_atomdata_output_t *out;
        real                    *fshift_p;

        out = &nbat->out[nb];

        if (clearF == enbvClearFYes)
        {
            clear_f(nbat, nb, out->f);
        }

        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
        {
            fshift_p = fshift;
        }
        else
        {
            fshift_p = out->fshift;

            if (clearF == enbvClearFYes)
            {
                clear_fshift(fshift_p);
            }
        }

        if (!(force_flags & GMX_FORCE_ENERGY))
        {
            /* Don't calculate energies */
            p_nbk_c_noener[coult][vdwt](nbl[nb], nbat,
                                        ic,
                                        shift_vec,
                                        out->f,
                                        fshift_p);
        }
        else if (out->nV == 1)
        {
            /* No energy groups */
            out->Vvdw[0] = 0;
            out->Vc[0]   = 0;

            p_nbk_c_ener[coult][vdwt](nbl[nb], nbat,
                                      ic,
                                      shift_vec,
                                      out->f,
                                      fshift_p,
                                      out->Vvdw,
                                      out->Vc);
        }
        else
        {
            /* Calculate energy group contributions */
            int i;

            for (i = 0; i < out->nV; i++)
            {
                out->Vvdw[i] = 0;
            }
            for (i = 0; i < out->nV; i++)
            {
                out->Vc[i] = 0;
            }

            p_nbk_c_energrp[coult][vdwt](nbl[nb], nbat,
                                         ic,
                                         shift_vec,
                                         out->f,
                                         fshift_p,
                                         out->Vvdw,
                                         out->Vc);
        }
    }

    if (force_flags & GMX_FORCE_ENERGY)
    {
        reduce_energies_over_lists(nbat, nnbl, Vvdw, Vc);
    }
}
Example #10
void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  *hw_opt,
                                        gmx_bool             bNtOmpOptionSet,
                                        t_commrec           *cr,
                                        FILE                *fplog)
{
#if defined GMX_OPENMP && defined GMX_MPI
    int         nth_omp_min, nth_omp_max, ngpu;
    char        buf[1000];
#ifdef GMX_THREAD_MPI
    const char *mpi_option = " (option -ntmpi)";
#else
    const char *mpi_option = "";
#endif

    /* This function should be called after thread-MPI (when configured) and
     * OpenMP have been initialized. Check that here.
     */
#ifdef GMX_THREAD_MPI
    GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
    GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
#endif
    GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");

    nth_omp_min = gmx_omp_nthreads_get(emntDefault);
    nth_omp_max = gmx_omp_nthreads_get(emntDefault);
    ngpu        = hw_opt->gpu_opt.n_dev_use;

    /* Thread-MPI seems to have a bug with reduce on 1 node,
     * so only reduce when there is more than one rank. */
    if (cr->nnodes + cr->npmenodes > 1)
    {
        int count[3], count_max[3];

        count[0] = -nth_omp_min;
        count[1] =  nth_omp_max;
        count[2] =  ngpu;

        MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);

        /* In case of an inhomogeneous run setup we use the maximum counts */
        nth_omp_min = -count_max[0];
        nth_omp_max =  count_max[1];
        ngpu        =  count_max[2];
    }

    int nthreads_omp_mpi_ok_min;

    if (ngpu == 0)
    {
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
    }
    else
    {
        /* With GPUs we set the minimum number of OpenMP threads to 2 to catch
         * cases where the user specifies #ranks == #cores.
         */
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu;
    }

    if (DOMAINDECOMP(cr) && cr->nnodes > 1)
    {
        if (nth_omp_max < nthreads_omp_mpi_ok_min ||
            (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
             nth_omp_max > nthreads_omp_mpi_ok_max))
        {
            /* Note that we print target_max here, not ok_max */
            sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
                    nth_omp_max,
                    nthreads_omp_mpi_ok_min,
                    nthreads_omp_mpi_target_max);

            if (bNtOmpOptionSet)
            {
                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
            }
            else
            {
                /* This fatal error, and the one below, is nasty, but it's
                 * probably the only way to ensure that all users don't waste
                 * a lot of resources, since many users don't read logs/stderr.
                 */
                gmx_fatal(FARGS, "%s If you want to run with this setup, specify the -ntomp option. But we suggest to change the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
    else
    {
        /* No domain decomposition (or only one domain) */
        if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
            nth_omp_max > nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
        {
            /* To arrive here, the user/system set #ranks and/or #OMPthreads */
            gmx_bool bEnvSet;
            char     buf2[256];

            bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);

            if (bNtOmpOptionSet || bEnvSet)
            {
                sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
            }
            else
            {
                sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
                        cr->nnodes + cr->npmenodes,
                        cr->nnodes + cr->npmenodes == 1 ? "" : "s",
                        hw_opt->nthreads_tot > 0 ? hw_opt->nthreads_tot : hwinfo->nthreads_hw_avail,
                        hwinfo->nphysicalnode > 1 ? "on a node " : "",
                        nth_omp_max);
            }
            sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
                    buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);

            /* We can not quit with a fatal error when OMP_NUM_THREADS is set
             * with different values per rank or node, since in that case
             * the user can not set -ntomp to override the error.
             */
            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
            {
                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
            }
            else
            {
                gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest to increase the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
#else /* GMX_OPENMP && GMX_MPI */
      /* No OpenMP and/or MPI: it doesn't make much sense to check */
    GMX_UNUSED_VALUE(hw_opt);
    GMX_UNUSED_VALUE(bNtOmpOptionSet);
    /* Check if we have more than 1 physical core, if detected,
     * or more than 1 hardware thread if physical cores were not detected.
     */
#if !(defined GMX_OPENMP) && !(defined GMX_MPI)
    if ((hwinfo->ncore > 1) ||
        (hwinfo->ncore == 0 && hwinfo->nthreads_hw_avail > 1))
    {
        md_print_warn(cr, fplog, "NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core\n");
    }
#else
    GMX_UNUSED_VALUE(hwinfo);
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(fplog);
#endif

#endif /* GMX_OPENMP && GMX_MPI */
}
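The count[]/count_max[] exchange above collects both the minimum and the maximum over ranks with a single collective: since max(-x) = -min(x), negating the entries whose minimum is wanted lets one MPI_MAX reduction deliver both. A standalone sketch, assuming an initialized communicator:

#include <mpi.h>

/* Min and max of 'val' over 'comm' in one collective: reduce
 * (-val, val) with MPI_MAX, then negate the first result. */
static void minmax_int(int val, MPI_Comm comm, int *min_out, int *max_out)
{
    int send[2], recv[2];

    send[0] = -val;
    send[1] =  val;
    MPI_Allreduce(send, recv, 2, MPI_INT, MPI_MAX, comm);
    *min_out = -recv[0];
    *max_out =  recv[1];
}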
Example #11
void do_force_lowlevel(FILE       *fplog,   gmx_large_int_t step,
                       t_forcerec *fr,      t_inputrec *ir,
                       t_idef     *idef,    t_commrec  *cr,
                       t_nrnb     *nrnb,    gmx_wallcycle_t wcycle,
                       t_mdatoms  *md,
                       t_grpopts  *opts,
                       rvec       x[],      history_t  *hist,
                       rvec       f[],
                       rvec       f_longrange[],
                       gmx_enerdata_t *enerd,
                       t_fcdata   *fcd,
                       gmx_mtop_t     *mtop,
                       gmx_localtop_t *top,
                       gmx_genborn_t *born,
                       t_atomtypes *atype,
                       gmx_bool       bBornRadii,
                       matrix     box,
                       t_lambda   *fepvals,
                       real       *lambda,
                       t_graph    *graph,
                       t_blocka   *excl,
                       rvec       mu_tot[],
                       int        flags,
                       float      *cycles_pme)
{
    int         i, j, status;
    int         donb_flags;
    gmx_bool    bDoEpot, bSepDVDL, bSB;
    int         pme_flags;
    matrix      boxs;
    rvec        box_size;
    real        Vsr, Vlr, Vcorr = 0;
    t_pbc       pbc;
    real        dvdgb;
    char        buf[22];
    double      clam_i, vlam_i;
    real        dvdl_dum[efptNR], dvdl, dvdl_nb[efptNR], lam_i[efptNR];
    real        dvdlsum;

#ifdef GMX_MPI
    double  t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */
#endif

#define PRINT_SEPDVDL(s, v, dvdlambda) if (bSepDVDL) {fprintf(fplog, sepdvdlformat, s, v, dvdlambda); }

    GMX_MPE_LOG(ev_force_start);
    set_pbc(&pbc, fr->ePBC, box);

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i]  = 0;
        dvdl_dum[i] = 0;
    }

    /* Reset box */
    for (i = 0; (i < DIM); i++)
    {
        box_size[i] = box[i][i];
    }

    bSepDVDL = (fr->bSepDVDL && do_per_step(step, ir->nstlog));
    debug_gmx();

    /* do QMMM first if requested */
    if (fr->bQMMM)
    {
        enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr, md);
    }

    if (bSepDVDL)
    {
        fprintf(fplog, "Step %s: non-bonded V and dVdl for node %d:\n",
                gmx_step_str(step, buf), cr->nodeid);
    }

    /* Call the short range functions all in one go. */
    GMX_MPE_LOG(ev_do_fnbf_start);

#ifdef GMX_MPI
    /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/
#define TAKETIME FALSE
    if (TAKETIME)
    {
        MPI_Barrier(cr->mpi_comm_mygroup);
        t0 = MPI_Wtime();
    }
#endif

    if (ir->nwall)
    {
        /* foreign lambda component for walls */
        dvdl = do_walls(ir, fr, box, md, x, f, lambda[efptVDW],
                        enerd->grpp.ener[egLJSR], nrnb);
        PRINT_SEPDVDL("Walls", 0.0, dvdl);
        enerd->dvdl_lin[efptVDW] += dvdl;
    }

    /* If doing GB, reset dvda and calculate the Born radii */
    if (ir->implicit_solvent)
    {
        wallcycle_sub_start(wcycle, ewcsNONBONDED);

        for (i = 0; i < born->nr; i++)
        {
            fr->dvda[i] = 0;
        }

        if (bBornRadii)
        {
            calc_gb_rad(cr, fr, ir, top, atype, x, &(fr->gblist), born, md, nrnb);
        }

        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    where();
    /* We only do non-bonded calculation with group scheme here, the verlet
     * calls are done from do_force_cutsVERLET(). */
    if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED))
    {
        donb_flags = 0;
        /* Add short-range interactions */
        donb_flags |= GMX_NONBONDED_DO_SR;

        if (flags & GMX_FORCE_FORCES)
        {
            donb_flags |= GMX_NONBONDED_DO_FORCE;
        }
        if (flags & GMX_FORCE_ENERGY)
        {
            donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
        }
        if (flags & GMX_FORCE_DO_LR)
        {
            donb_flags |= GMX_NONBONDED_DO_LR;
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
        do_nonbonded(cr, fr, x, f, f_longrange, md, excl,
                     &enerd->grpp, box_size, nrnb,
                     lambda, dvdl_nb, -1, -1, donb_flags);

        /* If we do foreign lambda and we have soft-core interactions
         * we have to recalculate the (non-linear) energies contributions.
         */
        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
        {
            for (i = 0; i < enerd->n_lambda; i++)
            {
                for (j = 0; j < efptNR; j++)
                {
                    lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
                }
                reset_foreign_enerdata(enerd);
                do_nonbonded(cr, fr, x, f, f_longrange, md, excl,
                             &(enerd->foreign_grpp), box_size, nrnb,
                             lam_i, dvdl_dum, -1, -1,
                             (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA);
                sum_epot(&ir->opts, &(enerd->foreign_grpp), enerd->foreign_term);
                enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
            }
        }
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
        where();
    }

    /* If we are doing GB, calculate bonded forces and apply corrections
     * to the solvation forces */
    /* MRS: Eventually, may need to include free energy contribution here! */
    if (ir->implicit_solvent)
    {
        wallcycle_sub_start(wcycle, ewcsBONDED);
        calc_gb_forces(cr, md, born, top, atype, x, f, fr, idef,
                       ir->gb_algorithm, ir->sa_algorithm, nrnb, bBornRadii, &pbc, graph, enerd);
        wallcycle_sub_stop(wcycle, ewcsBONDED);
    }

#ifdef GMX_MPI
    if (TAKETIME)
    {
        t1          = MPI_Wtime();
        fr->t_fnbf += t1-t0;
    }
#endif

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
    }
    else
    {
        enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
    }

    if (fepvals->sc_alpha != 0)

    /* even though the Coulomb part is linear, we already added it, because we
       need to go through the vdw calculation anyway */
    {
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    Vsr = 0;
    if (bSepDVDL)
    {
        for (i = 0; i < enerd->grpp.nener; i++)
        {
            Vsr +=
                (fr->bBHAM ?
                 enerd->grpp.ener[egBHAMSR][i] :
                 enerd->grpp.ener[egLJSR][i])
                + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i];
        }
        dvdlsum = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL];
        PRINT_SEPDVDL("VdW and Coulomb SR particle-p.", Vsr, dvdlsum);
    }
    debug_gmx();

    GMX_MPE_LOG(ev_do_fnbf_finish);

    if (debug)
    {
        pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS);
    }

    /* Shift the coordinates. Must be done before bonded forces and PPPM,
     * but is also necessary for SHAKE and update, therefore it can NOT
     * go when no bonded forces have to be evaluated.
     */

    /* Here sometimes we would not need to shift with NBFonly,
     * but we do so anyhow for consistency of the returned coordinates.
     */
    if (graph)
    {
        shift_self(graph, box, x);
        if (TRICLINIC(box))
        {
            inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
        }
        else
        {
            inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
        }
    }
    /* Check whether we need to do bondeds or correct for exclusions */
    if (fr->bMolPBC &&
        ((flags & GMX_FORCE_BONDED)
         || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype)))
    {
        /* Since all atoms are in the rectangular or triclinic unit-cell,
         * only single box vector shifts (2 in x) are required.
         */
        set_pbc_dd(&pbc, fr->ePBC, cr->dd, TRUE, box);
    }
    debug_gmx();

    if (flags & GMX_FORCE_BONDED)
    {
        GMX_MPE_LOG(ev_calc_bonds_start);

        wallcycle_sub_start(wcycle, ewcsBONDED);
        calc_bonds(fplog, cr->ms,
                   idef, x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd,
                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born,
                   flags,
                   fr->bSepDVDL && do_per_step(step, ir->nstlog), step);

        /* Check if we have to determine energy differences
         * at foreign lambda's.
         */
        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) &&
            idef->ilsort != ilsortNO_FE)
        {
            if (idef->ilsort != ilsortFE_SORTED)
            {
                gmx_incons("The bonded interactions are not sorted for free energy");
            }
            for (i = 0; i < enerd->n_lambda; i++)
            {
                reset_foreign_enerdata(enerd);
                for (j = 0; j < efptNR; j++)
                {
                    lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
                }
                calc_bonds_lambda(fplog, idef, x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md,
                                  fcd, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
                sum_epot(&ir->opts, &(enerd->foreign_grpp), enerd->foreign_term);
                enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
            }
        }
        debug_gmx();
        GMX_MPE_LOG(ev_calc_bonds_finish);
        wallcycle_sub_stop(wcycle, ewcsBONDED);
    }

    where();

    *cycles_pme = 0;
    if (EEL_FULL(fr->eeltype))
    {
        bSB = (ir->nwall == 2);
        if (bSB)
        {
            copy_mat(box, boxs);
            svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
            box_size[ZZ] *= ir->wall_ewald_zfac;
        }

        clear_mat(fr->vir_el_recip);

        if (fr->bEwald)
        {
            Vcorr = 0;
            dvdl  = 0;

            /* With the Verlet scheme exclusion forces are calculated
             * in the non-bonded kernel.
             */
            /* The TPI molecule does not have exclusions with the rest
             * of the system and no intra-molecular PME grid contributions
             * will be calculated in gmx_pme_calc_energy.
             */
            if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) ||
                ir->ewald_geometry != eewg3D ||
                ir->epsilon_surface != 0)
            {
                int nthreads, t;

                wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);

                if (fr->n_tpi > 0)
                {
                    gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
                }

                nthreads = gmx_omp_nthreads_get(emntBonded);
#pragma omp parallel for num_threads(nthreads) schedule(static)
                for (t = 0; t < nthreads; t++)
                {
                    int     s, e, i;
                    rvec   *fnv;
                    tensor *vir;
                    real   *Vcorrt, *dvdlt;
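                    /* Thread 0 accumulates directly into the global
                     * outputs; the other threads use per-thread scratch
                     * buffers that reduce_thread_forces() (Example #1)
                     * sums back in afterwards. */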
                    if (t == 0)
                    {
                        fnv    = fr->f_novirsum;
                        vir    = &fr->vir_el_recip;
                        Vcorrt = &Vcorr;
                        dvdlt  = &dvdl;
                    }
                    else
                    {
                        fnv    = fr->f_t[t].f;
                        vir    = &fr->f_t[t].vir;
                        Vcorrt = &fr->f_t[t].Vcorr;
                        dvdlt  = &fr->f_t[t].dvdl[efptCOUL];
                        for (i = 0; i < fr->natoms_force; i++)
                        {
                            clear_rvec(fnv[i]);
                        }
                        clear_mat(*vir);
                    }
                    *dvdlt  = 0;
                    *Vcorrt =
                        ewald_LRcorrection(fplog,
                                           fr->excl_load[t], fr->excl_load[t+1],
                                           cr, t, fr,
                                           md->chargeA,
                                           md->nChargePerturbed ? md->chargeB : NULL,
                                           ir->cutoff_scheme != ecutsVERLET,
                                           excl, x, bSB ? boxs : box, mu_tot,
                                           ir->ewald_geometry,
                                           ir->epsilon_surface,
                                           fnv, *vir,
                                           lambda[efptCOUL], dvdlt);
                }
                if (nthreads > 1)
                {
                    reduce_thread_forces(fr->natoms_force, fr->f_novirsum,
                                         fr->vir_el_recip,
                                         &Vcorr, efptCOUL, &dvdl,
                                         nthreads, fr->f_t);
                }

                wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
            }

            if (fr->n_tpi == 0)
            {
                Vcorr += ewald_charge_correction(cr, fr, lambda[efptCOUL], box,
                                                 &dvdl, fr->vir_el_recip);
            }

            PRINT_SEPDVDL("Ewald excl./charge/dip. corr.", Vcorr, dvdl);
            enerd->dvdl_lin[efptCOUL] += dvdl;
        }

        status = 0;
        Vlr    = 0;
        dvdl   = 0;
        switch (fr->eeltype)
        {
            case eelPME:
            case eelPMESWITCH:
            case eelPMEUSER:
            case eelPMEUSERSWITCH:
            case eelP3M_AD:
                if (cr->duty & DUTY_PME)
                {
                    assert(fr->n_tpi >= 0);
                    if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED))
                    {
                        pme_flags = GMX_PME_SPREAD_Q | GMX_PME_SOLVE;
                        if (flags & GMX_FORCE_FORCES)
                        {
                            pme_flags |= GMX_PME_CALC_F;
                        }
                        if (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY))
                        {
                            pme_flags |= GMX_PME_CALC_ENER_VIR;
                        }
                        if (fr->n_tpi > 0)
                        {
                            /* We don't calculate f, but we do want the potential */
                            pme_flags |= GMX_PME_CALC_POT;
                        }
                        wallcycle_start(wcycle, ewcPMEMESH);
                        status = gmx_pme_do(fr->pmedata,
                                            md->start, md->homenr - fr->n_tpi,
                                            x, fr->f_novirsum,
                                            md->chargeA, md->chargeB,
                                            bSB ? boxs : box, cr,
                                            DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0,
                                            DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0,
                                            nrnb, wcycle,
                                            fr->vir_el_recip, fr->ewaldcoeff,
                                            &Vlr, lambda[efptCOUL], &dvdl,
                                            pme_flags);
                        *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH);

                        /* We should try to do as little computation after
                         * this as possible, because parallel PME synchronizes
                         * the nodes, so we want all load imbalance of the rest
                         * of the force calculation to be before the PME call.
                         * DD load balancing is done on the whole time of
                         * the force call (without PME).
                         */
                    }
                    if (fr->n_tpi > 0)
                    {
                        /* Determine the PME grid energy of the test molecule
                         * with the PME grid potential of the other charges.
                         */
                        gmx_pme_calc_energy(fr->pmedata, fr->n_tpi,
                                            x + md->homenr - fr->n_tpi,
                                            md->chargeA + md->homenr - fr->n_tpi,
                                            &Vlr);
                    }
                    PRINT_SEPDVDL("PME mesh", Vlr, dvdl);
                }
                break;
            case eelEWALD:
                Vlr = do_ewald(fplog, FALSE, ir, x, fr->f_novirsum,
                               md->chargeA, md->chargeB,
                               box_size, cr, md->homenr,
                               fr->vir_el_recip, fr->ewaldcoeff,
                               lambda[efptCOUL], &dvdl, fr->ewald_table);
                PRINT_SEPDVDL("Ewald long-range", Vlr, dvdl);
                break;
            default:
                gmx_fatal(FARGS, "No such electrostatics method implemented %s",
                          eel_names[fr->eeltype]);
        }
        if (status != 0)
        {
            gmx_fatal(FARGS, "Error %d in long range electrostatics routine %s",
                      status, EELTYPE(fr->eeltype));
        }
        /* Note that with separate PME nodes we get the real energies later */
        enerd->dvdl_lin[efptCOUL] += dvdl;
        enerd->term[F_COUL_RECIP]  = Vlr + Vcorr;
        if (debug)
        {
            fprintf(debug, "Vlr = %g, Vcorr = %g, Vlr_corr = %g\n",
                    Vlr, Vcorr, enerd->term[F_COUL_RECIP]);
            pr_rvecs(debug, 0, "vir_el_recip after corr", fr->vir_el_recip, DIM);
            pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS);
        }
    }
    else
    {
        if (EEL_RF(fr->eeltype))
        {
            /* With the Verlet scheme exclusion forces are calculated
             * in the non-bonded kernel.
             */
            if (ir->cutoff_scheme != ecutsVERLET && fr->eeltype != eelRF_NEC)
            {
                dvdl                   = 0;
                enerd->term[F_RF_EXCL] =
                    RF_excl_correction(fplog, fr, graph, md, excl, x, f,
                                       fr->fshift, &pbc, lambda[efptCOUL], &dvdl);
            }

            enerd->dvdl_lin[efptCOUL] += dvdl;
            PRINT_SEPDVDL("RF exclusion correction",
                          enerd->term[F_RF_EXCL], dvdl);
        }
    }
    where();
    debug_gmx();

    if (debug)
    {
        print_nrnb(debug, nrnb);
    }
    debug_gmx();

#ifdef GMX_MPI
    if (TAKETIME)
    {
        t2 = MPI_Wtime();
        MPI_Barrier(cr->mpi_comm_mygroup);
        t3          = MPI_Wtime();
        fr->t_wait += t3-t2;
        if (fr->timesteps == 11)
        {
            fprintf(stderr, "* PP load balancing info: node %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n",
                    cr->nodeid, gmx_step_str(fr->timesteps, buf),
                    100*fr->t_wait/(fr->t_wait+fr->t_fnbf),
                    (fr->t_fnbf+fr->t_wait)/fr->t_fnbf);
        }
        fr->timesteps++;
    }
#endif

    if (debug)
    {
        pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS);
    }

    GMX_MPE_LOG(ev_force_finish);

}
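/* A minimal standalone sketch (not GROMACS code): how the load-balancing
 * numbers printed in the TAKETIME block above are derived from the two
 * accumulated timers, t_fnbf (time spent in the force kernels) and
 * t_wait (time spent in the MPI barrier).
 */
#include <stdio.h>

static void print_pp_load(double t_fnbf, double t_wait)
{
    /* relative wait time: fraction of the force+wait window spent idle */
    double rel_wait = 100.0*t_wait/(t_wait + t_fnbf);
    /* load string value: the full window normalized by pure force time */
    double load     = (t_fnbf + t_wait)/t_fnbf;

    printf("rel wait time=%3.0f%% , load string value: %7.2f\n",
           rel_wait, load);
}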
/* Set CPU affinity. Can be important for performance.
   On some systems (e.g. Cray) CPU affinity is set by default.
   But the default assignment doesn't work (well) when only some
   ranks have threads, which causes very low performance.
   External tools have cumbersome syntax for setting affinity
   in the case that only some ranks have threads.
   Thus it is important that GROMACS sets the affinity internally
   if only PME is using threads.
 */
void
gmx_set_thread_affinity(FILE                *fplog,
                        const t_commrec     *cr,
                        const gmx_hw_opt_t  *hw_opt,
                        const gmx_hw_info_t *hwinfo)
{
    int        nth_affinity_set, thread0_id_node,
               nthread_local, nthread_node;
    int        offset;
    int *      localityOrder = nullptr;
    int        rc;

    if (hw_opt->thread_affinity == threadaffOFF)
    {
        /* Nothing to do */
        return;
    }

    /* If the tMPI thread affinity setting is not supported, encourage the
     * user to report it, as it's either a bug or an exotic platform which
     * we might want to support. */
    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
    {
        /* we know Mac OS & BlueGene do not support setting thread affinity, so there's
           no point in warning the user in that case. In any other case
           the user might be able to do something about it. */
#if !defined(__APPLE__) && !defined(__bg__)
        md_print_warn(cr, fplog,
                      "NOTE: Cannot set thread affinities on the current platform.\n");
#endif  /* !__APPLE__ && !__bg__ */
        return;
    }

    /* threads on this MPI process or TMPI thread */
    if (cr->duty & DUTY_PP)
    {
        nthread_local = gmx_omp_nthreads_get(emntNonbonded);
    }
    else
    {
        nthread_local = gmx_omp_nthreads_get(emntPME);
    }

    /* map the current process to cores */
    thread0_id_node = 0;
    nthread_node    = nthread_local;
#if GMX_MPI
    if (PAR(cr) || MULTISIM(cr))
    {
        /* We need to determine a scan of the thread counts in this
         * compute node.
         */
        MPI_Comm comm_intra;

        MPI_Comm_split(MPI_COMM_WORLD,
                       gmx_physicalnode_id_hash(), cr->rank_intranode,
                       &comm_intra);
        MPI_Scan(&nthread_local, &thread0_id_node, 1, MPI_INT, MPI_SUM, comm_intra);
        /* MPI_Scan is inclusive, but here we need exclusive */
        thread0_id_node -= nthread_local;
        /* Get the total number of threads on this physical node */
        MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
        MPI_Comm_free(&comm_intra);
    }
#endif

    if (hw_opt->thread_affinity == threadaffAUTO &&
        nthread_node != hwinfo->nthreads_hw_avail)
    {
        if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
        {
            md_print_warn(cr, fplog,
                          "NOTE: The number of threads is not equal to the number of (logical) cores\n"
                          "      and the -pin option is set to auto: will not pin thread to cores.\n"
                          "      This can lead to significant performance degradation.\n"
                          "      Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
        }

        return;
    }

    offset = 0;
    if (hw_opt->core_pinning_offset != 0)
    {
        offset = hw_opt->core_pinning_offset;
        md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
    }

    int core_pinning_stride = hw_opt->core_pinning_stride;
    rc = get_thread_affinity_layout(fplog, cr, hwinfo,
                                    nthread_node,
                                    offset, &core_pinning_stride,
                                    &localityOrder);
    gmx::scoped_guard_sfree localityOrderGuard(localityOrder);

    if (rc != 0)
    {
        /* Incompatible layout, don't pin, warning was already issued */
        return;
    }

    /* Set the per-thread affinity. In order to be able to check the success
     * of affinity settings, we will set nth_affinity_set to 1 on threads
     * where the affinity setting succeeded and to 0 where it failed.
     * Reducing these 0/1 values over the threads will give the total number
     * of threads on which we succeeded.
     */

    // To avoid warnings from the static analyzer we initialize nth_affinity_set
    // to zero outside the OpenMP block, and then add to it inside the block.
    // The value will still always be 0 or 1 from each thread.
    nth_affinity_set = 0;
#pragma omp parallel num_threads(nthread_local) reduction(+:nth_affinity_set)
    {
        try
        {
            int      thread_id, thread_id_node;
            int      index, core;
            gmx_bool setaffinity_ret;

            thread_id      = gmx_omp_get_thread_num();
            thread_id_node = thread0_id_node + thread_id;
            index          = offset + thread_id_node*core_pinning_stride;
            if (localityOrder != nullptr)
            {
                core = localityOrder[index];
            }
            else
            {
                core = index;
            }

            setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);

            /* store the per-thread success-values of the setaffinity */
            nth_affinity_set += (setaffinity_ret == 0);

            if (debug)
            {
                fprintf(debug, "On rank %2d, thread %2d, index %2d, core %2d the affinity setting returned %d\n",
                        cr->nodeid, gmx_omp_get_thread_num(), index, core, setaffinity_ret);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (nth_affinity_set > nthread_local)
    {
        char msg[STRLEN];

        sprintf(msg, "Looks like we have set affinity for more threads than "
                "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
        gmx_incons(msg);
    }
    else
    {
        /* check & warn if some threads failed to set their affinities */
        const bool allAffinitiesSet = (nth_affinity_set == nthread_local);
        if (!allAffinitiesSet)
        {
            char sbuf1[STRLEN], sbuf2[STRLEN];

            /* sbuf1 contains rank info, while sbuf2 contains OpenMP thread info */
            sbuf1[0] = sbuf2[0] = '\0';
            /* Only add rank info if we have more than one rank. */
            if (cr->nnodes > 1)
            {
#if GMX_MPI
#if GMX_THREAD_MPI
                sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
#else           /* GMX_LIB_MPI */
                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
#endif
#endif          /* GMX_MPI */
            }

            if (nthread_local > 1)
            {
                sprintf(sbuf2, "for %d/%d thread%s ",
                        nthread_local - nth_affinity_set, nthread_local,
                        nthread_local > 1 ? "s" : "");
            }

            fprintf(stderr, "NOTE: %sAffinity setting %sfailed.\n", sbuf1, sbuf2);
        }
        if (invalidWithinSimulation(cr, !allAffinitiesSet))
        {
            md_print_warn(cr, fplog,
                          "NOTE: Thread affinity setting failed. This can cause performance degradation.\n"
                          "      If you think your settings are correct, ask on the gmx-users list.\n");
        }
    }
}
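/* A minimal standalone sketch (assuming an initialized MPI environment
 * and one value per rank): how the inclusive MPI_Scan used above is
 * turned into the exclusive scan that gives each rank the id of its
 * first thread.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int nthread_local = 4; /* hypothetical per-rank thread count */
    int thread0_id;

    MPI_Init(&argc, &argv);
    MPI_Scan(&nthread_local, &thread0_id, 1, MPI_INT, MPI_SUM,
             MPI_COMM_WORLD);
    /* MPI_Scan is inclusive, so the result contains our own count;
     * subtracting it yields the sum over all lower ranks only. */
    thread0_id -= nthread_local;
    printf("first thread id on this rank: %d\n", thread0_id);
    MPI_Finalize();
    return 0;
}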
Beispiel #13
0
void atoms2md(const gmx_mtop_t *mtop, const t_inputrec *ir,
              int nindex, const int *index,
              int homenr,
              t_mdatoms *md)
{
    gmx_bool              bLJPME;
    gmx_mtop_atomlookup_t alook;
    int                   i;
    const t_grpopts      *opts;
    const gmx_groups_t   *groups;
    int                   nthreads gmx_unused;
    const real            oneOverSix = 1.0 / 6.0;

    bLJPME = EVDW_PME(ir->vdwtype);

    opts = &ir->opts;

    groups = &mtop->groups;

    /* Index==NULL indicates no DD (unless we have a DD node with no
     * atoms), so also check for homenr. This should be
     * signaled properly with an extra parameter or nindex==-1.
     */
    if (index == NULL && (homenr > 0))
    {
        md->nr = mtop->natoms;
    }
    else
    {
        md->nr = nindex;
    }

    if (md->nr > md->nalloc)
    {
        md->nalloc = over_alloc_dd(md->nr);

        if (md->nMassPerturbed)
        {
            srenew(md->massA, md->nalloc);
            srenew(md->massB, md->nalloc);
        }
        srenew(md->massT, md->nalloc);
        srenew(md->invmass, md->nalloc);
        srenew(md->chargeA, md->nalloc);
        srenew(md->typeA, md->nalloc);
        if (md->nPerturbed)
        {
            srenew(md->chargeB, md->nalloc);
            srenew(md->typeB, md->nalloc);
        }
        if (bLJPME)
        {
            srenew(md->sqrt_c6A, md->nalloc);
            srenew(md->sigmaA, md->nalloc);
            srenew(md->sigma3A, md->nalloc);
            if (md->nPerturbed)
            {
                srenew(md->sqrt_c6B, md->nalloc);
                srenew(md->sigmaB, md->nalloc);
                srenew(md->sigma3B, md->nalloc);
            }
        }
        srenew(md->ptype, md->nalloc);
        if (opts->ngtc > 1)
        {
            srenew(md->cTC, md->nalloc);
            /* We always copy cTC with domain decomposition */
        }
        srenew(md->cENER, md->nalloc);
        if (opts->ngacc > 1)
        {
            srenew(md->cACC, md->nalloc);
        }
        if (opts->nFreeze &&
            (opts->ngfrz > 1 ||
             opts->nFreeze[0][XX] || opts->nFreeze[0][YY] || opts->nFreeze[0][ZZ]))
        {
            srenew(md->cFREEZE, md->nalloc);
        }
        if (md->bVCMgrps)
        {
            srenew(md->cVCM, md->nalloc);
        }
        if (md->bOrires)
        {
            srenew(md->cORF, md->nalloc);
        }
        if (md->nPerturbed)
        {
            srenew(md->bPerturbed, md->nalloc);
        }

        /* Note that these user t_mdatoms array pointers are NULL
         * when there is only one group present.
         * Therefore, when adding code, the user should use something like:
         * gprnrU1 = (md->cU1==NULL ? 0 : md->cU1[localatindex])
         */
        if (mtop->groups.grpnr[egcUser1] != NULL)
        {
            srenew(md->cU1, md->nalloc);
        }
        if (mtop->groups.grpnr[egcUser2] != NULL)
        {
            srenew(md->cU2, md->nalloc);
        }

        if (ir->bQMMM)
        {
            srenew(md->bQM, md->nalloc);
        }
        if (ir->bAdress)
        {
            srenew(md->wf, md->nalloc);
            srenew(md->tf_table_index, md->nalloc);
        }
    }

    alook = gmx_mtop_atomlookup_init(mtop);

    // cppcheck-suppress unreadVariable
    nthreads = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nthreads) schedule(static)
    for (i = 0; i < md->nr; i++)
    {
        try
        {
            int      g, ag;
            real     mA, mB, fac;
            real     c6, c12;
            t_atom  *atom;

            if (index == NULL)
            {
                ag = i;
            }
            else
            {
                ag   = index[i];
            }
            gmx_mtop_atomnr_to_atom(alook, ag, &atom);

            if (md->cFREEZE)
            {
                md->cFREEZE[i] = ggrpnr(groups, egcFREEZE, ag);
            }
            if (EI_ENERGY_MINIMIZATION(ir->eI))
            {
                /* Displacement is proportional to F, masses used for constraints */
                mA = 1.0;
                mB = 1.0;
            }
            else if (ir->eI == eiBD)
            {
                /* With BD the physical masses are irrelevant.
                 * To keep the code simple we use most of the normal MD code path
                 * for BD. Thus for constraining the masses should be proportional
                 * to the friction coefficient. We set the absolute value such that
                 * m/2<(dx/dt)^2> = m/2*2kT/fric*dt = kT/2 => m=fric*dt/2
                 * Then if we set the (meaningless) velocity to v=dx/dt, we get the
                 * correct kinetic energy and temperature using the usual code path.
                 * Thus with BD v*dt will give the displacement and the reported
                 * temperature can signal bad integration (too large time step).
                 */
                if (ir->bd_fric > 0)
                {
                    mA = 0.5*ir->bd_fric*ir->delta_t;
                    mB = 0.5*ir->bd_fric*ir->delta_t;
                }
                else
                {
                    /* The friction coefficient is mass/tau_t */
                    fac = ir->delta_t/opts->tau_t[md->cTC ? groups->grpnr[egcTC][ag] : 0];
                    mA  = 0.5*atom->m*fac;
                    mB  = 0.5*atom->mB*fac;
                }
            }
            else
            {
                mA = atom->m;
                mB = atom->mB;
            }
            if (md->nMassPerturbed)
            {
                md->massA[i]  = mA;
                md->massB[i]  = mB;
            }
            md->massT[i]    = mA;
            if (mA == 0.0)
            {
                md->invmass[i]    = 0;
            }
            else if (md->cFREEZE)
            {
                g = md->cFREEZE[i];
                if (opts->nFreeze[g][XX] && opts->nFreeze[g][YY] && opts->nFreeze[g][ZZ])
                {
                    /* Set the mass of completely frozen particles to ALMOST_ZERO instead of 0
                     * to avoid div by zero in lincs or shake.
                     * Note that constraints can still move a partially frozen particle.
                     */
                    md->invmass[i]  = ALMOST_ZERO;
                }
                else
                {
                    md->invmass[i]  = 1.0/mA;
                }
            }
            else
            {
                md->invmass[i]    = 1.0/mA;
            }
            md->chargeA[i]      = atom->q;
            md->typeA[i]        = atom->type;
            if (bLJPME)
            {
                c6                = mtop->ffparams.iparams[atom->type*(mtop->ffparams.atnr+1)].lj.c6;
                c12               = mtop->ffparams.iparams[atom->type*(mtop->ffparams.atnr+1)].lj.c12;
                md->sqrt_c6A[i]   = sqrt(c6);
                if (c6 == 0.0 || c12 == 0)
                {
                    md->sigmaA[i] = 1.0;
                }
                else
                {
                    md->sigmaA[i] = pow(c12/c6, oneOverSix);
                }
                md->sigma3A[i]    = 1/(md->sigmaA[i]*md->sigmaA[i]*md->sigmaA[i]);
            }
            if (md->nPerturbed)
            {
                md->bPerturbed[i] = PERTURBED(*atom);
                md->chargeB[i]    = atom->qB;
                md->typeB[i]      = atom->typeB;
                if (bLJPME)
                {
                    c6                = mtop->ffparams.iparams[atom->typeB*(mtop->ffparams.atnr+1)].lj.c6;
                    c12               = mtop->ffparams.iparams[atom->typeB*(mtop->ffparams.atnr+1)].lj.c12;
                    md->sqrt_c6B[i]   = sqrt(c6);
                    if (c6 == 0.0 || c12 == 0)
                    {
                        md->sigmaB[i] = 1.0;
                    }
                    else
                    {
                        md->sigmaB[i] = pow(c12/c6, oneOverSix);
                    }
                    md->sigma3B[i]    = 1/(md->sigmaB[i]*md->sigmaB[i]*md->sigmaB[i]);
                }
            }
            md->ptype[i]    = atom->ptype;
            if (md->cTC)
            {
                md->cTC[i]    = groups->grpnr[egcTC][ag];
            }
            md->cENER[i]    =
                (groups->grpnr[egcENER] ? groups->grpnr[egcENER][ag] : 0);
            if (md->cACC)
            {
                md->cACC[i]   = groups->grpnr[egcACC][ag];
            }
            if (md->cVCM)
            {
                md->cVCM[i]       = groups->grpnr[egcVCM][ag];
            }
            if (md->cORF)
            {
                md->cORF[i]       = groups->grpnr[egcORFIT][ag];
            }

            if (md->cU1)
            {
                md->cU1[i]        = groups->grpnr[egcUser1][ag];
            }
            if (md->cU2)
            {
                md->cU2[i]        = groups->grpnr[egcUser2][ag];
            }

            if (ir->bQMMM)
            {
                if (groups->grpnr[egcQMMM] == 0 ||
                    groups->grpnr[egcQMMM][ag] < groups->grps[egcQMMM].nr-1)
                {
                    md->bQM[i]      = TRUE;
                }
                else
                {
                    md->bQM[i]      = FALSE;
                }
            }
            /* Initialize AdResS weighting functions to adressw */
            if (ir->bAdress)
            {
                md->wf[i]           = 1.0;
                /* if no tf table groups specified, use default table */
                md->tf_table_index[i] = DEFAULT_TF_TABLE;
                if (ir->adress->n_tf_grps > 0)
                {
                    /* if tf table groups are specified, tf is only applied to those energy groups */
                    md->tf_table_index[i] = NO_TF_TABLE;
                    /* check whether the atom is in one of the relevant energy groups and assign a table index */
                    for (g = 0; g < ir->adress->n_tf_grps; g++)
                    {
                        if (md->cENER[i] == ir->adress->tf_table_index[g])
                        {
                            md->tf_table_index[i] = g;
                        }
                    }
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    gmx_mtop_atomlookup_destroy(alook);

    md->homenr = homenr;
    md->lambda = 0;
}
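/* A minimal standalone sketch (assuming the standard Lennard-Jones form
 * V(r) = C12/r^12 - C6/r^6 with C6 = 4*eps*sigma^6 and
 * C12 = 4*eps*sigma^12): recovering sigma (and epsilon) from the C6/C12
 * pair parameters, as atoms2md does per atom for LJ-PME above.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double c6  = 2.6187e-3; /* hypothetical example values */
    double c12 = 2.6307e-6;
    double sigma, eps;

    if (c6 == 0.0 || c12 == 0.0)
    {
        sigma = 1.0; /* same fallback as in atoms2md */
        eps   = 0.0;
    }
    else
    {
        sigma = pow(c12/c6, 1.0/6.0);
        eps   = c6*c6/(4.0*c12);
    }
    printf("sigma = %g, epsilon = %g\n", sigma, eps);
    return 0;
}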
Beispiel #14
0
void do_force_lowlevel(t_forcerec *fr,      t_inputrec *ir,
                       t_idef     *idef,    t_commrec  *cr,
                       t_nrnb     *nrnb,    gmx_wallcycle_t wcycle,
                       t_mdatoms  *md,
                       rvec       x[],      history_t  *hist,
                       rvec       f[],
                       rvec       f_longrange[],
                       gmx_enerdata_t *enerd,
                       t_fcdata   *fcd,
                       gmx_localtop_t *top,
                       gmx_genborn_t *born,
                       gmx_bool       bBornRadii,
                       matrix     box,
                       t_lambda   *fepvals,
                       real       *lambda,
                       t_graph    *graph,
                       t_blocka   *excl,
                       rvec       mu_tot[],
                       int        flags,
                       float      *cycles_pme)
{
    int         i, j;
    int         donb_flags;
    gmx_bool    bSB;
    int         pme_flags;
    matrix      boxs;
    rvec        box_size;
    t_pbc       pbc;
    real        dvdl_dum[efptNR], dvdl_nb[efptNR];

#ifdef GMX_MPI
    double  t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */
#endif

    set_pbc(&pbc, fr->ePBC, box);

    /* reset free energy components */
    for (i = 0; i < efptNR; i++)
    {
        dvdl_nb[i]  = 0;
        dvdl_dum[i] = 0;
    }

    /* Reset box */
    for (i = 0; (i < DIM); i++)
    {
        box_size[i] = box[i][i];
    }

    debug_gmx();

    /* do QMMM first if requested */
    if (fr->bQMMM)
    {
        enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr);
    }

    /* Call the short range functions all in one go. */

#ifdef GMX_MPI
    /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/
#define TAKETIME FALSE
    if (TAKETIME)
    {
        MPI_Barrier(cr->mpi_comm_mygroup);
        t0 = MPI_Wtime();
    }
#endif

    if (ir->nwall)
    {
        /* foreign lambda component for walls */
        real dvdl_walls = do_walls(ir, fr, box, md, x, f, lambda[efptVDW],
                                   enerd->grpp.ener[egLJSR], nrnb);
        enerd->dvdl_lin[efptVDW] += dvdl_walls;
    }

    /* If doing GB, reset dvda and calculate the Born radii */
    if (ir->implicit_solvent)
    {
        wallcycle_sub_start(wcycle, ewcsNONBONDED);

        for (i = 0; i < born->nr; i++)
        {
            fr->dvda[i] = 0;
        }

        if (bBornRadii)
        {
            calc_gb_rad(cr, fr, ir, top, x, &(fr->gblist), born, md, nrnb);
        }

        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

    where();
    /* We only do non-bonded calculation with group scheme here, the verlet
     * calls are done from do_force_cutsVERLET(). */
    if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED))
    {
        donb_flags = 0;
        /* Add short-range interactions */
        donb_flags |= GMX_NONBONDED_DO_SR;

        /* Currently all group scheme kernels always calculate (shift-)forces */
        if (flags & GMX_FORCE_FORCES)
        {
            donb_flags |= GMX_NONBONDED_DO_FORCE;
        }
        if (flags & GMX_FORCE_VIRIAL)
        {
            donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
        }
        if (flags & GMX_FORCE_ENERGY)
        {
            donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
        }
        if (flags & GMX_FORCE_DO_LR)
        {
            donb_flags |= GMX_NONBONDED_DO_LR;
        }

        wallcycle_sub_start(wcycle, ewcsNONBONDED);
        do_nonbonded(fr, x, f, f_longrange, md, excl,
                     &enerd->grpp, nrnb,
                     lambda, dvdl_nb, -1, -1, donb_flags);

        /* If we do foreign lambda and we have soft-core interactions
         * we have to recalculate the (non-linear) energies contributions.
         */
        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
        {
            for (i = 0; i < enerd->n_lambda; i++)
            {
                real lam_i[efptNR];

                for (j = 0; j < efptNR; j++)
                {
                    lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
                }
                reset_foreign_enerdata(enerd);
                do_nonbonded(fr, x, f, f_longrange, md, excl,
                             &(enerd->foreign_grpp), nrnb,
                             lam_i, dvdl_dum, -1, -1,
                             (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA);
                sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
                enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
            }
        }
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
        where();
    }

    /* If we are doing GB, calculate bonded forces and apply corrections
     * to the solvation forces */
    /* MRS: Eventually, many need to include free energy contribution here! */
    if (ir->implicit_solvent)
    {
        wallcycle_sub_start(wcycle, ewcsLISTED);
        calc_gb_forces(cr, md, born, top, x, f, fr, idef,
                       ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd);
        wallcycle_sub_stop(wcycle, ewcsLISTED);
    }

#ifdef GMX_MPI
    if (TAKETIME)
    {
        t1          = MPI_Wtime();
        fr->t_fnbf += t1-t0;
    }
#endif

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
    }
    else
    {
        enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
    }

    if (fepvals->sc_alpha != 0)
    {
        /* Even though the Coulomb part is linear, we already added it,
         * because we need to go through the vdw calculation anyway.
         */
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }

    debug_gmx();


    if (debug)
    {
        pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS);
    }

    /* Shift the coordinates. Must be done before listed forces and PPPM,
     * but is also necessary for SHAKE and update, therefore it can NOT
     * go when no listed forces have to be evaluated.
     *
     * The shifting and PBC code is deliberately not timed, since with
     * the Verlet scheme it only takes non-zero time with triclinic
     * boxes, and even then the time is around a factor of 100 less
     * than the next smallest counter.
     */


    /* Sometimes we would not need to shift here (e.g. with NBFonly),
     * but we do so anyhow for consistency of the returned coordinates.
     */
    if (graph)
    {
        shift_self(graph, box, x);
        if (TRICLINIC(box))
        {
            inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
        }
        else
        {
            inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
        }
    }
    /* Check whether we need to do listed interactions or correct for exclusions */
    if (fr->bMolPBC &&
        ((flags & GMX_FORCE_LISTED)
         || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)))
    {
        /* TODO There are no electrostatics methods that require this
           transformation, when using the Verlet scheme, so update the
           above conditional. */
        /* Since all atoms are in the rectangular or triclinic unit-cell,
         * only single box vector shifts (2 in x) are required.
         */
        set_pbc_dd(&pbc, fr->ePBC, cr->dd, TRUE, box);
    }
    debug_gmx();

    do_force_listed(wcycle, box, ir->fepvals, cr->ms,
                    idef, (const rvec *) x, hist, f, fr,
                    &pbc, graph, enerd, nrnb, lambda, md, fcd,
                    DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL,
                    flags);

    where();

    *cycles_pme = 0;
    clear_mat(fr->vir_el_recip);
    clear_mat(fr->vir_lj_recip);

    /* Do long-range electrostatics and/or LJ-PME, including related short-range
     * corrections.
     */
    if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))
    {
        int  status            = 0;
        real Vlr_q             = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0;
        real dvdl_long_range_q = 0, dvdl_long_range_lj = 0;

        bSB = (ir->nwall == 2);
        if (bSB)
        {
            copy_mat(box, boxs);
            svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
            box_size[ZZ] *= ir->wall_ewald_zfac;
        }

        if (EEL_PME_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype))
        {
            real dvdl_long_range_correction_q   = 0;
            real dvdl_long_range_correction_lj  = 0;
            /* With the Verlet scheme exclusion forces are calculated
             * in the non-bonded kernel.
             */
            /* The TPI molecule does not have exclusions with the rest
             * of the system and no intra-molecular PME grid
             * contributions will be calculated in
             * gmx_pme_calc_energy.
             */
            if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) ||
                ir->ewald_geometry != eewg3D ||
                ir->epsilon_surface != 0)
            {
                int nthreads, t;

                wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);

                if (fr->n_tpi > 0)
                {
                    gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
                }

                nthreads = gmx_omp_nthreads_get(emntBonded);
#pragma omp parallel for num_threads(nthreads) schedule(static)
                for (t = 0; t < nthreads; t++)
                {
                    int     i;
                    rvec   *fnv;
                    tensor *vir_q, *vir_lj;
                    real   *Vcorrt_q, *Vcorrt_lj, *dvdlt_q, *dvdlt_lj;
                    if (t == 0)
                    {
                        fnv       = fr->f_novirsum;
                        vir_q     = &fr->vir_el_recip;
                        vir_lj    = &fr->vir_lj_recip;
                        Vcorrt_q  = &Vcorr_q;
                        Vcorrt_lj = &Vcorr_lj;
                        dvdlt_q   = &dvdl_long_range_correction_q;
                        dvdlt_lj  = &dvdl_long_range_correction_lj;
                    }
                    else
                    {
                        fnv       = fr->f_t[t].f;
                        vir_q     = &fr->f_t[t].vir_q;
                        vir_lj    = &fr->f_t[t].vir_lj;
                        Vcorrt_q  = &fr->f_t[t].Vcorr_q;
                        Vcorrt_lj = &fr->f_t[t].Vcorr_lj;
                        dvdlt_q   = &fr->f_t[t].dvdl[efptCOUL];
                        dvdlt_lj  = &fr->f_t[t].dvdl[efptVDW];
                        for (i = 0; i < fr->natoms_force; i++)
                        {
                            clear_rvec(fnv[i]);
                        }
                        clear_mat(*vir_q);
                        clear_mat(*vir_lj);
                    }
                    *dvdlt_q  = 0;
                    *dvdlt_lj = 0;

                    ewald_LRcorrection(fr->excl_load[t], fr->excl_load[t+1],
                                       cr, t, fr,
                                       md->chargeA, md->chargeB,
                                       md->sqrt_c6A, md->sqrt_c6B,
                                       md->sigmaA, md->sigmaB,
                                       md->sigma3A, md->sigma3B,
                                       md->nChargePerturbed || md->nTypePerturbed,
                                       ir->cutoff_scheme != ecutsVERLET,
                                       excl, x, bSB ? boxs : box, mu_tot,
                                       ir->ewald_geometry,
                                       ir->epsilon_surface,
                                       fnv, *vir_q, *vir_lj,
                                       Vcorrt_q, Vcorrt_lj,
                                       lambda[efptCOUL], lambda[efptVDW],
                                       dvdlt_q, dvdlt_lj);
                }
                if (nthreads > 1)
                {
                    reduce_thread_forces(fr->natoms_force, fr->f_novirsum,
                                         fr->vir_el_recip, fr->vir_lj_recip,
                                         &Vcorr_q, &Vcorr_lj,
                                         &dvdl_long_range_correction_q,
                                         &dvdl_long_range_correction_lj,
                                         nthreads, fr->f_t);
                }
                wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
            }

            if (EEL_PME_EWALD(fr->eeltype) && fr->n_tpi == 0)
            {
                /* This is not in a subcounter because it takes a
                   negligible and constant-sized amount of time */
                Vcorr_q += ewald_charge_correction(cr, fr, lambda[efptCOUL], box,
                                                   &dvdl_long_range_correction_q,
                                                   fr->vir_el_recip);
            }

            enerd->dvdl_lin[efptCOUL] += dvdl_long_range_correction_q;
            enerd->dvdl_lin[efptVDW]  += dvdl_long_range_correction_lj;

            if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype)) && (cr->duty & DUTY_PME))
            {
                /* Do reciprocal PME for Coulomb and/or LJ. */
                assert(fr->n_tpi >= 0);
                if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED))
                {
                    pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE;
                    if (EEL_PME(fr->eeltype))
                    {
                        pme_flags     |= GMX_PME_DO_COULOMB;
                    }
                    if (EVDW_PME(fr->vdwtype))
                    {
                        pme_flags |= GMX_PME_DO_LJ;
                    }
                    if (flags & GMX_FORCE_FORCES)
                    {
                        pme_flags |= GMX_PME_CALC_F;
                    }
                    if (flags & GMX_FORCE_VIRIAL)
                    {
                        pme_flags |= GMX_PME_CALC_ENER_VIR;
                    }
                    if (fr->n_tpi > 0)
                    {
                        /* We don't calculate f, but we do want the potential */
                        pme_flags |= GMX_PME_CALC_POT;
                    }
                    wallcycle_start(wcycle, ewcPMEMESH);
                    status = gmx_pme_do(fr->pmedata,
                                        0, md->homenr - fr->n_tpi,
                                        x, fr->f_novirsum,
                                        md->chargeA, md->chargeB,
                                        md->sqrt_c6A, md->sqrt_c6B,
                                        md->sigmaA, md->sigmaB,
                                        bSB ? boxs : box, cr,
                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0,
                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0,
                                        nrnb, wcycle,
                                        fr->vir_el_recip, fr->ewaldcoeff_q,
                                        fr->vir_lj_recip, fr->ewaldcoeff_lj,
                                        &Vlr_q, &Vlr_lj,
                                        lambda[efptCOUL], lambda[efptVDW],
                                        &dvdl_long_range_q, &dvdl_long_range_lj, pme_flags);
                    *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH);
                    if (status != 0)
                    {
                        gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status);
                    }
                    /* We should try to do as little computation after
                     * this as possible, because parallel PME synchronizes
                     * the nodes, so we want all load imbalance of the
                     * rest of the force calculation to be before the PME
                     * call.  DD load balancing is done on the whole time
                     * of the force call (without PME).
                     */
                }
                if (fr->n_tpi > 0)
                {
                    if (EVDW_PME(ir->vdwtype))
                    {
                        gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME");
                    }
                    /* Determine the PME grid energy of the test molecule
                     * with the PME grid potential of the other charges.
                     */
                    gmx_pme_calc_energy(fr->pmedata, fr->n_tpi,
                                        x + md->homenr - fr->n_tpi,
                                        md->chargeA + md->homenr - fr->n_tpi,
                                        &Vlr_q);
                }
            }
        }

        if (!EEL_PME(fr->eeltype) && EEL_PME_EWALD(fr->eeltype))
        {
            Vlr_q = do_ewald(ir, x, fr->f_novirsum,
                             md->chargeA, md->chargeB,
                             box_size, cr, md->homenr,
                             fr->vir_el_recip, fr->ewaldcoeff_q,
                             lambda[efptCOUL], &dvdl_long_range_q, fr->ewald_table);
        }

        /* Note that with separate PME nodes we get the real energies later */
        enerd->dvdl_lin[efptCOUL] += dvdl_long_range_q;
        enerd->dvdl_lin[efptVDW]  += dvdl_long_range_lj;
        enerd->term[F_COUL_RECIP]  = Vlr_q + Vcorr_q;
        enerd->term[F_LJ_RECIP]    = Vlr_lj + Vcorr_lj;
        if (debug)
        {
            fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n",
                    Vlr_q, Vcorr_q, enerd->term[F_COUL_RECIP]);
            pr_rvecs(debug, 0, "vir_el_recip after corr", fr->vir_el_recip, DIM);
            pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS);
            fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n",
                    Vlr_lj, Vcorr_lj, enerd->term[F_LJ_RECIP]);
            pr_rvecs(debug, 0, "vir_lj_recip after corr", fr->vir_lj_recip, DIM);
        }
    }
    else
    {
        /* Is there a reaction-field exclusion correction needed? */
        if (EEL_RF(fr->eeltype) && eelRF_NEC != fr->eeltype)
        {
            /* With the Verlet scheme, exclusion forces are calculated
             * in the non-bonded kernel.
             */
            if (ir->cutoff_scheme != ecutsVERLET)
            {
                real dvdl_rf_excl      = 0;
                enerd->term[F_RF_EXCL] =
                    RF_excl_correction(fr, graph, md, excl, x, f,
                                       fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl);

                enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl;
            }
        }
    }
    where();
    debug_gmx();

    if (debug)
    {
        print_nrnb(debug, nrnb);
    }
    debug_gmx();

#ifdef GMX_MPI
    if (TAKETIME)
    {
        t2 = MPI_Wtime();
        MPI_Barrier(cr->mpi_comm_mygroup);
        t3          = MPI_Wtime();
        fr->t_wait += t3-t2;
        if (fr->timesteps == 11)
        {
            char buf[22];
            fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n",
                    cr->nodeid, gmx_step_str(fr->timesteps, buf),
                    100*fr->t_wait/(fr->t_wait+fr->t_fnbf),
                    (fr->t_fnbf+fr->t_wait)/fr->t_fnbf);
        }
        fr->timesteps++;
    }
#endif

    if (debug)
    {
        pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS);
    }

}
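/* A minimal standalone sketch (with hypothetical flag values; the real
 * GMX_PME_* constants live in the PME headers): the bit-flag pattern
 * used above to assemble pme_flags before calling gmx_pme_do.
 */
#include <stdio.h>

#define DO_SPREAD   (1<<0)
#define DO_SOLVE    (1<<1)
#define DO_FORCES   (1<<2)
#define DO_ENER_VIR (1<<3)

int main(void)
{
    int need_forces = 1;
    int need_energy = 0;
    int flags       = DO_SPREAD | DO_SOLVE; /* always spread and solve */

    if (need_forces)
    {
        flags |= DO_FORCES;
    }
    if (need_energy)
    {
        flags |= DO_ENER_VIR;
    }
    printf("forces: %s, energy/virial: %s\n",
           (flags & DO_FORCES) ? "on" : "off",
           (flags & DO_ENER_VIR) ? "on" : "off");
    return 0;
}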
Beispiel #15
0
gmx_lincsdata_t init_lincs(FILE *fplog,gmx_mtop_t *mtop,
                           int nflexcon_global,t_blocka *at2con,
                           gmx_bool bPLINCS,int nIter,int nProjOrder)
{
    struct gmx_lincsdata *li;
    int mb;
    gmx_moltype_t *molt;
    
    if (fplog)
    {
        fprintf(fplog,"\nInitializing%s LINear Constraint Solver\n",
                bPLINCS ? " Parallel" : "");
    }
    
    snew(li,1);
    
    li->ncg      =
        gmx_mtop_ftype_count(mtop,F_CONSTR) +
        gmx_mtop_ftype_count(mtop,F_CONSTRNC);
    li->ncg_flex = nflexcon_global;
    
    li->ncg_triangle = 0;
    for(mb=0; mb<mtop->nmolblock; mb++)
    {
        molt = &mtop->moltype[mtop->molblock[mb].type];
        li->ncg_triangle +=
            mtop->molblock[mb].nmol*
            count_triangle_constraints(molt->ilist,
                                       &at2con[mtop->molblock[mb].type]);
    }
    
    li->nIter  = nIter;
    li->nOrder = nProjOrder;

    /* LINCS can run on any number of threads.
     * Currently the number is fixed for the whole simulation,
     * but it could be set in set_lincs().
     */
    li->nth = gmx_omp_nthreads_get(emntLINCS);
    if (li->nth == 1)
    {
        snew(li->th,1);
    }
    else
    {
        /* Allocate an extra element for "thread-overlap" constraints */
        snew(li->th,li->nth+1);
    }
    if (debug)
    {
        fprintf(debug,"LINCS: using %d threads\n",li->nth);
    }

    if (bPLINCS || li->ncg_triangle > 0)
    {
        please_cite(fplog,"Hess2008a");
    }
    else
    {
        please_cite(fplog,"Hess97a");
    }
    
    if (fplog)
    {
        fprintf(fplog,"The number of constraints is %d\n",li->ncg);
        if (bPLINCS)
        {
            fprintf(fplog,"There are inter charge-group constraints,\n"
                    "will communicate selected coordinates each lincs iteration\n");
        }
        if (li->ncg_triangle > 0)
        {
            fprintf(fplog,
                    "%d constraints are involved in constraint triangles,\n"
                    "will apply an additional matrix expansion of order %d for couplings\n"
                    "between constraints inside triangles\n",
                    li->ncg_triangle,li->nOrder);
        }
    }
    
    return li;
}
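/* A minimal standalone sketch (not the actual set_lincs logic): dividing
 * ncg constraints into nth contiguous, balanced ranges. The extra
 * element allocated above (li->nth+1) could then hold the boundary
 * ("thread-overlap") constraints that couple two neighboring ranges.
 */
#include <stdio.h>

int main(void)
{
    int ncg = 10; /* hypothetical constraint count */
    int nth = 4;  /* hypothetical thread count */
    int t;

    for (t = 0; t < nth; t++)
    {
        int b0 = (ncg*t)/nth;     /* first constraint of thread t */
        int b1 = (ncg*(t+1))/nth; /* one past its last constraint */

        printf("thread %d: constraints [%d,%d)\n", t, b0, b1);
    }
    return 0;
}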
Beispiel #16
0
int mdrunner(gmx_hw_opt_t *hw_opt,
             FILE *fplog, t_commrec *cr, int nfile,
             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
             gmx_bool bCompact, int nstglobalcomm,
             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
             const char *dddlb_opt, real dlb_scale,
             const char *ddcsx, const char *ddcsy, const char *ddcsz,
             const char *nbpu_opt, int nstlist_cmdline,
             gmx_int64_t nsteps_cmdline, int nstepout, int resetstep,
             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
             int imdport, unsigned long Flags)
{
    gmx_bool                  bForceUseGPU, bTryUseGPU, bRerunMD;
    t_inputrec               *inputrec;
    t_state                  *state = NULL;
    matrix                    box;
    gmx_ddbox_t               ddbox = {0};
    int                       npme_major, npme_minor;
    t_nrnb                   *nrnb;
    gmx_mtop_t               *mtop          = NULL;
    t_mdatoms                *mdatoms       = NULL;
    t_forcerec               *fr            = NULL;
    t_fcdata                 *fcd           = NULL;
    real                      ewaldcoeff_q  = 0;
    real                      ewaldcoeff_lj = 0;
    struct gmx_pme_t        **pmedata       = NULL;
    gmx_vsite_t              *vsite         = NULL;
    gmx_constr_t              constr;
    int                       nChargePerturbed = -1, nTypePerturbed = 0, status;
    gmx_wallcycle_t           wcycle;
    gmx_bool                  bReadEkin;
    gmx_walltime_accounting_t walltime_accounting = NULL;
    int                       rc;
    gmx_int64_t               reset_counters;
    gmx_edsam_t               ed           = NULL;
    int                       nthreads_pme = 1;
    int                       nthreads_pp  = 1;
    gmx_membed_t              membed       = NULL;
    gmx_hw_info_t            *hwinfo       = NULL;
    /* The master rank decides early on bUseGPU and broadcasts this later */
    gmx_bool                  bUseGPU      = FALSE;

    /* CAUTION: threads may be started later on in this function, so
       cr doesn't reflect the final parallel state right now */
    snew(inputrec, 1);
    snew(mtop, 1);

    if (Flags & MD_APPENDFILES)
    {
        fplog = NULL;
    }

    bRerunMD     = (Flags & MD_RERUN);
    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;

    /* Detect hardware, gather information. This is an operation that is
     * global for this process (MPI rank). */
    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);

    gmx_print_detected_hardware(fplog, cr, hwinfo);

    if (fplog != NULL)
    {
        /* Print references after all software/hardware printing */
        please_cite(fplog, "Abraham2015");
        please_cite(fplog, "Pall2015");
        please_cite(fplog, "Pronk2013");
        please_cite(fplog, "Hess2008b");
        please_cite(fplog, "Spoel2005a");
        please_cite(fplog, "Lindahl2001a");
        please_cite(fplog, "Berendsen95a");
    }

    snew(state, 1);
    if (SIMMASTER(cr))
    {
        /* Read (nearly) all data required for the simulation */
        read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, state, NULL, mtop);

        if (inputrec->cutoff_scheme == ecutsVERLET)
        {
            /* Here the master rank decides if all ranks will use GPUs */
            bUseGPU = (hwinfo->gpu_info.n_dev_compatible > 0 ||
                       getenv("GMX_EMULATE_GPU") != NULL);

            /* TODO add GPU kernels for this and replace this check by:
             * (bUseGPU && (ir->vdwtype == evdwPME &&
             *               ir->ljpme_combination_rule == eljpmeLB))
             * update the message text and the content of nbnxn_acceleration_supported.
             */
            if (bUseGPU &&
                !nbnxn_gpu_acceleration_supported(fplog, cr, inputrec, bRerunMD))
            {
                /* Fallback message printed by nbnxn_acceleration_supported */
                if (bForceUseGPU)
                {
                    gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings");
                }
                bUseGPU = FALSE;
            }

            prepare_verlet_scheme(fplog, cr,
                                  inputrec, nstlist_cmdline, mtop, state->box,
                                  bUseGPU);
        }
        else
        {
            if (nstlist_cmdline > 0)
            {
                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
            }

            if (hwinfo->gpu_info.n_dev_compatible > 0)
            {
                md_print_warn(cr, fplog,
                              "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
                              "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n");
            }

            if (bForceUseGPU)
            {
                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
            }

#ifdef GMX_TARGET_BGQ
            md_print_warn(cr, fplog,
                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
                          "      BlueGene/Q. You will observe better performance from using the\n"
                          "      Verlet cut-off scheme.\n");
#endif
        }

        if (inputrec->eI == eiSD2)
        {
            md_print_warn(cr, fplog, "The stochastic dynamics integrator %s is deprecated, since\n"
                          "it is slower than integrator %s and is slightly less accurate\n"
                          "with constraints. Use the %s integrator.",
                          ei_names[inputrec->eI], ei_names[eiSD1], ei_names[eiSD1]);
        }
    }

    /* Check and update the hardware options for internal consistency */
    check_and_update_hw_opt_1(hw_opt, cr);

    /* Early check for externally set process affinity. */
    gmx_check_thread_affinity_set(fplog, cr,
                                  hw_opt, hwinfo->nthreads_hw_avail, FALSE);

#ifdef GMX_THREAD_MPI
    if (SIMMASTER(cr))
    {
        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
        {
            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
        }

        /* Since the master knows the cut-off scheme, update hw_opt for this.
         * This is done later for normal MPI and also once more with tMPI
         * for all tMPI ranks.
         */
        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);

        /* NOW the threads will be started: */
        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
                                                 hw_opt,
                                                 inputrec, mtop,
                                                 cr, fplog, bUseGPU);

        if (hw_opt->nthreads_tmpi > 1)
        {
            t_commrec *cr_old       = cr;
            /* now start the threads. */
            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
                                        oenv, bVerbose, bCompact, nstglobalcomm,
                                        ddxyz, dd_node_order, rdd, rconstr,
                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
                                        nbpu_opt, nstlist_cmdline,
                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
                                        cpt_period, max_hours,
                                        Flags);
            /* the main thread continues here with a new cr. We don't deallocate
               the old cr because other threads may still be reading it. */
            if (cr == NULL)
            {
                gmx_comm("Failed to spawn threads");
            }
        }
    }
#endif
    /* END OF CAUTION: cr is now reliable */

    /* g_membed initialisation.
     * Because we change the mtop, init_membed is called before init_parallel
     * (in case we ever want to make it run in parallel).
     */
    if (opt2bSet("-membed", nfile, fnm))
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "Initializing membed");
        }
        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
    }

    if (PAR(cr))
    {
        /* now broadcast everything to the non-master nodes/threads: */
        init_parallel(cr, inputrec, mtop);

        /* The master rank decided on the use of GPUs,
         * broadcast this information to all ranks.
         */
        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
    }

    if (fplog != NULL)
    {
        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
        fprintf(fplog, "\n");
    }

    /* now make sure the state is initialized and propagated */
    set_state_entries(state, inputrec);

    /* A parallel command line option consistency check that we can
       only do after any threads have started. */
    if (!PAR(cr) &&
        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
    {
        gmx_fatal(FARGS,
                  "The -dd or -npme option request a parallel simulation, "
#ifndef GMX_MPI
                  "but %s was compiled without threads or MPI enabled"
#else
#ifdef GMX_THREAD_MPI
                  "but the number of threads (option -nt) is 1"
#else
                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
#endif
#endif
                  , output_env_get_program_display_name(oenv)
                  );
    }

    if (bRerunMD &&
        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
    {
        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
    }

    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
    {
        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
    }

    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
    {
        if (cr->npmenodes > 0)
        {
            gmx_fatal_collective(FARGS, cr, NULL,
                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
        }

        cr->npmenodes = 0;
    }

    if (bUseGPU && cr->npmenodes < 0)
    {
        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
         * improve performance with many threads per GPU, since our OpenMP
         * scaling is bad, but it's difficult to automate the setup.
         */
        cr->npmenodes = 0;
    }

#ifdef GMX_FAHCORE
    if (MASTER(cr))
    {
        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
    }
#endif

    /* NMR restraints must be initialized before load_checkpoint,
     * since with time averaging the history is added to t_state.
     * For proper consistency check we therefore need to extend
     * t_state here.
     * So the PME-only nodes (if present) will also initialize
     * the distance restraints.
     */
    snew(fcd, 1);

    /* This needs to be called before read_checkpoint to extend the state */
    init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0);

    init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
                state);

    if (DEFORM(*inputrec))
    {
        /* Store the deform reference box before reading the checkpoint */
        if (SIMMASTER(cr))
        {
            copy_mat(state->box, box);
        }
        if (PAR(cr))
        {
            gmx_bcast(sizeof(box), box, cr);
        }
        /* Because we do not have the update struct available yet
         * in which the reference values should be stored,
         * we store them temporarily in static variables.
         * This should be thread safe, since they are only written once
         * and with identical values.
         */
        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
        deform_init_init_step_tpx = inputrec->init_step;
        copy_mat(box, deform_init_box_tpx);
        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
    }

    if (opt2bSet("-cpi", nfile, fnm))
    {
        /* Check if checkpoint file exists before doing continuation.
         * This way we can use identical input options for the first and subsequent runs...
         */
        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
        {
            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
                            cr, ddxyz,
                            inputrec, state, &bReadEkin,
                            (Flags & MD_APPENDFILES),
                            (Flags & MD_APPENDFILESSET));

            if (bReadEkin)
            {
                Flags |= MD_READ_EKIN;
            }
        }
    }

    if (MASTER(cr) && (Flags & MD_APPENDFILES))
    {
        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr,
                     Flags, &fplog);
    }

    /* Override nsteps with the value from the command line, if given */
    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);

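    /* The box may have been changed by load_checkpoint, so take it
     * from the master state again and broadcast it to all ranks.
     */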
    if (SIMMASTER(cr))
    {
        copy_mat(state->box, box);
    }

    if (PAR(cr))
    {
        gmx_bcast(sizeof(box), box, cr);
    }

    /* Essential dynamics */
    if (opt2bSet("-ei", nfile, fnm))
    {
        /* Open input and output files, allocate space for ED data structure */
        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
    }

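    /* TPI and normal-mode analysis always run within a single domain,
     * so they do not use domain decomposition.
     */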
    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
                     inputrec->eI == eiNM))
    {
        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
                                           dddlb_opt, dlb_scale,
                                           ddcsx, ddcsy, ddcsz,
                                           mtop, inputrec,
                                           box, state->x,
                                           &ddbox, &npme_major, &npme_minor);

        make_dd_communicators(fplog, cr, dd_node_order);

        /* Set overallocation to avoid frequent reallocation of arrays */
        set_over_alloc_dd(TRUE);
    }
    else
    {
        /* PME, if used, is done on all nodes with 1D decomposition */
        cr->npmenodes = 0;
        cr->duty      = (DUTY_PP | DUTY_PME);
        npme_major    = 1;
        npme_minor    = 1;

        if (inputrec->ePBC == epbcSCREW)
        {
            gmx_fatal(FARGS,
                      "pbc=%s is only implemented with domain decomposition",
                      epbc_names[inputrec->ePBC]);
        }
    }

    if (PAR(cr))
    {
        /* After the possible communicator splitting in make_dd_communicators,
         * we can set up the intra/inter-node communication.
         */
        gmx_setup_nodecomm(fplog, cr);
    }

    /* Initialize per-physical-node MPI process/thread ID and counters. */
    gmx_init_intranode_counters(cr);
#ifdef GMX_MPI
    if (MULTISIM(cr))
    {
        md_print_info(cr, fplog,
                      "This is simulation %d out of %d running as a composite GROMACS\n"
                      "multi-simulation job. Setup for this simulation:\n\n",
                      cr->ms->sim, cr->ms->nsim);
    }
    md_print_info(cr, fplog, "Using %d MPI %s\n",
                  cr->nnodes,
#ifdef GMX_THREAD_MPI
                  cr->nnodes == 1 ? "thread" : "threads"
#else
                  cr->nnodes == 1 ? "process" : "processes"
#endif
                  );
    fflush(stderr);
#endif

    /* Check and update hw_opt for the cut-off scheme */
    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);

    /* Check and update hw_opt for the number of MPI ranks */
    check_and_update_hw_opt_3(hw_opt);

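    /* Determine the per-module OpenMP thread counts, now that the
     * rank setup and user options are known.
     */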
    gmx_omp_nthreads_init(fplog, cr,
                          hwinfo->nthreads_hw_avail,
                          hw_opt->nthreads_omp,
                          hw_opt->nthreads_omp_pme,
                          (cr->duty & DUTY_PP) == 0,
                          inputrec->cutoff_scheme == ecutsVERLET);

#ifndef NDEBUG
    if (integrator[inputrec->eI].func != do_tpi &&
        inputrec->cutoff_scheme == ecutsVERLET)
    {
        gmx_feenableexcept();
    }
#endif

    if (bUseGPU)
    {
        /* Select GPU id's to use */
        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
                           &hw_opt->gpu_opt);
    }
    else
    {
        /* Ignore (potentially) manually selected GPUs */
        hw_opt->gpu_opt.n_dev_use = 0;
    }

    /* check consistency across ranks of things like SIMD
     * support and number of GPUs selected */
    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);

    /* Now that we know the setup is consistent, check for efficiency */
    check_resource_division_efficiency(hwinfo, hw_opt, Flags & MD_NTOMPSET,
                                       cr, fplog);

    if (DOMAINDECOMP(cr))
    {
        /* When we share GPUs over ranks, we need to know this for the DLB */
        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
    }

    /* Get the numbers of PP and PME threads.
     * For PME the environment variable should be read on only one node,
     * to make sure it is identical everywhere.
     */
    /* TODO nthreads_pp is only used for pinning threads.
     * This is a temporary solution until we have a hw topology library.
     */
    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    nthreads_pme = gmx_omp_nthreads_get(emntPME);

    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);

    if (PAR(cr))
    {
        /* The master synchronizes its value of reset_counters with all nodes
         * including the PME-only nodes */
        reset_counters = wcycle_get_reset_counters(wcycle);
        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
        wcycle_set_reset_counters(wcycle, reset_counters);
    }

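    /* Allocate the flop-accounting (nrnb) counters for this rank. */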
    snew(nrnb, 1);
    if (cr->duty & DUTY_PP)
    {
        bcast_state(cr, state);

        /* Initiate forcerecord */
        fr          = mk_forcerec();
        fr->hwinfo  = hwinfo;
        fr->gpu_opt = &hw_opt->gpu_opt;
        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
                      opt2fn("-table", nfile, fnm),
                      opt2fn("-tabletf", nfile, fnm),
                      opt2fn("-tablep", nfile, fnm),
                      opt2fn("-tableb", nfile, fnm),
                      nbpu_opt,
                      FALSE,
                      pforce);

        /* version for PCA_NOT_READ_NODE (see md.c) */
        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
           "nofile","nofile","nofile","nofile",FALSE,pforce);
         */

        /* Initialize QM-MM */
        if (fr->bQMMM)
        {
            init_QMMMrec(cr, mtop, inputrec, fr);
        }

        /* Initialize the mdatoms structure.
         * mdatoms is not yet filled with atom data, as that cannot
         * be done at this point when domain decomposition is used.
         */
        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);

        /* Initialize the virtual site communication */
        vsite = init_vsite(mtop, cr, FALSE);

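        /* Fill the periodic-image shift vectors for the current box. */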
        calc_shifts(box, fr->shift_vec);

        /* With periodic molecules the charge groups should be whole at start up
         * and the virtual sites should not be far from their proper positions.
         */
        if (!inputrec->bContinuation && MASTER(cr) &&
            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
        {
            /* Make molecules whole at start of run */
            if (fr->ePBC != epbcNONE)
            {
                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
            }
            if (vsite)
            {
                /* Correct initial vsite positions are required
                 * for the initial distribution in the domain decomposition
                 * and for the initial shell prediction.
                 */
                construct_vsites_mtop(vsite, mtop, state->x);
            }
        }

        if (EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))
        {
            ewaldcoeff_q  = fr->ewaldcoeff_q;
            ewaldcoeff_lj = fr->ewaldcoeff_lj;
            pmedata       = &fr->pmedata;
        }
        else
        {
            pmedata = NULL;
        }
    }
    else
    {
        /* This is a PME-only node */

        /* We don't need the state */
        done_state(state);

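        /* PME-only ranks have no forcerec, so compute the Ewald
         * splitting coefficients directly from the input record.
         */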
        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
        snew(pmedata, 1);
    }

    if (hw_opt->thread_affinity != threadaffOFF)
    {
        /* Before setting affinity, check whether the affinity has changed
         * since we first checked, which would indicate that probably
         * the OpenMP library has changed it.
         */
        gmx_check_thread_affinity_set(fplog, cr,
                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);

        /* Set the CPU affinity */
        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
    }

    /* Initiate PME if necessary,
     * either on all nodes or on dedicated PME nodes only. */
    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
    {
        if (mdatoms)
        {
            nChargePerturbed = mdatoms->nChargePerturbed;
            if (EVDW_PME(inputrec->vdwtype))
            {
                nTypePerturbed   = mdatoms->nTypePerturbed;
            }
        }
        if (cr->npmenodes > 0)
        {
            /* The PME-only nodes need to know nChargePerturbed (FEP on Q) and nTypePerturbed (FEP on LJ) */
            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
        }

        if (cr->duty & DUTY_PME)
        {
            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
                                  mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
            if (status != 0)
            {
                gmx_fatal(FARGS, "Error %d initializing PME", status);
            }
        }
    }

    if (integrator[inputrec->eI].func == do_md)
    {
        /* Turn on signal handling on all nodes.
         * A user signal from the PME nodes (if any)
         * is communicated to the PP nodes.
         */
        signal_handler_install();
    }

    if (cr->duty & DUTY_PP)
    {
        /* Assumes uniform use of the number of OpenMP threads */
        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));

        if (inputrec->bPull)
        {
            /* Initialize pull code */
            inputrec->pull_work =
                init_pull(fplog, inputrec->pull, inputrec, nfile, fnm,
                          mtop, cr, oenv, inputrec->fepvals->init_lambda,
                          EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
        }

        if (inputrec->bRot)
        {
            /* Initialize enforced rotation code */
            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
                     bVerbose, Flags);
        }

        if (inputrec->eSwapCoords != eswapNO)
        {
            /* Initialize ion swapping code */
            init_swapcoords(fplog, bVerbose, inputrec, opt2fn_master("-swap", nfile, fnm, cr),
                            mtop, state->x, state->box, &state->swapstate, cr, oenv, Flags);
        }

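        /* Set up the constraint algorithms (e.g. LINCS, SETTLE) for this run. */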
        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);

        if (DOMAINDECOMP(cr))
        {
            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);

            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);

            setup_dd_grid(fplog, cr->dd);
        }

        /* Now do whatever the user wants us to do (how flexible...) */
        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
                                      oenv, bVerbose, bCompact,
                                      nstglobalcomm,
                                      vsite, constr,
                                      nstepout, inputrec, mtop,
                                      fcd, state,
                                      mdatoms, nrnb, wcycle, ed, fr,
                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
                                      membed,
                                      cpt_period, max_hours,
                                      imdport,
                                      Flags,
                                      walltime_accounting);

        if (inputrec->bPull)
        {
            finish_pull(inputrec->pull_work);
        }

        if (inputrec->bRot)
        {
            finish_rot(inputrec->rot);
        }

    }
    else
    {
        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
        /* do PME only */
        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec);
    }

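    /* The integrator (or gmx_pmeonly) has returned; stop the run cycle counter. */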
    wallcycle_stop(wcycle, ewcRUN);

    /* Finish up and write the final output;
     * if rerunMD, don't write the last frame again.
     */
    finish_run(fplog, cr,
               inputrec, nrnb, wcycle, walltime_accounting,
               fr ? fr->nbv : NULL,
               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));

    /* Free GPU memory and context */
    free_gpu_resources(fr, cr, &hwinfo->gpu_info, fr ? fr->gpu_opt : NULL);

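    /* Free the membrane embedding data, if -membed was used. */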
    if (opt2bSet("-membed", nfile, fnm))
    {
        sfree(membed);
    }

    gmx_hardware_info_free(hwinfo);

    /* Does what it says */
    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
    walltime_accounting_destroy(walltime_accounting);

    /* PLUMED */
    if (plumedswitch)
    {
        plumed_finalize(plumedmain);
    }
    /* END PLUMED */

    /* Close logfile already here if we were appending to it */
    if (MASTER(cr) && (Flags & MD_APPENDFILES))
    {
        gmx_log_close(fplog);
    }

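    /* Return the stop condition (e.g. set by a signal handler) as the exit code. */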
    rc = (int)gmx_get_stop_condition();

    done_ed(&ed);

#ifdef GMX_THREAD_MPI
    /* We need to join all threads. The sub-threads join when they
     * exit this function, but the master thread needs to be told to
     * wait for that.
     */
    if (PAR(cr) && MASTER(cr))
    {
        tMPI_Finalize();
    }
#endif

    return rc;
}