Example #1
static void pp_verlet_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box,
                           int *nq_tot, int *nlj_tot,
                           double *cost_pp,
                           gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
{
    t_atom        *atom;
    int            mb, nmol, atnr, a, nqlj, nq, nlj;
    gmx_bool       bQRF;
    t_iparams     *iparams;
    gmx_moltype_t *molt;
    real           r_eff;
    double         c_qlj, c_q, c_lj;
    double         nppa;
    int            j_cluster_size;
    /* Conversion factor for reference vs SIMD kernel performance.
     * The factor is about right for SSE2/4, but should be a factor
     * of 2 higher for AVX256.
     */
#ifdef GMX_DOUBLE
    const real     nbnxn_refkernel_fac = 4.0;
#else
    const real     nbnxn_refkernel_fac = 8.0;
#endif

    bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT);

    iparams           = mtop->ffparams.iparams;
    atnr              = mtop->ffparams.atnr;
    nqlj              = 0;
    nq                = 0;
    *bChargePerturbed = FALSE;
    *bTypePerturbed   = FALSE;
    for (mb = 0; mb < mtop->nmolblock; mb++)
    {
        molt = &mtop->moltype[mtop->molblock[mb].type];
        atom = molt->atoms.atom;
        nmol = mtop->molblock[mb].nmol;
        for (a = 0; a < molt->atoms.nr; a++)
        {
            if (atom[a].q != 0 || atom[a].qB != 0)
            {
                if (iparams[(atnr+1)*atom[a].type].lj.c6  != 0 ||
                    iparams[(atnr+1)*atom[a].type].lj.c12 != 0)
                {
                    nqlj += nmol;
                }
                else
                {
                    nq += nmol;
                }
            }
            if (atom[a].q != atom[a].qB)
            {
                *bChargePerturbed = TRUE;
            }
            if (atom[a].type != atom[a].typeB)
            {
                *bTypePerturbed = TRUE;
            }
        }
    }

    nlj = mtop->natoms - nqlj - nq;

    *nq_tot  = nqlj + nq;
    *nlj_tot = nqlj + nlj;

    /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
     * This choice should match the one of pick_nbnxn_kernel_cpu().
     * TODO: Make this function use pick_nbnxn_kernel_cpu().
     */
#if defined GMX_SIMD_HAVE_REAL && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
    j_cluster_size = 8;
#else
    j_cluster_size = 4;
#endif
    r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop->natoms/det(box));

    /* The average number of pairs per atom */
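    /* Each atom is paired once with, on average, half of the atoms in
     * its cut-off sphere of radius r_eff, hence the factor 0.5 below.
     */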
    nppa  = 0.5*4/3*M_PI*r_eff*r_eff*r_eff*mtop->natoms/det(box);

    if (debug)
    {
        fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
                nqlj, nq, nlj, ir->rlist, r_eff, nppa);
    }

    /* Determine the cost per pair interaction */
    c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
    c_q   = (bQRF ? c_nbnxn_qrf    : c_nbnxn_qexp);
    c_lj  = c_nbnxn_lj;
    if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype))
    {
        c_qlj += c_nbnxn_ljexp_add;
        c_lj  += c_nbnxn_ljexp_add;
    }
    if (EVDW_PME(ir->vdwtype) && ir->ljpme_combination_rule == eljpmeLB)
    {
        /* We don't have LJ-PME LB combination-rule kernels, so we use the slow reference kernels */
        c_qlj *= nbnxn_refkernel_fac;
        c_q   *= nbnxn_refkernel_fac;
        c_lj  *= nbnxn_refkernel_fac;
    }

    /* For the PP non-bonded cost it is (unrealistically) assumed
     * that all atoms are distributed homogeneously in space.
     */
    *cost_pp = (nqlj*c_qlj + nq*c_q + nlj*c_lj)*nppa;

    *cost_pp *= simd_cycle_factor(bHaveSIMD);
}
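
A standalone numeric sketch of the pairs-per-atom estimate above; the cut-off, atom count, and box volume are made-up illustrative values, not taken from the source.

/* Standalone sketch (illustrative numbers only) of the nppa estimate. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double r_eff  = 1.05;    /* effective cut-off in nm (assumed) */
    double natoms = 3000.0;  /* atoms in the box (assumed) */
    double volume = 27.0;    /* box volume in nm^3 (assumed) */
    double nppa   = 0.5*4.0/3.0*M_PI*r_eff*r_eff*r_eff*natoms/volume;

    printf("average pairs per atom: %.1f\n", nppa); /* ~269 here */
    return 0;
}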
Example #2
static void pp_verlet_load(const gmx_mtop_t *mtop, const t_inputrec *ir,
                           matrix box,
                           int *nq_tot, int *nlj_tot,
                           double *cost_pp,
                           gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
{
    t_atom        *atom;
    int            mb, nmol, atnr, a, nqlj, nq, nlj;
    gmx_bool       bQRF;
    t_iparams     *iparams;
    gmx_moltype_t *molt;
    real           r_eff;
    double         c_qlj, c_q, c_lj;
    double         nat;
    /* Conversion factor for reference vs SIMD kernel performance.
     * The factor is about right for SSE2/4, but should be a factor
     * of 2 higher for AVX256.
     */
#ifdef GMX_DOUBLE
    const real     nbnxn_refkernel_fac = 4.0;
#else
    const real     nbnxn_refkernel_fac = 8.0;
#endif

    bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT);

    iparams           = mtop->ffparams.iparams;
    atnr              = mtop->ffparams.atnr;
    nqlj              = 0;
    nq                = 0;
    *bChargePerturbed = FALSE;
    *bTypePerturbed   = FALSE;
    for (mb = 0; mb < mtop->nmolblock; mb++)
    {
        molt = &mtop->moltype[mtop->molblock[mb].type];
        atom = molt->atoms.atom;
        nmol = mtop->molblock[mb].nmol;
        for (a = 0; a < molt->atoms.nr; a++)
        {
            if (atom[a].q != 0 || atom[a].qB != 0)
            {
                if (iparams[(atnr+1)*atom[a].type].lj.c6  != 0 ||
                    iparams[(atnr+1)*atom[a].type].lj.c12 != 0)
                {
                    nqlj += nmol;
                }
                else
                {
                    nq += nmol;
                }
            }
            if (atom[a].q != atom[a].qB)
            {
                *bChargePerturbed = TRUE;
            }
            if (atom[a].type != atom[a].typeB)
            {
                *bTypePerturbed = TRUE;
            }
        }
    }

    nlj = mtop->natoms - nqlj - nq;

    *nq_tot  = nqlj + nq;
    *nlj_tot = nqlj + nlj;

    /* Effective cut-off for cluster pair list of 4x4 atoms */
    r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(NBNXN_CPU_CLUSTER_I_SIZE, mtop->natoms/det(box));

    if (debug)
    {
        fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f\n",
                nqlj, nq, nlj, ir->rlist, r_eff);
    }

    /* Determine the cost per pair interaction */
    c_qlj = (bQRF ? C_VT_QRF_LJ : C_VT_QEXP_LJ);
    c_q   = (bQRF ? C_VT_QRF    : C_VT_QEXP);
    c_lj  = C_VT_LJ;
    if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype))
    {
        c_qlj += C_VT_LJEXP_ADD;
        c_lj  += C_VT_LJEXP_ADD;
    }
    if (EVDW_PME(ir->vdwtype) && ir->ljpme_combination_rule == eljpmeLB)
    {
        /* We don't have LJ-PME LB combination-rule kernels, so we use the slow reference kernels */
        c_qlj *= nbnxn_refkernel_fac;
        c_q   *= nbnxn_refkernel_fac;
        c_lj  *= nbnxn_refkernel_fac;
    }

    /* For the PP non-bonded cost it is (unrealistically) assumed
     * that all atoms are distributed homogeneously in space.
     */
    /* Convert mtop->natoms to double to avoid int overflow */
    nat      = mtop->natoms;
    *cost_pp = 0.5*nat*(nqlj*c_qlj + nq*c_q + nlj*c_lj)
               *4/3*M_PI*r_eff*r_eff*r_eff/det(box);
}
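
The explicit promotion of mtop->natoms to double above is not cosmetic: with a few million atoms, a product such as nqlj*natoms exceeds the 32-bit int range. A standalone illustration with made-up sizes:

/* Why natoms is promoted to double before multiplying (made-up sizes):
 * an int*int product of this magnitude would overflow a 32-bit int. */
#include <stdio.h>

int main(void)
{
    int    nqlj   = 2000000; /* charged LJ atoms (assumed) */
    int    natoms = 3000000; /* total atoms (assumed) */
    double nat    = natoms;  /* promote first, as in the code above */

    printf("nqlj*nat = %.3e (INT_MAX is only ~2.147e9)\n", nqlj*nat);
    return 0;
}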
Example #3
static void pp_verlet_load(gmx_mtop_t *mtop,t_inputrec *ir,matrix box,
                           int *nq_tot,
                           double *cost_pp,
                           gmx_bool *bChargePerturbed)
{
    t_atom *atom;
    int  mb,nmol,atnr,a,nqlj,nq,nlj;
    gmx_bool bQRF;
    t_iparams *iparams;
    gmx_moltype_t *molt;
    float r_eff;
    double nat;

    bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT);

    iparams = mtop->ffparams.iparams;
    atnr = mtop->ffparams.atnr;
    nqlj = 0;
    nq   = 0;
    *bChargePerturbed = FALSE;
    for(mb=0; mb<mtop->nmolblock; mb++)
    {
        molt = &mtop->moltype[mtop->molblock[mb].type];
        atom = molt->atoms.atom;
        nmol = mtop->molblock[mb].nmol;
        for(a=0; a<molt->atoms.nr; a++)
        {
            if (atom[a].q != 0 || atom[a].qB != 0)
            {
                if (iparams[(atnr+1)*atom[a].type].lj.c6  != 0 ||
                    iparams[(atnr+1)*atom[a].type].lj.c12 != 0)
                {
                    nqlj += nmol;
                }
                else
                {
                    nq += nmol;
                }
            }
            if (atom[a].q != atom[a].qB)
            {
                *bChargePerturbed = TRUE;
            }
        }
    }

    nlj = mtop->natoms - nqlj - nq;

    *nq_tot = nqlj + nq;

    /* Effective cut-off for cluster pair list of 4x4 atoms */
    r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(NBNXN_CPU_CLUSTER_I_SIZE,mtop->natoms/det(box));

    if (debug)
    {
        fprintf(debug,"nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f\n",
                nqlj,nq,nlj,ir->rlist,r_eff);
    }

    /* For the PP non-bonded cost it is (unrealistically) assumed
     * that all atoms are distributed homogeneously in space.
     */
    /* Convert mtop->natoms to double to avoid int overflow */
    nat = mtop->natoms;
    *cost_pp = 0.5*(nqlj*nat*(bQRF ? C_VT_QLJ_RF : C_VT_QLJ_TAB) +
                    nq*nat*(bQRF ? C_VT_Q_RF : C_VT_Q_TAB) +
                    nlj*nat*C_VT_LJ)
        *4/3*M_PI*r_eff*r_eff*r_eff/det(box);
}
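
One subtlety in the volume factor above: because the leading 0.5*(...) is already double, the following *4/3 evaluates in floating point. Written in isolation, 4/3*M_PI would integer-divide to 1*M_PI. A standalone check (not from the source):

/* Standalone check of the 4/3 evaluation-order pitfall. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double ok  = 0.5*4/3*M_PI; /* 0.5*4 is double, so the /3 stays double */
    double bad = 4/3*M_PI;     /* int division: 4/3 == 1, then *pi */

    printf("0.5*4/3*pi = %.6f\n", ok);  /* ~2.094395 */
    printf("4/3*pi     = %.6f\n", bad); /* ~3.141593, the wrong factor */
    return 0;
}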
Example #4
/* Try to increase nstlist when using the Verlet cut-off scheme */
static void increase_nstlist(FILE *fp, t_commrec *cr,
                             t_inputrec *ir, int nstlist_cmdline,
                             const gmx_mtop_t *mtop, matrix box,
                             gmx_bool bGPU)
{
    float                  listfac_ok, listfac_max;
    int                    nstlist_orig, nstlist_prev;
    verletbuf_list_setup_t ls;
    real                   rlistWithReferenceNstlist, rlist_inc, rlist_ok, rlist_max;
    real                   rlist_new, rlist_prev;
    size_t                 nstlist_ind = 0;
    t_state                state_tmp;
    gmx_bool               bBox, bDD, bCont;
    const char            *nstl_gpu = "\nFor optimal performance with a GPU, nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
    const char            *nve_err  = "Cannot increase nstlist because an NVE ensemble is used";
    const char            *vbd_err  = "Cannot increase nstlist because verlet-buffer-tolerance is not set or not used";
    const char            *box_err  = "Cannot increase nstlist because the box is too small";
    const char            *dd_err   = "Cannot increase nstlist because of domain decomposition limitations";
    char                   buf[STRLEN];
    const float            oneThird = 1.0f / 3.0f;

    if (nstlist_cmdline <= 0)
    {
        if (ir->nstlist == 1)
        {
            /* The user probably set nstlist=1 for a reason,
             * don't mess with the settings.
             */
            return;
        }

        if (fp != NULL && bGPU && ir->nstlist < nstlist_try[0])
        {
            fprintf(fp, nstl_gpu, ir->nstlist);
        }
        nstlist_ind = 0;
        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
        {
            nstlist_ind++;
        }
        if (nstlist_ind == NNSTL)
        {
            /* There is no larger nstlist value to try */
            return;
        }
    }

    if (EI_MD(ir->eI) && ir->etc == etcNO)
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n", nve_err);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n", nve_err);
        }

        return;
    }

    if (ir->verletbuf_tol == 0 && bGPU)
    {
        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up-to-date version of grompp");
    }

    if (ir->verletbuf_tol < 0)
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n", vbd_err);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n", vbd_err);
        }

        return;
    }

    if (bGPU)
    {
        listfac_ok  = nbnxn_gpu_listfac_ok;
        listfac_max = nbnxn_gpu_listfac_max;
    }
    else
    {
        listfac_ok  = nbnxn_cpu_listfac_ok;
        listfac_max = nbnxn_cpu_listfac_max;
    }

    nstlist_orig = ir->nstlist;
    if (nstlist_cmdline > 0)
    {
        if (fp)
        {
            sprintf(buf, "Getting nstlist=%d from command line option",
                    nstlist_cmdline);
            fprintf(fp, "%s\n", buf);
        }
        ir->nstlist = nstlist_cmdline;
    }

    verletbuf_get_list_setup(TRUE, bGPU, &ls);

    /* Allow rlist to make the pair list a given factor larger than it
     * would be with the reference value for nstlist (10).
     */
    nstlist_prev = ir->nstlist;
    ir->nstlist  = nbnxnReferenceNstlist;
    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL,
                            &rlistWithReferenceNstlist);
    ir->nstlist  = nstlist_prev;

    /* Determine the pair list size increase due to zero interactions */
    rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
                                              mtop->natoms/det(box));
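    /* A factor listfac on the pair-list size (proportional to volume)
     * corresponds to a factor listfac^(1/3) on the list radius,
     * hence the cube roots below.
     */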
    rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_ok, oneThird) - rlist_inc;
    rlist_max = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_max, oneThird) - rlist_inc;
    if (debug)
    {
        fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
                rlist_inc, rlist_ok, rlist_max);
    }

    nstlist_prev = nstlist_orig;
    rlist_prev   = ir->rlist;
    do
    {
        if (nstlist_cmdline <= 0)
        {
            ir->nstlist = nstlist_try[nstlist_ind];
        }

        /* Set the pair-list buffer size in ir */
        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);

        /* Does rlist fit in the box? */
        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
        bDD  = TRUE;
        if (bBox && DOMAINDECOMP(cr))
        {
            /* Check if rlist fits in the domain decomposition */
            if (inputrec2nboundeddim(ir) < DIM)
            {
                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
            }
            copy_mat(box, state_tmp.box);
            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
        }

        if (debug)
        {
            fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
                    ir->nstlist, rlist_new, bBox, bDD);
        }

        bCont = FALSE;

        if (nstlist_cmdline <= 0)
        {
            if (bBox && bDD && rlist_new <= rlist_max)
            {
                /* Increase nstlist */
                nstlist_prev = ir->nstlist;
                rlist_prev   = rlist_new;
                bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
            }
            else
            {
                /* Stick with the previous nstlist */
                ir->nstlist = nstlist_prev;
                rlist_new   = rlist_prev;
                bBox        = TRUE;
                bDD         = TRUE;
            }
        }

        nstlist_ind++;
    }
    while (bCont);

    if (!bBox || !bDD)
    {
        gmx_warning(!bBox ? box_err : dd_err);
        if (fp != NULL)
        {
            fprintf(fp, "\n%s\n", !bBox ? box_err : dd_err);
        }
        ir->nstlist = nstlist_orig;
    }
    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
    {
        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
                nstlist_orig, ir->nstlist,
                ir->rlist, rlist_new);
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n\n", buf);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n\n", buf);
        }
        ir->rlist     = rlist_new;
        ir->rlistlong = rlist_new;
    }
}
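
The scan above walks a table of candidate values through nstlist_try and NNSTL, which are defined elsewhere in the file. A plausible sketch of those supporting definitions (the candidate values are illustrative, not authoritative):

/* Plausible supporting definitions for the nstlist scan (values illustrative). */
static const int nstlist_try[] = { 20, 25, 40 };
#define NNSTL  (sizeof(nstlist_try)/sizeof(nstlist_try[0]))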