示例#1
0
::std::vector<real>
simd4Real2Vector(const gmx_simd4_real_t simd4)
{
    real                mem[GMX_SIMD4_WIDTH*2];
    real *              p = gmx_simd4_align_r(mem);

    gmx_simd4_store_r(p, simd4);
    std::vector<real>   v(p, p+GMX_SIMD4_WIDTH);

    return v;
}
示例#2
0
/*! \brief Build a SIMD4 real variable from the contents of a std::vector.
 *
 * The vector is read cyclically: element i of the SIMD4 value is taken from
 * v[i % v.size()], so a vector shorter than GMX_SIMD4_WIDTH is repeated and
 * a longer one is truncated.
 *
 * \param v  Source values. Must not be empty, because the element index is
 *           computed modulo v.size().
 * \return The value produced by gmx_simd4_load_r() from the filled buffer.
 */
gmx_simd4_real_t
vector2Simd4Real(const std::vector<real> &v)
{
    // Oversized scratch buffer; gmx_simd4_align_r() picks the start address
    // that is used for the load below.
    real                               scratch[GMX_SIMD4_WIDTH*2];
    real * const                       aligned = gmx_simd4_align_r(scratch);
    const std::vector<real>::size_type count   = v.size();

    int lane = 0;
    while (lane < GMX_SIMD4_WIDTH)
    {
        // Wrap around so the vector contents repeat until the width is filled.
        aligned[lane] = v[lane % count];
        ++lane;
    }

    return gmx_simd4_load_r(aligned);
}
示例#3
0
/* Create the SIMD4 mask tables stored in a pme_spline_work structure.
 *
 * When PME_SIMD4_SPREAD_GATHER is defined, a structure is allocated with
 * snew_aligned() and, for every offset in [0, 2*GMX_SIMD4_WIDTH-(order-1)),
 * two masks are stored: mask_S0[offset] for the first GMX_SIMD4_WIDTH
 * positions and mask_S1[offset] for the following GMX_SIMD4_WIDTH positions.
 * Positions offset .. offset+order-1 are the ones flagged by the comparison
 * against zero.
 *
 * When PME_SIMD4_SPREAD_GATHER is not defined, order is unused and NULL is
 * returned.
 */
struct pme_spline_work *make_pme_spline_work(int gmx_unused order)
{
    struct pme_spline_work *work;

#ifdef PME_SIMD4_SPREAD_GATHER
    /* Two SIMD4 registers worth of positions are covered by each mask pair */
    const int        span       = 2*GMX_SIMD4_WIDTH;
    const int        numOffsets = span-(order-1);
    real             buffer[GMX_SIMD4_WIDTH*3];
    real            *aligned;
    gmx_simd4_real_t zero_S;
    gmx_simd4_real_t lower_S, upper_S;
    int              offset, slot;

    snew_aligned(work, 1, SIMD4_ALIGNMENT);

    /* The buffer holds one extra SIMD4 width beyond the span that is
     * filled below, leaving slack for the pointer adjustment.
     */
    aligned = gmx_simd4_align_r(buffer);

    zero_S = gmx_simd4_setzero_r();

    /* Only 'order' of the 'span' positions loaded into the two SIMD
     * registers are operated on; build masks that select exactly
     * those positions for each possible starting offset.
     */
    for (offset = 0; offset < numOffsets; offset++)
    {
        for (slot = 0; slot < span; slot++)
        {
            /* Negative marks a used position, positive an unused one */
            if (slot >= offset && slot < offset+order)
            {
                aligned[slot] = -1.0;
            }
            else
            {
                aligned[slot] = 1.0;
            }
        }
        lower_S = gmx_simd4_load_r(aligned);
        upper_S = gmx_simd4_load_r(aligned+GMX_SIMD4_WIDTH);

        /* Comparing with zero turns the sign pattern into SIMD masks */
        work->mask_S0[offset] = gmx_simd4_cmplt_r(lower_S, zero_S);
        work->mask_S1[offset] = gmx_simd4_cmplt_r(upper_S, zero_S);
    }
#else
    work = NULL;
#endif

    return work;
}
示例#4
0
/* Accumulate forces for the atoms listed in spline->ind into atc->f.
 *
 * For every listed atom with a non-zero scaled coefficient, the force
 * contributions fx, fy, fz are computed by either the DO_FSPLINE macro or
 * the code textually included from gromacs/ewald/pme-simd4.h (neither is
 * visible in this function), and are then combined with the grid
 * dimensions and pme->recipbox and added to atc->f[n].
 *
 * pme      supplies pme_order, nkx/nky/nkz, pmegrid_ny/pmegrid_nz, recipbox
 *          and, when PME_SIMD4_SPREAD_GATHER is defined, spline_work.
 * grid     not referenced directly here; presumably read by DO_FSPLINE and
 *          the included SIMD4 code.
 * bClearF  when set, atc->f[n] is zeroed before anything is accumulated
 *          (this happens even for atoms whose coefficient is zero).
 * atc      supplies coefficient[] and idx[], and receives the forces in f[].
 * spline   supplies the atom list (n, ind) and the theta/dtheta arrays.
 * scale    factor applied to each atom's coefficient.
 */
void gather_f_bsplines(struct gmx_pme_t *pme, real *grid,
                       gmx_bool bClearF, pme_atomcomm_t *atc,
                       splinedata_t *spline,
                       real scale)
{
    /* sum forces for local particles */
    /* NOTE: several of the locals below are not referenced in the code
     * visible here; presumably they are used by name inside DO_FSPLINE and
     * pme-simd4.h, so they should not be renamed or removed without
     * checking those.
     */
    int                     nn, n, ithx, ithy, ithz, i0, j0, k0;
    int                     index_x, index_xy;
    int                     nx, ny, nz, pny, pnz;
    int                 *   idxptr;
    real                    tx, ty, dx, dy, coefficient;
    real                    fx, fy, fz, gval;
    real                    fxy1, fz1;
    real                   *thx, *thy, *thz, *dthx, *dthy, *dthz;
    int                     norder;
    real                    rxx, ryx, ryy, rzx, rzy, rzz;
    int                     order;

#ifdef PME_SIMD4_SPREAD_GATHER
    // cppcheck-suppress unreadVariable cppcheck seems not to analyze code from pme-simd4.h
    struct pme_spline_work *work = pme->spline_work;
#ifndef PME_SIMD4_UNALIGNED
    /* Scratch buffers with aligned start pointers, only set up when
     * unaligned SIMD4 access is not available.
     */
    real                    thz_buffer[GMX_SIMD4_WIDTH*3],  *thz_aligned;
    real                    dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;

    thz_aligned  = gmx_simd4_align_r(thz_buffer);
    dthz_aligned = gmx_simd4_align_r(dthz_buffer);
#endif
#endif

    /* Cache the interpolation order and the grid dimensions */
    order = pme->pme_order;
    nx    = pme->nkx;
    ny    = pme->nky;
    nz    = pme->nkz;
    pny   = pme->pmegrid_ny;
    pnz   = pme->pmegrid_nz;

    /* Only these six entries of recipbox are read by this function */
    rxx   = pme->recipbox[XX][XX];
    ryx   = pme->recipbox[YY][XX];
    ryy   = pme->recipbox[YY][YY];
    rzx   = pme->recipbox[ZZ][XX];
    rzy   = pme->recipbox[ZZ][YY];
    rzz   = pme->recipbox[ZZ][ZZ];

    for (nn = 0; nn < spline->n; nn++)
    {
        /* nn indexes the spline data, n is the matching index into atc */
        n           = spline->ind[nn];
        coefficient = scale*atc->coefficient[n];

        if (bClearF)
        {
            atc->f[n][XX] = 0;
            atc->f[n][YY] = 0;
            atc->f[n][ZZ] = 0;
        }
        /* Atoms with a zero coefficient contribute nothing, skip them */
        if (coefficient != 0)
        {
            fx     = 0;
            fy     = 0;
            fz     = 0;
            idxptr = atc->idx[n];
            /* Each entry of the spline list owns 'order' consecutive values */
            norder = nn*order;

            i0   = idxptr[XX];
            j0   = idxptr[YY];
            k0   = idxptr[ZZ];

            /* Pointer arithmetic alert, next six statements:
             * point at this atom's slice of the theta and dtheta arrays.
             */
            thx  = spline->theta[XX] + norder;
            thy  = spline->theta[YY] + norder;
            thz  = spline->theta[ZZ] + norder;
            dthx = spline->dtheta[XX] + norder;
            dthy = spline->dtheta[YY] + norder;
            dthz = spline->dtheta[ZZ] + norder;

            /* Orders 4 and 5 use the SIMD4 code from pme-simd4.h when
             * PME_SIMD4_SPREAD_GATHER is defined; the macros defined just
             * before each include select which variant the header emits.
             * All other cases go through DO_FSPLINE.
             */
            switch (order)
            {
                case 4:
#ifdef PME_SIMD4_SPREAD_GATHER
#ifdef PME_SIMD4_UNALIGNED
#define PME_GATHER_F_SIMD4_ORDER4
#else
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
#include "gromacs/ewald/pme-simd4.h"
#else
                    DO_FSPLINE(4);
#endif
                    break;
                case 5:
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 5
#include "gromacs/ewald/pme-simd4.h"
#else
                    DO_FSPLINE(5);
#endif
                    break;
                default:
                    DO_FSPLINE(order);
                    break;
            }

            /* Scale fx, fy, fz by the grid dimensions, combine them with
             * the recipbox entries and subtract the result, times the
             * coefficient, from the stored force.
             */
            atc->f[n][XX] += -coefficient*( fx*nx*rxx );
            atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
            atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
        }
    }
    /* Since the energy and not forces are interpolated
     * the net force might not be exactly zero.
     * This can be solved by also interpolating F, but
     * that comes at a cost.
     * A better hack is to remove the net force every
     * step, but that must be done at a higher level
     * since this routine doesn't see all atoms if running
     * in parallel. Don't know how important it is?  EL 990726
     */
}