::std::vector<real> simd4Real2Vector(const gmx_simd4_real_t simd4) { real mem[GMX_SIMD4_WIDTH*2]; real * p = gmx_simd4_align_r(mem); gmx_simd4_store_r(p, simd4); std::vector<real> v(p, p+GMX_SIMD4_WIDTH); return v; }
/* Build a SIMD4 real variable from the contents of a std::vector.
 *
 * Element i of the result is v[i % v.size()]: a vector shorter than
 * GMX_SIMD4_WIDTH is repeated cyclically to fill the SIMD width, and only
 * the first GMX_SIMD4_WIDTH elements of a longer vector are used.
 *
 * An empty vector yields a SIMD4 value with every element set to zero.
 * Without this guard the index expression would be a modulo by zero,
 * which is undefined behaviour.
 *
 * v       source values; may have any length, including zero.
 * returns SIMD4 value loaded from the filled scratch buffer.
 */
gmx_simd4_real_t
vector2Simd4Real(const std::vector<real> &v)
{
    /* Scratch area is twice the SIMD4 width; gmx_simd4_align_r() presumably
     * selects a suitably aligned start address inside it (verify against
     * the SIMD headers).
     */
    real                                 mem[GMX_SIMD4_WIDTH*2];
    real *                               p     = gmx_simd4_align_r(mem);
    const std::vector<real>::size_type   count = v.size();

    for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
    {
        // repeat vector contents to fill simd width (zero-fill if v is empty)
        p[i] = (count > 0) ? v[i % count] : static_cast<real>(0);
    }
    return gmx_simd4_load_r(p);
}
/* Create the work data used by the SIMD4 PME spread/gather code.
 *
 * When PME_SIMD4_SPREAD_GATHER is defined, a pme_spline_work structure is
 * allocated with SIMD4_ALIGNMENT and its mask_S0/mask_S1 arrays are filled
 * with one pair of SIMD4 boolean masks per grid offset. Otherwise no work
 * data is needed and NULL is returned.
 *
 * order   number of consecutive grid entries selected by each mask; only
 *         used in the SIMD4 build.
 * returns the allocated work structure, or NULL without SIMD4 spread/gather.
 */
struct pme_spline_work *make_pme_spline_work(int gmx_unused order)
{
    struct pme_spline_work *work;

#ifdef PME_SIMD4_SPREAD_GATHER
    real                    scratch[GMX_SIMD4_WIDTH*3];
    real                   *pattern;
    gmx_simd4_real_t        zero_S;
    gmx_simd4_real_t        lower_S, upper_S;
    int                     offset, entry;
    int                     numOffsets;

    snew_aligned(work, 1, SIMD4_ALIGNMENT);

    pattern = gmx_simd4_align_r(scratch);
    zero_S  = gmx_simd4_setzero_r();

    /* Generate bit masks to mask out the unused grid entries,
     * as we only operate on order of the 8 grid entries that are
     * loaded into 2 SIMD registers.
     */
    numOffsets = 2*GMX_SIMD4_WIDTH-(order-1);
    for (offset = 0; offset < numOffsets; offset++)
    {
        /* Mark the entries in [offset, offset+order) with a negative value
         * and all other entries with a positive value.
         */
        for (entry = 0; entry < 2*GMX_SIMD4_WIDTH; entry++)
        {
            if (entry >= offset && entry < offset+order)
            {
                pattern[entry] = -1.0;
            }
            else
            {
                pattern[entry] = 1.0;
            }
        }

        /* Comparing against zero turns the sign pattern into boolean masks,
         * one for each of the two SIMD4 registers.
         */
        lower_S               = gmx_simd4_load_r(pattern);
        upper_S               = gmx_simd4_load_r(pattern+GMX_SIMD4_WIDTH);
        work->mask_S0[offset] = gmx_simd4_cmplt_r(lower_S, zero_S);
        work->mask_S1[offset] = gmx_simd4_cmplt_r(upper_S, zero_S);
    }
#else
    work = NULL;
#endif

    return work;
}
/* Accumulate per-particle forces for the particles listed in spline->ind.
 *
 * For every entry nn in [0, spline->n) the particle index n = spline->ind[nn]
 * is looked up and its coefficient is scaled by `scale`. If bClearF is set,
 * atc->f[n] is zeroed first. For a non-zero coefficient the partial sums
 * fx, fy, fz are reset, the order-specific kernel (DO_FSPLINE or the code
 * pulled in from pme-simd4.h) is run, and the result is combined with the
 * grid dimensions and recipbox entries and added to atc->f[n].
 *
 * pme      PME data; supplies pme_order, nkx/nky/nkz, pmegrid_ny/pmegrid_nz,
 *          recipbox and (in the SIMD4 build) spline_work.
 * grid     grid data; not referenced in the text visible here, presumably
 *          read by the kernel macros - confirm against their definitions.
 * bClearF  when true, atc->f[n] is set to zero before accumulating.
 * atc      supplies coefficient[] and idx[] per particle and receives f[].
 * spline   supplies n, ind[], and the theta/dtheta arrays per dimension.
 * scale    factor multiplied into each particle's coefficient.
 *
 * NOTE(review): DO_FSPLINE and pme-simd4.h are expanded textually inside
 * this function and are not visible here; they presumably rely on the exact
 * names of the locals declared below, so do not rename them without checking.
 */
void gather_f_bsplines(struct gmx_pme_t *pme, real *grid,
                       gmx_bool bClearF, pme_atomcomm_t *atc,
                       splinedata_t *spline,
                       real scale)
{
    /* sum forces for local particles */
    /* Several of these locals (ithx, ithy, ithz, index_x, index_xy, tx, ty,
     * dx, dy, gval, fxy1, fz1) are not referenced in the visible text;
     * presumably they are used by the kernel macros - verify before removing.
     */
    int     nn, n, ithx, ithy, ithz, i0, j0, k0;
    int     index_x, index_xy;
    int     nx, ny, nz, pny, pnz;
    int *   idxptr;
    real    tx, ty, dx, dy, coefficient;
    real    fx, fy, fz, gval;
    real    fxy1, fz1;
    real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
    int     norder;
    real    rxx, ryx, ryy, rzx, rzy, rzz;
    int     order;

#ifdef PME_SIMD4_SPREAD_GATHER
    // cppcheck-suppress unreadVariable cppcheck seems not to analyze code from pme-simd4.h
    struct pme_spline_work *work = pme->spline_work;
#ifndef PME_SIMD4_UNALIGNED
    /* Scratch buffers of three SIMD4 widths each, with pointers obtained
     * from gmx_simd4_align_r(); only declared when PME_SIMD4_UNALIGNED is
     * not defined.
     */
    real                    thz_buffer[GMX_SIMD4_WIDTH*3],  *thz_aligned;
    real                    dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;

    thz_aligned  = gmx_simd4_align_r(thz_buffer);
    dthz_aligned = gmx_simd4_align_r(dthz_buffer);
#endif
#endif

    /* Cache the loop-invariant PME parameters in locals */
    order = pme->pme_order;
    nx    = pme->nkx;
    ny    = pme->nky;
    nz    = pme->nkz;
    pny   = pme->pmegrid_ny;
    pnz   = pme->pmegrid_nz;

    /* Only these six recipbox entries are used in the force expressions below */
    rxx   = pme->recipbox[XX][XX];
    ryx   = pme->recipbox[YY][XX];
    ryy   = pme->recipbox[YY][YY];
    rzx   = pme->recipbox[ZZ][XX];
    rzy   = pme->recipbox[ZZ][YY];
    rzz   = pme->recipbox[ZZ][ZZ];

    for (nn = 0; nn < spline->n; nn++)
    {
        /* nn indexes the spline data, n indexes the atc arrays */
        n           = spline->ind[nn];
        coefficient = scale*atc->coefficient[n];

        if (bClearF)
        {
            atc->f[n][XX] = 0;
            atc->f[n][YY] = 0;
            atc->f[n][ZZ] = 0;
        }
        /* Particles with a zero coefficient get no contribution, so skip the kernel */
        if (coefficient != 0)
        {
            fx     = 0;
            fy     = 0;
            fz     = 0;
            idxptr = atc->idx[n];
            /* Offset of this particle's coefficients in theta/dtheta: order values per entry */
            norder = nn*order;

            i0   = idxptr[XX];
            j0   = idxptr[YY];
            k0   = idxptr[ZZ];

            /* Pointer arithmetic alert, next six statements */
            thx  = spline->theta[XX] + norder;
            thy  = spline->theta[YY] + norder;
            thz  = spline->theta[ZZ] + norder;
            dthx = spline->dtheta[XX] + norder;
            dthy = spline->dtheta[YY] + norder;
            dthz = spline->dtheta[ZZ] + norder;

            /* Orders 4 and 5 get dedicated kernels (SIMD4 code from
             * pme-simd4.h when available, otherwise DO_FSPLINE with a
             * literal order); all other orders use DO_FSPLINE(order).
             * The #define lines select which section of pme-simd4.h is
             * included; presumably the header undefines them again after
             * use, since PME_ORDER is defined twice below - verify.
             */
            switch (order)
            {
                case 4:
#ifdef PME_SIMD4_SPREAD_GATHER
#ifdef PME_SIMD4_UNALIGNED
#define PME_GATHER_F_SIMD4_ORDER4
#else
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
#include "gromacs/ewald/pme-simd4.h"
#else
                    DO_FSPLINE(4);
#endif
                    break;
                case 5:
                    /* NOTE(review): this case always selects the ALIGNED
                     * variant, while thz_aligned/dthz_aligned above are only
                     * declared when PME_SIMD4_UNALIGNED is not defined.
                     * Confirm against pme-simd4.h that this combination is
                     * valid when PME_SIMD4_UNALIGNED is defined.
                     */
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 5
#include "gromacs/ewald/pme-simd4.h"
#else
                    DO_FSPLINE(5);
#endif
                    break;
                default:
                    DO_FSPLINE(order);
                    break;
            }

            /* Combine the partial sums with the grid dimensions and the
             * recipbox entries; YY and ZZ also pick up the off-diagonal
             * terms ryx, rzx and rzy.
             */
            atc->f[n][XX] += -coefficient*( fx*nx*rxx );
            atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
            atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
        }
    }
    /* Since the energy and not forces are interpolated
     * the net force might not be exactly zero.
     * This can be solved by also interpolating F, but
     * that comes at a cost.
     * A better hack is to remove the net force every
     * step, but that must be done at a higher level
     * since this routine doesn't see all atoms if running
     * in parallel. Don't know how important it is?  EL 990726
     */
}