/* Return GMX_SUCCESS (0) if SSE2 support is present, or * general error GMX_EFAILURE. */ int nb_kernel_x86_64_sse2_test(FILE * log) { /* * This should NOT be called from threads, * but just in case you still try to do it... */ #ifdef GMX_THREADS gmx_thread_mutex_lock(&nb_kernel_x86_64_sse2_test_mutex); #endif if(log) fprintf(log,"Testing x86_64 SSE2 support..."); nb_kernel_x86_64_sse2_present = TRUE; signal(SIGILL,nb_kernel_x86_64_sse2_sigill_handler); /* return to this point after executing the signal handler * if we catch a SIGILL */ setjmp(nb_kernel_x86_64_sse2_testprog); if(nb_kernel_x86_64_sse2_present) nb_kernel_x86_64_sse2_test_asm(); /* If SSE2 worked, then success is still 1. * If we got SIGILL, it was set to 0 in sigill_handler(). */ if(log) fprintf(log," %spresent.\n", nb_kernel_x86_64_sse2_present ? "":"not "); #ifdef GMX_THREADS gmx_thread_mutex_unlock(&nb_kernel_x86_64_sse2_test_mutex); #endif return ((nb_kernel_x86_64_sse2_present) ? 0 : -1); }
void nb_kernel310_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float vfacel,tsc,fs,fs2,nul; vector float dx,dy,dz; vector float Vvdwtot,vctot,qq,iq,c6,c12,VVc,FFc; vector float fix,fiy,fiz; vector float tmp1,tmp2,tmp3,tmp4; vector float rinv,r,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); tsc=load_float_and_splat(p_tabscale); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); Vvdwtot = nul; vctot = nul; fix = nul; fiy = nul; fiz = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); ntiA = 2*ntype*type[ii]; iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; 
transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; qq = vec_madd(load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd),iq,nul); load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_2_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_2_elements_in_vector(&rinv); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); 
rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul); load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = vec_madd(fs,rinv,nul); fs = vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_3_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_3_elements_in_vector(&rinv); rinvsq = vec_madd(rinv,rinv,nul); r = vec_madd(rinv,rsq,nul); rinvsix = vec_madd(rinvsq,rinvsq,nul); rinvsix = vec_madd(rinvsix,rinvsq,nul); tja = ntiA+2*type[jnra]; qq = vec_madd(load_1_float(charge+jnra),iq,nul); load_1_pair(vdwparam+tja,&c6,&c12); do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc); fs2 = vec_madd(qq,FFc,nul); /* fijC */ vctot = vec_madd(qq,VVc,vctot); Vvdw6 = vec_madd(c6,rinvsix,nul); Vvdw12 = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul), nul); fs = vec_madd(vec_twelve(),Vvdw12,nul); fs = vec_nmsub(vec_six(),Vvdw6,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_add(Vvdwtot,Vvdw12); fs = 
vec_nmsub(fs2,tsc,fs); fs = vec_madd(fs,rinv,nul); Vvdwtot = vec_sub(Vvdwtot,Vvdw6); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_1(dx,dy,dz,&tmp1); add_xyz_to_mem(faction+j3a,tmp1); } /* update outer data */ transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4); tmp1 = vec_add(tmp1,tmp3); tmp2 = vec_add(tmp2,tmp4); tmp1 = vec_add(tmp1,tmp2); add_xyz_to_mem(faction+ii3,tmp1); add_xyz_to_mem(fshift+is3,tmp1); add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
/*
 * Gromacs nonbonded kernel nb_kernel430
 * Coulomb interaction:     Generalized-Born
 * VdW interaction:         Tabulated
 * water optimization:      No
 * Calculate forces:        yes
 *
 * Inputs (as used by the code below):
 *   p_nri            number of neighbor lists
 *   iinr[n]          i atom of list n
 *   jindex[n..n+1]   range of entries in jjnr belonging to list n
 *   jjnr[k]          j atom of pair k
 *   shift[n]         index of the shift vector (xyz triple in shiftvec)
 *   gid[n]           energy-group index used for Vc, Vvdw and gpol
 *   pos, charge, type, invsqrta   per-atom data
 *   p_facel          prefactor applied to the i charge
 *   vdwparam         c6/c12 pairs, indexed by 2*ntype*type[i]+2*type[j]
 *   p_tabscale/VFtab VdW table: 8 entries per point
 *                    (4 dispersion followed by 4 repulsion)
 *   p_gbtabscale/GBtab  GB table: 4 entries per point
 *   work             cast to gmx_gbdata_t; supplies gpol and
 *                    gb_epsilon_solvent
 *
 * Outputs:
 *   faction, fshift  forces are accumulated in place
 *   Vc, Vvdw, gpol   energies are accumulated per energy group
 *   dvda             accumulated for both i and j atoms
 *   outeriter/inneriter  number of lists / pairs handled by this call
 *
 * p_krf, p_crf, enerd1..enerd4, start, end, homenr and nbsum are not used
 * by this kernel. With GMX_THREADS defined, work is handed out in chunks
 * through the shared counter *count, protected by the mutex mtx.
 */
void nb_kernel430(
                    int *           p_nri,
                    int *           iinr,
                    int *           jindex,
                    int *           jjnr,
                    int *           shift,
                    real *          shiftvec,
                    real *          fshift,
                    int *           gid,
                    real *          pos,
                    real *          faction,
                    real *          charge,
                    real *          p_facel,
                    real *          p_krf,
                    real *          p_crf,
                    real *          Vc,
                    int *           type,
                    int *           p_ntype,
                    real *          vdwparam,
                    real *          Vvdw,
                    real *          p_tabscale,
                    real *          VFtab,real * enerd1,real * enerd2,real * enerd3,real * enerd4,int * start,int * end,int * homenr,int * nbsum,
                    real *          invsqrta,
                    real *          dvda,
                    real *          p_gbtabscale,
                    real *          GBtab,
                    int *           p_nthreads,
                    int *           count,
                    void *          mtx,
                    int *           outeriter,
                    int *           inneriter,
                    real *          work)
{
    int           nri,ntype;
#ifdef GMX_THREADS
    int           nthreads;
#endif
    real          facel,tabscale,gbtabscale;
    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
    int           nn0,nn1,nouter,ninner;
    real          shX,shY,shZ;
    real          fscal,tx,ty,tz;
    real          iq;
    real          qq,vcoul,vctot;
    int           nti;
    int           tj;
    real          Vvdw6,Vvdwtot;
    real          Vvdw12;
    real          r,rt,eps,eps2;
    int           n0,nnn;
    real          Y,F,Geps,Heps2,Fp,VV;
    real          FF;
    real          fijC;
    real          fijD,fijR;
    real          isai,isaj,isaprod,gbscale,vgb,vgbtot;
    real          dvdasum,dvdatmp,dvdaj;
    real          ix1,iy1,iz1,fix1,fiy1,fiz1;
    real          jx1,jy1,jz1;
    real          dx11,dy11,dz11,rsq11,rinv11;
    real          c6,c12;
    gmx_gbdata_t *gbdata;
    real *        gpol;
    real          scale_gb;

    gbdata           = (gmx_gbdata_t *)work;
    gpol             = gbdata->gpol;

    nri              = *p_nri;
    ntype            = *p_ntype;
#ifdef GMX_THREADS
    nthreads         = *p_nthreads;
#endif
    facel            = *p_facel;
    scale_gb         = 1.0 - (1.0/gbdata->gb_epsilon_solvent);
    tabscale         = *p_tabscale;
    gbtabscale       = *p_gbtabscale;

    /* Reset outer and inner iteration counters */
    nouter           = 0;
    ninner           = 0;

    /* Loop over thread workunits */
    do
    {
#ifdef GMX_THREADS
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        nn0              = *count;

        /* Take successively smaller chunks (at least 10 lists) */
        nn1              = nn0+(nri-nn0)/(2*nthreads)+10;
        *count           = nn1;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if(nn1>nri) nn1=nri;
        /* The shared counter may already have run past nri; clamp the
         * start too so that nn1-nn0 never becomes negative below.
         */
        if(nn0>nri) nn0=nri;
#else
        nn0              = 0;
        nn1              = nri;
#endif

        /* Start outer loop over neighborlists */
        for(n=nn0; (n<nn1); n++)
        {
            /* Load shift vector for this list */
            is3              = 3*shift[n];
            shX              = shiftvec[is3];
            shY              = shiftvec[is3+1];
            shZ              = shiftvec[is3+2];

            /* Load limits for loop over neighbors */
            nj0              = jindex[n];
            nj1              = jindex[n+1];

            /* Get outer coordinate index */
            ii               = iinr[n];
            ii3              = 3*ii;

            /* Load i atom data, add shift vector */
            ix1              = shX + pos[ii3+0];
            iy1              = shY + pos[ii3+1];
            iz1              = shZ + pos[ii3+2];

            /* Load parameters for i atom */
            iq               = facel*charge[ii];
            isai             = invsqrta[ii];
            nti              = 2*ntype*type[ii];

            /* Zero the potential energy for this list */
            vctot            = 0;
            Vvdwtot          = 0;
            dvdasum          = 0;
            vgbtot           = 0;

            /* Clear i atom forces */
            fix1             = 0;
            fiy1             = 0;
            fiz1             = 0;

            for(k=nj0; (k<nj1); k++)
            {
                /* Get j neighbor index, and coordinate index */
                jnr              = jjnr[k];
                j3               = 3*jnr;

                /* load j atom coordinates */
                jx1              = pos[j3+0];
                jy1              = pos[j3+1];
                jz1              = pos[j3+2];

                /* Calculate distance */
                dx11             = ix1 - jx1;
                dy11             = iy1 - jy1;
                dz11             = iz1 - jz1;
                rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;

                /* Calculate 1/r */
                rinv11           = invsqrt(rsq11);

                /* Load parameters for j atom */
                isaj             = invsqrta[jnr];
                isaprod          = isai*isaj;
                qq               = iq*charge[jnr];
                vcoul            = qq*rinv11;
                fscal            = vcoul*rinv11;
                /* From here on qq is the GB prefactor, not the charge product */
                qq               = isaprod*(-qq)*scale_gb;
                gbscale          = isaprod*gbtabscale;
                tj               = nti+2*type[jnr];
                c6               = vdwparam[tj];
                c12              = vdwparam[tj+1];

                /* Tabulated Generalized-Born interaction */
                dvdaj            = dvda[jnr];
                r                = rsq11*rinv11;

                /* Calculate table index (GB table, 4 entries per point) */
                rt               = r*gbscale;
                n0               = rt;
                eps              = rt-n0;
                eps2             = eps*eps;
                nnn              = 4*n0;
                Y                = GBtab[nnn];
                F                = GBtab[nnn+1];
                Geps             = eps*GBtab[nnn+2];
                Heps2            = eps2*GBtab[nnn+3];
                Fp               = F+Geps+Heps2;
                VV               = Y+eps*Fp;
                FF               = Fp+Geps+2.0*Heps2;
                vgb              = qq*VV;
                fijC             = qq*FF*gbscale;
                dvdatmp          = -0.5*(vgb+fijC*r);
                dvdasum         += dvdatmp;
                dvda[jnr]        = dvdaj+dvdatmp*isaj*isaj;
                vctot           += vcoul;
                vgbtot          += vgb;

                /* Calculate table index (VdW table, 8 entries per point);
                 * r is unchanged from the GB section above.
                 */
                rt               = r*tabscale;
                n0               = rt;
                eps              = rt-n0;
                eps2             = eps*eps;
                nnn              = 8*n0;

                /* Tabulated VdW interaction - dispersion */
                Y                = VFtab[nnn];
                F                = VFtab[nnn+1];
                Geps             = eps*VFtab[nnn+2];
                Heps2            = eps2*VFtab[nnn+3];
                Fp               = F+Geps+Heps2;
                VV               = Y+eps*Fp;
                FF               = Fp+Geps+2.0*Heps2;
                Vvdw6            = c6*VV;
                fijD             = c6*FF;

                /* Tabulated VdW interaction - repulsion */
                nnn              = nnn+4;
                Y                = VFtab[nnn];
                F                = VFtab[nnn+1];
                Geps             = eps*VFtab[nnn+2];
                Heps2            = eps2*VFtab[nnn+3];
                Fp               = F+Geps+Heps2;
                VV               = Y+eps*Fp;
                FF               = Fp+Geps+2.0*Heps2;
                Vvdw12           = c12*VV;
                fijR             = c12*FF;
                Vvdwtot          = Vvdwtot+ Vvdw6 + Vvdw12;
                fscal            = -((fijD+fijR)*tabscale+fijC-fscal)*rinv11;

                /* Calculate temporary vectorial force */
                tx               = fscal*dx11;
                ty               = fscal*dy11;
                tz               = fscal*dz11;

                /* Increment i atom force */
                fix1            += tx;
                fiy1            += ty;
                fiz1            += tz;

                /* Decrement j atom force */
                faction[j3+0]   -= tx;
                faction[j3+1]   -= ty;
                faction[j3+2]   -= tz;

                /* Inner loop uses 80 flops/iteration */
            }

            /* Add i forces to mem and shifted force list */
            faction[ii3+0]  += fix1;
            faction[ii3+1]  += fiy1;
            faction[ii3+2]  += fiz1;
            fshift[is3]     += fix1;
            fshift[is3+1]   += fiy1;
            fshift[is3+2]   += fiz1;

            /* Add potential energies to the group for this list */
            ggid             = gid[n];
            Vc[ggid]        += vctot;
            Vvdw[ggid]      += Vvdwtot;
            gpol[ggid]      += vgbtot;
            dvda[ii]         = dvda[ii] + dvdasum*isai*isai;

            /* Increment number of inner iterations */
            ninner           = ninner + nj1 - nj0;

            /* Outer loop uses 13 flops/iteration */
        }

        /* Increment number of outer iterations */
        nouter           = nouter + nn1 - nn0;
    }
    while (nn1<nri);

    /* Write outer/inner iteration count to pointers */
    *outeriter       = nouter;
    *inneriter       = ninner;
}
/*
 * Gromacs nonbonded kernel nb_kernel222nf
 * Coulomb interaction:     Reaction field
 * VdW interaction:         Buckingham
 * water optimization:      pairs of SPC/TIP3P interactions
 * Calculate forces:        no
 *
 * Energy-only kernel: for every pair of three-site molecules it adds the
 * nine site-site reaction-field Coulomb energies and the Buckingham energy
 * between the first sites (index 1 on both molecules) to Vc[gid[n]] and
 * Vvdw[gid[n]].
 *
 * The charges and Buckingham parameters are read once, from the molecule
 * starting at iinr[0], and reused for every pair. NOTE(review): iinr[0] is
 * read unconditionally, so the caller presumably guarantees *p_nri > 0 -
 * confirm.
 *
 * fshift, faction, p_tabscale, VFtab, enerd1..enerd4, start, end, homenr,
 * nbsum, invsqrta, dvda, p_gbtabscale, GBtab and work are not used.
 * With GMX_THREADS defined, work is handed out in chunks through the shared
 * counter *count, protected by the mutex mtx.
 * *outeriter / *inneriter receive the number of lists / pairs handled.
 */
void nb_kernel222nf(
                    int *           p_nri,
                    int *           iinr,
                    int *           jindex,
                    int *           jjnr,
                    int *           shift,
                    real *          shiftvec,
                    real *          fshift,
                    int *           gid,
                    real *          pos,
                    real *          faction,
                    real *          charge,
                    real *          p_facel,
                    real *          p_krf,
                    real *          p_crf,
                    real *          Vc,
                    int *           type,
                    int *           p_ntype,
                    real *          vdwparam,
                    real *          Vvdw,
                    real *          p_tabscale,
                    real *          VFtab,real * enerd1,real * enerd2,real * enerd3,real * enerd4,int * start,int * end,int * homenr,int * nbsum,
                    real *          invsqrta,
                    real *          dvda,
                    real *          p_gbtabscale,
                    real *          GBtab,
                    int *           p_nthreads,
                    int *           count,
                    void *          mtx,
                    int *           outeriter,
                    int *           inneriter,
                    real *          work)
{
    int           nri,ntype;
#ifdef GMX_THREADS
    int           nthreads;
#endif
    real          facel,krf,crf;
    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
    int           nn0,nn1,nouter,ninner;
    real          shX,shY,shZ;
    real          rinvsq;
    real          qq,vcoul,vctot;
    int           tj;
    real          rinvsix;
    real          Vvdw6,Vvdwtot;
    real          krsq;
    real          Vvdwexp,br;
    real          ix1,iy1,iz1;
    real          ix2,iy2,iz2;
    real          ix3,iy3,iz3;
    real          jx1,jy1,jz1;
    real          jx2,jy2,jz2;
    real          jx3,jy3,jz3;
    real          dx11,dy11,dz11,rsq11,rinv11;
    real          dx12,dy12,dz12,rsq12,rinv12;
    real          dx13,dy13,dz13,rsq13,rinv13;
    real          dx21,dy21,dz21,rsq21,rinv21;
    real          dx22,dy22,dz22,rsq22,rinv22;
    real          dx23,dy23,dz23,rsq23,rinv23;
    real          dx31,dy31,dz31,rsq31,rinv31;
    real          dx32,dy32,dz32,rsq32,rinv32;
    real          dx33,dy33,dz33,rsq33,rinv33;
    real          qO,qH,qqOO,qqOH,qqHH;
    real          c6,cexp1,cexp2;

    nri              = *p_nri;
    ntype            = *p_ntype;
#ifdef GMX_THREADS
    nthreads         = *p_nthreads;
#endif
    facel            = *p_facel;
    krf              = *p_krf;
    crf              = *p_crf;

    /* Initialize water data */
    ii               = iinr[0];
    qO               = charge[ii];
    qH               = charge[ii+1];
    qqOO             = facel*qO*qO;
    qqOH             = facel*qO*qH;
    qqHH             = facel*qH*qH;
    /* Buckingham parameters for the pair (type[ii],type[ii]):
     * three values per type pair.
     */
    tj               = 3*(ntype+1)*type[ii];
    c6               = vdwparam[tj];
    cexp1            = vdwparam[tj+1];
    cexp2            = vdwparam[tj+2];

    /* Reset outer and inner iteration counters */
    nouter           = 0;
    ninner           = 0;

    /* Loop over thread workunits */
    do
    {
#ifdef GMX_THREADS
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        nn0              = *count;

        /* Take successively smaller chunks (at least 10 lists) */
        nn1              = nn0+(nri-nn0)/(2*nthreads)+10;
        *count           = nn1;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if(nn1>nri) nn1=nri;
        /* The shared counter may already have run past nri; clamp the
         * start too so that nn1-nn0 never becomes negative below.
         */
        if(nn0>nri) nn0=nri;
#else
        nn0              = 0;
        nn1              = nri;
#endif

        /* Start outer loop over neighborlists */
        for(n=nn0; (n<nn1); n++)
        {
            /* Load shift vector for this list */
            is3              = 3*shift[n];
            shX              = shiftvec[is3];
            shY              = shiftvec[is3+1];
            shZ              = shiftvec[is3+2];

            /* Load limits for loop over neighbors */
            nj0              = jindex[n];
            nj1              = jindex[n+1];

            /* Get outer coordinate index */
            ii               = iinr[n];
            ii3              = 3*ii;

            /* Load i atom data, add shift vector */
            ix1              = shX + pos[ii3+0];
            iy1              = shY + pos[ii3+1];
            iz1              = shZ + pos[ii3+2];
            ix2              = shX + pos[ii3+3];
            iy2              = shY + pos[ii3+4];
            iz2              = shZ + pos[ii3+5];
            ix3              = shX + pos[ii3+6];
            iy3              = shY + pos[ii3+7];
            iz3              = shZ + pos[ii3+8];

            /* Zero the potential energy for this list */
            vctot            = 0;
            Vvdwtot          = 0;

            for(k=nj0; (k<nj1); k++)
            {
                /* Get j neighbor index, and coordinate index */
                jnr              = jjnr[k];
                j3               = 3*jnr;

                /* load j atom coordinates */
                jx1              = pos[j3+0];
                jy1              = pos[j3+1];
                jz1              = pos[j3+2];
                jx2              = pos[j3+3];
                jy2              = pos[j3+4];
                jz2              = pos[j3+5];
                jx3              = pos[j3+6];
                jy3              = pos[j3+7];
                jz3              = pos[j3+8];

                /* Calculate distance */
                dx11             = ix1 - jx1;
                dy11             = iy1 - jy1;
                dz11             = iz1 - jz1;
                rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
                dx12             = ix1 - jx2;
                dy12             = iy1 - jy2;
                dz12             = iz1 - jz2;
                rsq12            = dx12*dx12+dy12*dy12+dz12*dz12;
                dx13             = ix1 - jx3;
                dy13             = iy1 - jy3;
                dz13             = iz1 - jz3;
                rsq13            = dx13*dx13+dy13*dy13+dz13*dz13;
                dx21             = ix2 - jx1;
                dy21             = iy2 - jy1;
                dz21             = iz2 - jz1;
                rsq21            = dx21*dx21+dy21*dy21+dz21*dz21;
                dx22             = ix2 - jx2;
                dy22             = iy2 - jy2;
                dz22             = iz2 - jz2;
                rsq22            = dx22*dx22+dy22*dy22+dz22*dz22;
                dx23             = ix2 - jx3;
                dy23             = iy2 - jy3;
                dz23             = iz2 - jz3;
                rsq23            = dx23*dx23+dy23*dy23+dz23*dz23;
                dx31             = ix3 - jx1;
                dy31             = iy3 - jy1;
                dz31             = iz3 - jz1;
                rsq31            = dx31*dx31+dy31*dy31+dz31*dz31;
                dx32             = ix3 - jx2;
                dy32             = iy3 - jy2;
                dz32             = iz3 - jz2;
                rsq32            = dx32*dx32+dy32*dy32+dz32*dz32;
                dx33             = ix3 - jx3;
                dy33             = iy3 - jy3;
                dz33             = iz3 - jz3;
                rsq33            = dx33*dx33+dy33*dy33+dz33*dz33;

                /* Calculate 1/r */
                rinv11           = invsqrt(rsq11);
                rinv12           = invsqrt(rsq12);
                rinv13           = invsqrt(rsq13);
                rinv21           = invsqrt(rsq21);
                rinv22           = invsqrt(rsq22);
                rinv23           = invsqrt(rsq23);
                rinv31           = invsqrt(rsq31);
                rinv32           = invsqrt(rsq32);
                rinv33           = invsqrt(rsq33);

                /* Pair 1-1 (O-O): Coulomb and Buckingham */
                qq               = qqOO;
                rinvsq           = rinv11*rinv11;

                /* Coulomb reaction-field interaction */
                krsq             = krf*rsq11;
                vcoul            = qq*(rinv11+krsq-crf);
                vctot            = vctot+vcoul;

                /* Buckingham interaction */
                rinvsix          = rinvsq*rinvsq*rinvsq;
                Vvdw6            = c6*rinvsix;
                br               = cexp2*rsq11*rinv11;
                Vvdwexp          = cexp1*exp(-br);
                Vvdwtot          = Vvdwtot+Vvdwexp-Vvdw6;

                /* Pair 1-2 (O-H): Coulomb reaction-field interaction */
                qq               = qqOH;
                krsq             = krf*rsq12;
                vcoul            = qq*(rinv12+krsq-crf);
                vctot            = vctot+vcoul;

                /* Pair 1-3 (O-H): Coulomb reaction-field interaction */
                qq               = qqOH;
                krsq             = krf*rsq13;
                vcoul            = qq*(rinv13+krsq-crf);
                vctot            = vctot+vcoul;

                /* Pair 2-1 (H-O): Coulomb reaction-field interaction */
                qq               = qqOH;
                krsq             = krf*rsq21;
                vcoul            = qq*(rinv21+krsq-crf);
                vctot            = vctot+vcoul;

                /* Pair 2-2 (H-H): Coulomb reaction-field interaction */
                qq               = qqHH;
                krsq             = krf*rsq22;
                vcoul            = qq*(rinv22+krsq-crf);
                vctot            = vctot+vcoul;

                /* Pair 2-3 (H-H): Coulomb reaction-field interaction */
                qq               = qqHH;
                krsq             = krf*rsq23;
                vcoul            = qq*(rinv23+krsq-crf);
                vctot            = vctot+vcoul;

                /* Pair 3-1 (H-O): Coulomb reaction-field interaction */
                qq               = qqOH;
                krsq             = krf*rsq31;
                vcoul            = qq*(rinv31+krsq-crf);
                vctot            = vctot+vcoul;

                /* Pair 3-2 (H-H): Coulomb reaction-field interaction */
                qq               = qqHH;
                krsq             = krf*rsq32;
                vcoul            = qq*(rinv32+krsq-crf);
                vctot            = vctot+vcoul;

                /* Pair 3-3 (H-H): Coulomb reaction-field interaction */
                qq               = qqHH;
                krsq             = krf*rsq33;
                vcoul            = qq*(rinv33+krsq-crf);
                vctot            = vctot+vcoul;

                /* Inner loop uses 197 flops/iteration */
            }

            /* Add potential energies to the group for this list
             * (no forces are computed in this kernel)
             */
            ggid             = gid[n];
            Vc[ggid]         = Vc[ggid] + vctot;
            Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;

            /* Increment number of inner iterations */
            ninner           = ninner + nj1 - nj0;

            /* Outer loop uses 11 flops/iteration */
        }

        /* Increment number of outer iterations */
        nouter           = nouter + nn1 - nn0;
    }
    while (nn1<nri);

    /* Write outer/inner iteration count to pointers */
    *outeriter       = nouter;
    *inneriter       = ninner;
}
void nb_kernel133nf_ppc_altivec(int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z,iMx,iMy,iMz; vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z,dMx,dMy,dMz; vector float Vvdwtot,c6,c12,VVd,VVr,tsc,r; vector float vfacel,nul; vector float vctot,qqM,qqH,iqM,iqH,jq; vector float rinvO,rinvH1,rinvH2,rinvM,rsqO,rsqH1,rsqH2,rsqM; int n,k,ii,is3,ii3,ntiA,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; int tja,tjb,tjc,tjd; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); tsc=load_float_and_splat(p_tabscale); vfacel=load_float_and_splat(p_facel); ii = iinr[0]; iqH = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul); iqM = vec_madd(load_float_and_splat(charge+ii+3),vfacel,nul); ntiA = 2*ntype*type[ii]; #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; ii = iinr[n]; ii3 = 3*ii; load_1_4atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz, &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z, &iMx,&iMy,&iMz); vctot = nul; Vvdwtot = nul; nj0 = jindex[n]; nj1 = jindex[n+1]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = 
jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); r = vec_madd(rsqO,rinvO,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; tjd = ntiA+2*type[jnrd]; /* load 4 j charges and multiply by iq */ jq=load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd); load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12); do_vonly_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } if(k<(nj1-2)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c),nul,&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = 
vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_element_in_vector(&rinvO); zero_highest_element_in_vector(&rsqO); zero_highest_element_in_3_vectors(&rinvH1,&rinvH2,&rinvM); r = vec_madd(rsqO,rinvO,nul); jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; tjc = ntiA+2*type[jnrc]; /* load 3 j charges and multiply by iq */ load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12); do_vonly_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } else if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = 
vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_2_elements_in_vector(&rinvO); zero_highest_2_elements_in_vector(&rsqO); zero_highest_2_elements_in_3_vectors(&rinvM,&rinvH1,&rinvH2); r = vec_madd(rsqO,rinvO,nul); tja = ntiA+2*type[jnra]; tjb = ntiA+2*type[jnrb]; /* load 2 j charges and multiply by iq */ jq=load_2_float(charge+jnra,charge+jnrb); load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12); do_vonly_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } else if(k<nj1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dMx,&dMy,&dMz); dOx = vec_sub(iOx,dMx); dOy = vec_sub(iOy,dMy); dOz = vec_sub(iOz,dMz); dH1x = vec_sub(iH1x,dMx); dH1y = vec_sub(iH1y,dMy); dH1z = vec_sub(iH1z,dMz); dH2x = vec_sub(iH2x,dMx); dH2y = vec_sub(iH2y,dMy); dH2z = vec_sub(iH2z,dMz); dMx = vec_sub(iMx,dMx); dMy = vec_sub(iMy,dMy); dMz = vec_sub(iMz,dMz); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqM = vec_madd(dMx,dMx,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqM = vec_madd(dMy,dMy,rsqM); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); rsqM = vec_madd(dMz,dMz,rsqM); rinvO = do_invsqrt(rsqO); do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2); zero_highest_3_elements_in_vector(&rinvO); zero_highest_3_elements_in_vector(&rsqO); zero_highest_3_elements_in_3_vectors(&rinvH1,&rinvH2,&rinvM); r = vec_madd(rsqO,rinvO,nul); jq=load_1_float(charge+jnra); tja = ntiA+2*type[jnra]; /* load 1 j charge and multiply 
by iq */ load_1_pair(vdwparam+tja,&c6,&c12); do_vonly_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr); Vvdwtot = vec_madd(c6,VVd,Vvdwtot); Vvdwtot = vec_madd(c12,VVr,Vvdwtot); qqH = vec_madd(iqH,jq,nul); qqM = vec_madd(iqM,jq,nul); vctot = vec_madd(qqM,rinvM,vctot); vctot = vec_madd(qqH,rinvH1,vctot); vctot = vec_madd(qqH,rinvH2,vctot); } /* update outer data */ add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(Vvdw+gid[n],Vvdwtot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
/*
 * Gromacs nonbonded kernel nb_kernel321
 * Coulomb interaction:     Tabulated
 * VdW interaction:         Buckingham
 * water optimization:      SPC/TIP3P - other atoms
 * Calculate forces:        yes
 *
 * For every neighbour list the i particle is a three-site molecule whose
 * coordinates are read from pos[3*ii .. 3*ii+8]; every j entry is a single
 * atom.  All three i sites interact with the j atom through the tabulated
 * Coulomb term; only the first i site additionally gets the Buckingham term.
 * The two site charges are taken once from the first list entry
 * (charge[iinr[0]] and charge[iinr[0]+1], both scaled by *p_facel) and are
 * reused for every list.
 *
 * Forces are accumulated into faction and fshift, energies into
 * Vc[gid[n]] and Vvdw[gid[n]], and the iteration counts are written to
 * *outeriter and *inneriter.  count and mtx are only touched when
 * GMX_THREADS is defined.  The enerd1..enerd4, start, end, homenr, nbsum,
 * invsqrta, dvda, p_gbtabscale, GBtab and work arguments are not referenced.
 */
void nb_kernel321(
                    int *           p_nri,
                    int *           iinr,
                    int *           jindex,
                    int *           jjnr,
                    int *           shift,
                    real *          shiftvec,
                    real *          fshift,
                    int *           gid,
                    real *          pos,
                    real *          faction,
                    real *          charge,
                    real *          p_facel,
                    real *          p_krf,
                    real *          p_crf,
                    real *          Vc,
                    int *           type,
                    int *           p_ntype,
                    real *          vdwparam,
                    real *          Vvdw,
                    real *          p_tabscale,
                    real *          VFtab,real * enerd1,real * enerd2,real * enerd3,real * enerd4,int * start,int * end,int * homenr,int * nbsum,
                    real *          invsqrta,
                    real *          dvda,
                    real *          p_gbtabscale,
                    real *          GBtab,
                    int *           p_nthreads,
                    int *           count,
                    void *          mtx,
                    int *           outeriter,
                    int *           inneriter,
                    real *          work)
{
    int   nri, ntype, nthreads;
    real  facel, krf, crf, tabscale;
    int   list, chunkLo, chunkHi, outerDone, innerDone;
    int   site, axis;
    int   iAtom, iBase, shBase, jFirst, jLast, slot, jAtom, jBase, group;
    int   typeRow, parIdx, bin, tabIdx;
    real  chargeO, chargeH, chargeJ, qq;
    real  shiftXYZ[3], iPos[3][3], iForce[3][3], jPos[3], jForce[3], delta[3];
    real  distSq, distInv, distInvSq, dist, scaled, frac, frac2;
    real  Y, F, Geps, Heps2, Fp, VV, FF;
    real  eCoul, eCoulSum, fCoul;
    real  c6, bhamA, bhamB, distInv6, eDisp, eRep, eVdwSum, bR;
    real  fscal, push;

    /* Scalars arrive by pointer; krf and crf are loaded but unused here */
    nri      = *p_nri;
    ntype    = *p_ntype;
    nthreads = *p_nthreads;
    facel    = *p_facel;
    krf      = *p_krf;
    crf      = *p_crf;
    tabscale = *p_tabscale;

    /* Water parameters come from the first list entry only */
    iAtom    = iinr[0];
    chargeO  = facel*charge[iAtom];
    chargeH  = facel*charge[iAtom+1];
    typeRow  = 3*ntype*type[iAtom];

    outerDone = 0;
    innerDone = 0;

    /* One pass per work chunk (a single pass without GMX_THREADS) */
    do
    {
#ifdef GMX_THREADS
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        chunkLo = *count;
        /* Take successively smaller chunks (at least 10 lists) */
        chunkHi = chunkLo+(nri-chunkLo)/(2*nthreads)+10;
        *count  = chunkHi;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if(chunkHi>nri) chunkHi=nri;
#else
        chunkLo = 0;
        chunkHi = nri;
#endif
        for(list=chunkLo; (list<chunkHi); list++)
        {
            /* Shift vector, neighbour range and i molecule of this list */
            shBase = 3*shift[list];
            for(axis=0; axis<3; axis++)
            {
                shiftXYZ[axis] = shiftvec[shBase+axis];
            }
            jFirst = jindex[list];
            jLast  = jindex[list+1];
            iAtom  = iinr[list];
            iBase  = 3*iAtom;

            /* Shifted i coordinates; i force accumulators start at zero */
            for(site=0; site<3; site++)
            {
                for(axis=0; axis<3; axis++)
                {
                    iPos[site][axis]   = shiftXYZ[axis] + pos[iBase+3*site+axis];
                    iForce[site][axis] = 0;
                }
            }
            eCoulSum = 0;
            eVdwSum  = 0;

            for(slot=jFirst; (slot<jLast); slot++)
            {
                jAtom = jjnr[slot];
                jBase = 3*jAtom;

                /* j coordinates and a working copy of the j force */
                for(axis=0; axis<3; axis++)
                {
                    jPos[axis]   = pos[jBase+axis];
                    jForce[axis] = faction[jBase+axis];
                }

                /* j charge and the Buckingham triple for this type pair */
                chargeJ = charge[jAtom+0];
                parIdx  = typeRow+3*type[jAtom];
                c6      = vdwparam[parIdx];
                bhamA   = vdwparam[parIdx+1];
                bhamB   = vdwparam[parIdx+2];

                for(site=0; site<3; site++)
                {
                    delta[0] = iPos[site][0] - jPos[0];
                    delta[1] = iPos[site][1] - jPos[1];
                    delta[2] = iPos[site][2] - jPos[2];
                    distSq   = delta[0]*delta[0]+delta[1]*delta[1]+delta[2]*delta[2];
                    distInv  = invsqrt(distSq);

                    /* First site carries chargeO, the other two chargeH */
                    qq       = ((site==0) ? chargeO : chargeH)*chargeJ;

                    /* Table bin and fractional position inside the bin */
                    dist     = distSq*distInv;
                    scaled   = dist*tabscale;
                    bin      = scaled;
                    frac     = scaled-bin;
                    frac2    = frac*frac;
                    tabIdx   = 4*bin;

                    /* Tabulated coulomb interaction */
                    Y        = VFtab[tabIdx];
                    F        = VFtab[tabIdx+1];
                    Geps     = frac*VFtab[tabIdx+2];
                    Heps2    = frac2*VFtab[tabIdx+3];
                    Fp       = F+Geps+Heps2;
                    VV       = Y+frac*Fp;
                    FF       = Fp+Geps+2.0*Heps2;
                    eCoul    = qq*VV;
                    fCoul    = qq*FF;
                    eCoulSum = eCoulSum + eCoul;

                    if(site==0)
                    {
                        /* Buckingham interaction, first i site only */
                        distInvSq = distInv*distInv;
                        distInv6  = distInvSq*distInvSq*distInvSq;
                        eDisp     = c6*distInv6;
                        bR        = bhamB*distSq*distInv;
                        eRep      = bhamA*exp(-bR);
                        eVdwSum   = eVdwSum+eRep-eDisp;
                        fscal     = (bR*eRep-6.0*eDisp)*distInvSq-((fCoul)*tabscale)*distInv;
                    }
                    else
                    {
                        fscal     = -((fCoul)*tabscale)*distInv;
                    }

                    /* Equal and opposite force on the i site and the j atom */
                    for(axis=0; axis<3; axis++)
                    {
                        push               = fscal*delta[axis];
                        iForce[site][axis] = iForce[site][axis] + push;
                        jForce[axis]       = jForce[axis] - push;
                    }
                }

                /* Store the updated j force */
                for(axis=0; axis<3; axis++)
                {
                    faction[jBase+axis] = jForce[axis];
                }

                /* Inner loop uses 164 flops/iteration */
            }

            /* Add i forces to mem and shifted force list */
            for(site=0; site<3; site++)
            {
                for(axis=0; axis<3; axis++)
                {
                    faction[iBase+3*site+axis] = faction[iBase+3*site+axis] + iForce[site][axis];
                }
            }
            for(axis=0; axis<3; axis++)
            {
                fshift[shBase+axis] = fshift[shBase+axis]+iForce[0][axis]+iForce[1][axis]+iForce[2][axis];
            }

            /* Add potential energies to the group for this list */
            group       = gid[list];
            Vc[group]   = Vc[group] + eCoulSum;
            Vvdw[group] = Vvdw[group] + eVdwSum;

            /* Increment number of inner iterations */
            innerDone   = innerDone + jLast - jFirst;

            /* Outer loop uses 29 flops/iteration */
        }

        /* Increment number of outer iterations */
        outerDone = outerDone + chunkHi - chunkLo;
    }
    while (chunkHi<nri);

    /* Write outer/inner iteration count to pointers */
    *outeriter = outerDone;
    *inneriter = innerDone;
}
void nb_kernel100nf_ppc_altivec(int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float vfacel,nul; vector float dx,dy,dz; vector float vctot,qq,iq; vector float rinv,rsq; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); vctot = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); /* load 4 
j charges and multiply by iq */ qq = vec_madd(load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd),iq,nul); vctot = vec_madd(qq,rinv,vctot); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); zero_highest_2_elements_in_vector(&rinv); /* load 2 j charges and multiply by iq */ qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul); vctot = vec_madd(qq,rinv,vctot); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); zero_highest_3_elements_in_vector(&rinv); /* load 1 j charge and multiply by iq */ qq = vec_madd(load_1_float(charge+jnra),iq,nul); vctot = vec_madd(qq,rinv,vctot); } /* update outer data */ add_vector_to_float(Vc+gid[n],vctot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
/*
 * Gromacs nonbonded kernel nb_kernel010nf
 * Coulomb interaction:     Not calculated
 * VdW interaction:         Lennard-Jones
 * water optimization:      No
 * Calculate forces:        no
 *
 * For every neighbour list n the Lennard-Jones energy c12/r^12 - c6/r^6
 * between atom iinr[n] (shifted by shiftvec[3*shift[n]..]) and each j atom
 * in jjnr[jindex[n] .. jindex[n+1]-1] is summed and added to Vvdw[gid[n]].
 * c6 and c12 are read from vdwparam at 2*ntype*type[i] + 2*type[j].
 *
 * When enerd2 is non-NULL, each pair energy is additionally added to
 * enerd2[index], where index is derived from start[], *homenr and nbsum[]
 * (see the inner loop).  start, homenr and nbsum must therefore be valid
 * whenever enerd2 is passed.
 *
 * *outeriter / *inneriter receive the number of lists and of j entries
 * processed.  count and mtx are only used when GMX_THREADS is defined.
 * No forces are written; fshift, faction, charge, Vc, the facel/krf/crf/
 * tabscale scalars and the table arguments are not referenced.
 */
void nb_kernel010nf(
                    int *           p_nri,
                    int *           iinr,
                    int *           jindex,
                    int *           jjnr,
                    int *           shift,
                    real *          shiftvec,
                    real *          fshift,
                    int *           gid,
                    real *          pos,
                    real *          faction,
                    real *          charge,
                    real *          p_facel,
                    real *          p_krf,
                    real *          p_crf,
                    real *          Vc,
                    int *           type,
                    int *           p_ntype,
                    real *          vdwparam,
                    real *          Vvdw,
                    real *          p_tabscale,
                    real *          VFtab,real * enerd1,real * enerd2,real * enerd3,real * enerd4,int * start,int * end,int * homenr,int * nbsum,
                    real *          invsqrta,
                    real *          dvda,
                    real *          p_gbtabscale,
                    real *          GBtab,
                    int *           p_nthreads,
                    int *           count,
                    void *          mtx,
                    int *           outeriter,
                    int *           inneriter,
                    real *          work)
{
    int           nri,ntype,nthreads;
    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
    int           nn0,nn1,nouter,ninner;
    real          shX,shY,shZ;
    real          rinvsq;
    int           nti;
    int           tj;
    real          rinvsix;
    real          Vvdw6,Vvdwtot;
    real          Vvdw12;
    real          ix1,iy1,iz1;
    real          jx1,jy1,jz1;
    real          dx11,dy11,dz11,rsq11;
    real          c6,c12;
    int           index;

    /* Only the scalars this kernel actually needs are loaded */
    nri              = *p_nri;
    ntype            = *p_ntype;
    nthreads         = *p_nthreads;

    /* Reset outer and inner iteration counters */
    nouter           = 0;
    ninner           = 0;

    /* Loop over thread workunits */
    do
    {
#ifdef GMX_THREADS
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        nn0              = *count;
        /* Take successively smaller chunks (at least 10 lists) */
        nn1              = nn0+(nri-nn0)/(2*nthreads)+10;
        *count           = nn1;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if(nn1>nri) nn1=nri;
#else
        nn0 = 0;
        nn1 = nri;
#endif
        /* Start outer loop over neighborlists */
        for(n=nn0; (n<nn1); n++)
        {
            /* Load shift vector for this list */
            is3              = 3*shift[n];
            shX              = shiftvec[is3];
            shY              = shiftvec[is3+1];
            shZ              = shiftvec[is3+2];

            /* Load limits for loop over neighbors */
            nj0              = jindex[n];
            nj1              = jindex[n+1];

            /* Get outer coordinate index */
            ii               = iinr[n];
            ii3              = 3*ii;

            /* Load i atom data, add shift vector */
            ix1              = shX + pos[ii3+0];
            iy1              = shY + pos[ii3+1];
            iz1              = shZ + pos[ii3+2];

            /* Load parameters for i atom */
            nti              = 2*ntype*type[ii];

            /* Zero the potential energy for this list */
            Vvdwtot          = 0;

            for(k=nj0; (k<nj1); k++)
            {
                /* Get j neighbor index, and coordinate index */
                jnr              = jjnr[k];
                j3               = 3*jnr;

                /* load j atom coordinates */
                jx1              = pos[j3+0];
                jy1              = pos[j3+1];
                jz1              = pos[j3+2];

                /* Calculate distance */
                dx11             = ix1 - jx1;
                dy11             = iy1 - jy1;
                dz11             = iz1 - jz1;
                rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;

                /* Calculate 1/r and 1/r2 */
                rinvsq           = 1.0/rsq11;

                /* Load parameters for j atom */
                tj               = nti+2*type[jnr];
                c6               = vdwparam[tj];
                c12              = vdwparam[tj+1];

                /* Lennard-Jones interaction */
                rinvsix          = rinvsq*rinvsq*rinvsq;
                Vvdw6            = c6*rinvsix;
                Vvdw12           = c12*rinvsix*rinvsix;
                Vvdwtot          = Vvdwtot+Vvdw12-Vvdw6;

                if(enerd2)
                {
                    /* The slot is computed in the same branch that uses it.
                     * It used to be set only when enerd1 was non-NULL, so a
                     * caller passing enerd2 without enerd1 indexed enerd2
                     * with an uninitialized value.  The atom with the
                     * smaller index selects the row.
                     */
                    if(ii<jnr)
                    {
                        index = start[ii]*(*homenr) - nbsum[start[ii]] + start[jnr];
                    }
                    else
                    {
                        index = start[jnr]*(*homenr) - nbsum[start[jnr]] + start[ii];
                    }
                    enerd2[index] = enerd2[index] + Vvdw12-Vvdw6;
                }

                /* Inner loop uses 19 flops/iteration */
            }

            /* Add potential energies to the group for this list */
            ggid             = gid[n];
            Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;

            /* Increment number of inner iterations */
            ninner           = ninner + nj1 - nj0;

            /* Outer loop uses 4 flops/iteration */
        }

        /* Increment number of outer iterations */
        nouter           = nouter + nn1 - nn0;
    }
    while (nn1<nri);

    /* Write outer/inner iteration count to pointers */
    *outeriter       = nouter;
    *inneriter       = ninner;
}
/*
 * Gromacs nonbonded kernel nb_kernel120
 * Coulomb interaction:     Normal Coulomb
 * VdW interaction:         Buckingham
 * water optimization:      No
 * Calculate forces:        yes
 *
 * For every neighbour list n, atom iinr[n] (shifted by
 * shiftvec[3*shift[n]..]) interacts with each j atom in
 * jjnr[jindex[n] .. jindex[n+1]-1].  The pair energy is qq/r for Coulomb
 * plus cexp1*exp(-cexp2*r) - c6/r^6 for Buckingham, with the three
 * Buckingham parameters read from vdwparam at
 * 3*ntype*type[i] + 3*type[j].
 *
 * Forces are accumulated into faction and fshift, energies into
 * Vc[gid[n]] and Vvdw[gid[n]], and the iteration counts are written to
 * *outeriter and *inneriter.  count and mtx are only touched when
 * GMX_THREADS is defined.  The enerd1..enerd4, start, end, homenr, nbsum,
 * VFtab, invsqrta, dvda, p_gbtabscale, GBtab and work arguments are not
 * referenced.
 */
void nb_kernel120(
                    int *           p_nri,
                    int *           iinr,
                    int *           jindex,
                    int *           jjnr,
                    int *           shift,
                    real *          shiftvec,
                    real *          fshift,
                    int *           gid,
                    real *          pos,
                    real *          faction,
                    real *          charge,
                    real *          p_facel,
                    real *          p_krf,
                    real *          p_crf,
                    real *          Vc,
                    int *           type,
                    int *           p_ntype,
                    real *          vdwparam,
                    real *          Vvdw,
                    real *          p_tabscale,
                    real *          VFtab,real * enerd1,real * enerd2,real * enerd3,real * enerd4,int * start,int * end,int * homenr,int * nbsum,
                    real *          invsqrta,
                    real *          dvda,
                    real *          p_gbtabscale,
                    real *          GBtab,
                    int *           p_nthreads,
                    int *           count,
                    void *          mtx,
                    int *           outeriter,
                    int *           inneriter,
                    real *          work)
{
    int   nri, ntype, nthreads;
    real  facel, krf, crf, tabscale;
    int   list, chunkLo, chunkHi, outerDone, innerDone;
    int   axis;
    int   iAtom, iBase, shBase, jFirst, jLast, slot, jAtom, jBase, group;
    int   typeRow, parIdx;
    real  iPos[3], iForce[3], delta[3];
    real  chargeI, qq;
    real  distSq, distInv, distInvSq, distInv6;
    real  eCoul, eCoulSum;
    real  c6, bhamA, bhamB, eDisp, eRep, eVdwSum, bR;
    real  fscal, push;

    /* Scalars arrive by pointer; krf, crf and tabscale are loaded but unused */
    nri      = *p_nri;
    ntype    = *p_ntype;
    nthreads = *p_nthreads;
    facel    = *p_facel;
    krf      = *p_krf;
    crf      = *p_crf;
    tabscale = *p_tabscale;

    outerDone = 0;
    innerDone = 0;

    /* One pass per work chunk (a single pass without GMX_THREADS) */
    do
    {
#ifdef GMX_THREADS
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        chunkLo = *count;
        /* Take successively smaller chunks (at least 10 lists) */
        chunkHi = chunkLo+(nri-chunkLo)/(2*nthreads)+10;
        *count  = chunkHi;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if(chunkHi>nri) chunkHi=nri;
#else
        chunkLo = 0;
        chunkHi = nri;
#endif
        for(list=chunkLo; (list<chunkHi); list++)
        {
            /* Shift offset, neighbour range and i atom of this list */
            shBase = 3*shift[list];
            jFirst = jindex[list];
            jLast  = jindex[list+1];
            iAtom  = iinr[list];
            iBase  = 3*iAtom;

            /* Shifted i coordinate; i force accumulator starts at zero */
            for(axis=0; axis<3; axis++)
            {
                iPos[axis]   = shiftvec[shBase+axis] + pos[iBase+axis];
                iForce[axis] = 0;
            }

            /* Per-i-atom charge and parameter row */
            chargeI  = facel*charge[iAtom];
            typeRow  = 3*ntype*type[iAtom];

            eCoulSum = 0;
            eVdwSum  = 0;

            for(slot=jFirst; (slot<jLast); slot++)
            {
                jAtom = jjnr[slot];
                jBase = 3*jAtom;

                /* Separation vector and its inverse length */
                delta[0] = iPos[0] - pos[jBase+0];
                delta[1] = iPos[1] - pos[jBase+1];
                delta[2] = iPos[2] - pos[jBase+2];
                distSq   = delta[0]*delta[0]+delta[1]*delta[1]+delta[2]*delta[2];
                distInv  = invsqrt(distSq);

                /* Pair charge product and Buckingham triple */
                qq        = chargeI*charge[jAtom];
                parIdx    = typeRow+3*type[jAtom];
                c6        = vdwparam[parIdx];
                bhamA     = vdwparam[parIdx+1];
                bhamB     = vdwparam[parIdx+2];
                distInvSq = distInv*distInv;

                /* Coulomb interaction */
                eCoul     = qq*distInv;
                eCoulSum  = eCoulSum+eCoul;

                /* Buckingham interaction */
                distInv6  = distInvSq*distInvSq*distInvSq;
                eDisp     = c6*distInv6;
                bR        = bhamB*distSq*distInv;
                eRep      = bhamA*exp(-bR);
                eVdwSum   = eVdwSum+eRep-eDisp;
                fscal     = (eCoul+bR*eRep-6.0*eDisp)*distInvSq;

                /* Equal and opposite force on the i and j atoms */
                for(axis=0; axis<3; axis++)
                {
                    push                = fscal*delta[axis];
                    iForce[axis]        = iForce[axis] + push;
                    faction[jBase+axis] = faction[jBase+axis] - push;
                }

                /* Inner loop uses 64 flops/iteration */
            }

            /* Add i forces to mem and shifted force list */
            for(axis=0; axis<3; axis++)
            {
                faction[iBase+axis] = faction[iBase+axis] + iForce[axis];
            }
            for(axis=0; axis<3; axis++)
            {
                fshift[shBase+axis] = fshift[shBase+axis]+iForce[axis];
            }

            /* Add potential energies to the group for this list */
            group       = gid[list];
            Vc[group]   = Vc[group] + eCoulSum;
            Vvdw[group] = Vvdw[group] + eVdwSum;

            /* Increment number of inner iterations */
            innerDone   = innerDone + jLast - jFirst;

            /* Outer loop uses 12 flops/iteration */
        }

        /* Increment number of outer iterations */
        outerDone = outerDone + chunkHi - chunkLo;
    }
    while (chunkHi<nri);

    /* Write outer/inner iteration count to pointers */
    *outeriter = outerDone;
    *inneriter = innerDone;
}
/* * Gromacs nonbonded kernel nb_kernel332 * Coulomb interaction: Tabulated * VdW interaction: Tabulated * water optimization: pairs of SPC/TIP3P interactions * Calculate forces: yes */ void nb_kernel332_sse2_single( int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, float * shiftvec, float * fshift, int * gid, float * pos, float * faction, float * charge, float * p_facel, float * p_krf, float * p_crf, float * Vc, int * type, int * p_ntype, float * vdwparam, float * Vvdw, float * p_tabscale, float * VFtab, float * invsqrta, float * dvda, float * p_gbtabscale, float * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { int nri,ntype,nthreads; float facel,krf,crf,tabscale,gbtabscale; int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid; int nn0,nn1,nouter,ninner; float shX,shY,shZ; float fscal,tx,ty,tz; float qq,vcoul,vctot; int tj; float Vvdw6,Vvdwtot; float Vvdw12; float r,rt,eps,eps2; int n0,nnn; float Y,F,Geps,Heps2,Fp,VV; float FF; float fijC; float fijD,fijR; float ix1,iy1,iz1,fix1,fiy1,fiz1; float ix2,iy2,iz2,fix2,fiy2,fiz2; float ix3,iy3,iz3,fix3,fiy3,fiz3; float jx1,jy1,jz1,fjx1,fjy1,fjz1; float jx2,jy2,jz2,fjx2,fjy2,fjz2; float jx3,jy3,jz3,fjx3,fjy3,fjz3; float dx11,dy11,dz11,rsq11,rinv11; float dx12,dy12,dz12,rsq12,rinv12; float dx13,dy13,dz13,rsq13,rinv13; float dx21,dy21,dz21,rsq21,rinv21; float dx22,dy22,dz22,rsq22,rinv22; float dx23,dy23,dz23,rsq23,rinv23; float dx31,dy31,dz31,rsq31,rinv31; float dx32,dy32,dz32,rsq32,rinv32; float dx33,dy33,dz33,rsq33,rinv33; float qO,qH,qqOO,qqOH,qqHH; float c6,c12; nri = *p_nri; ntype = *p_ntype; nthreads = *p_nthreads; facel = *p_facel; krf = *p_krf; crf = *p_crf; tabscale = *p_tabscale; /* Initialize water data */ ii = iinr[0]; qO = charge[ii]; qH = charge[ii+1]; qqOO = facel*qO*qO; qqOH = facel*qO*qH; qqHH = facel*qH*qH; tj = 2*(ntype+1)*type[ii]; c6 = vdwparam[tj]; c12 = vdwparam[tj+1]; /* Reset outer and inner iteration counters */ nouter = 0; ninner = 0; /* Loop 
over thread workunits */ do { #ifdef GMX_THREADS gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; /* Take successively smaller chunks (at least 10 lists) */ nn1 = nn0+(nri-nn0)/(2*nthreads)+10; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; #else nn0 = 0; nn1 = nri; #endif /* Start outer loop over neighborlists */ for(n=nn0; (n<nn1); n++) { /* Load shift vector for this list */ is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; /* Load limits for loop over neighbors */ nj0 = jindex[n]; nj1 = jindex[n+1]; /* Get outer coordinate index */ ii = iinr[n]; ii3 = 3*ii; /* Load i atom data, add shift vector */ ix1 = shX + pos[ii3+0]; iy1 = shY + pos[ii3+1]; iz1 = shZ + pos[ii3+2]; ix2 = shX + pos[ii3+3]; iy2 = shY + pos[ii3+4]; iz2 = shZ + pos[ii3+5]; ix3 = shX + pos[ii3+6]; iy3 = shY + pos[ii3+7]; iz3 = shZ + pos[ii3+8]; /* Zero the potential energy for this list */ vctot = 0; Vvdwtot = 0; /* Clear i atom forces */ fix1 = 0; fiy1 = 0; fiz1 = 0; fix2 = 0; fiy2 = 0; fiz2 = 0; fix3 = 0; fiy3 = 0; fiz3 = 0; for(k=nj0; (k<nj1); k++) { /* Get j neighbor index, and coordinate index */ jnr = jjnr[k]; j3 = 3*jnr; /* load j atom coordinates */ jx1 = pos[j3+0]; jy1 = pos[j3+1]; jz1 = pos[j3+2]; jx2 = pos[j3+3]; jy2 = pos[j3+4]; jz2 = pos[j3+5]; jx3 = pos[j3+6]; jy3 = pos[j3+7]; jz3 = pos[j3+8]; /* Calculate distance */ dx11 = ix1 - jx1; dy11 = iy1 - jy1; dz11 = iz1 - jz1; rsq11 = dx11*dx11+dy11*dy11+dz11*dz11; dx12 = ix1 - jx2; dy12 = iy1 - jy2; dz12 = iz1 - jz2; rsq12 = dx12*dx12+dy12*dy12+dz12*dz12; dx13 = ix1 - jx3; dy13 = iy1 - jy3; dz13 = iz1 - jz3; rsq13 = dx13*dx13+dy13*dy13+dz13*dz13; dx21 = ix2 - jx1; dy21 = iy2 - jy1; dz21 = iz2 - jz1; rsq21 = dx21*dx21+dy21*dy21+dz21*dz21; dx22 = ix2 - jx2; dy22 = iy2 - jy2; dz22 = iz2 - jz2; rsq22 = dx22*dx22+dy22*dy22+dz22*dz22; dx23 = ix2 - jx3; dy23 = iy2 - jy3; dz23 = iz2 - jz3; rsq23 = dx23*dx23+dy23*dy23+dz23*dz23; dx31 = ix3 - jx1; dy31 = iy3 
- jy1; dz31 = iz3 - jz1; rsq31 = dx31*dx31+dy31*dy31+dz31*dz31; dx32 = ix3 - jx2; dy32 = iy3 - jy2; dz32 = iz3 - jz2; rsq32 = dx32*dx32+dy32*dy32+dz32*dz32; dx33 = ix3 - jx3; dy33 = iy3 - jy3; dz33 = iz3 - jz3; rsq33 = dx33*dx33+dy33*dy33+dz33*dz33; /* Calculate 1/r and 1/r2 */ rinv11 = 1.0/sqrt(rsq11); rinv12 = 1.0/sqrt(rsq12); rinv13 = 1.0/sqrt(rsq13); rinv21 = 1.0/sqrt(rsq21); rinv22 = 1.0/sqrt(rsq22); rinv23 = 1.0/sqrt(rsq23); rinv31 = 1.0/sqrt(rsq31); rinv32 = 1.0/sqrt(rsq32); rinv33 = 1.0/sqrt(rsq33); /* Load parameters for j atom */ qq = qqOO; /* Calculate table index */ r = rsq11*rinv11; /* Calculate table index */ rt = r*tabscale; n0 = rt; eps = rt-n0; eps2 = eps*eps; nnn = 12*n0; /* Tabulated coulomb interaction */ Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; vcoul = qq*VV; fijC = qq*FF; vctot = vctot + vcoul; /* Tabulated VdW interaction - dispersion */ nnn = nnn+4; Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; Vvdw6 = c6*VV; fijD = c6*FF; /* Tabulated VdW interaction - repulsion */ nnn = nnn+4; Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; Vvdw12 = c12*VV; fijR = c12*FF; Vvdwtot = Vvdwtot+ Vvdw6 + Vvdw12; fscal = -((fijC+fijD+fijR)*tabscale)*rinv11; /* Calculate temporary vectorial force */ tx = fscal*dx11; ty = fscal*dy11; tz = fscal*dz11; /* Increment i atom force */ fix1 = fix1 + tx; fiy1 = fiy1 + ty; fiz1 = fiz1 + tz; /* Decrement j atom force */ fjx1 = faction[j3+0] - tx; fjy1 = faction[j3+1] - ty; fjz1 = faction[j3+2] - tz; /* Load parameters for j atom */ qq = qqOH; /* Calculate table index */ r = rsq12*rinv12; /* Calculate table index */ rt = r*tabscale; n0 = rt; eps = rt-n0; eps2 = eps*eps; nnn = 12*n0; /* Tabulated coulomb interaction */ Y = VFtab[nnn]; 
F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; vcoul = qq*VV; fijC = qq*FF; vctot = vctot + vcoul; fscal = -((fijC)*tabscale)*rinv12; /* Calculate temporary vectorial force */ tx = fscal*dx12; ty = fscal*dy12; tz = fscal*dz12; /* Increment i atom force */ fix1 = fix1 + tx; fiy1 = fiy1 + ty; fiz1 = fiz1 + tz; /* Decrement j atom force */ fjx2 = faction[j3+3] - tx; fjy2 = faction[j3+4] - ty; fjz2 = faction[j3+5] - tz; /* Load parameters for j atom */ qq = qqOH; /* Calculate table index */ r = rsq13*rinv13; /* Calculate table index */ rt = r*tabscale; n0 = rt; eps = rt-n0; eps2 = eps*eps; nnn = 12*n0; /* Tabulated coulomb interaction */ Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; vcoul = qq*VV; fijC = qq*FF; vctot = vctot + vcoul; fscal = -((fijC)*tabscale)*rinv13; /* Calculate temporary vectorial force */ tx = fscal*dx13; ty = fscal*dy13; tz = fscal*dz13; /* Increment i atom force */ fix1 = fix1 + tx; fiy1 = fiy1 + ty; fiz1 = fiz1 + tz; /* Decrement j atom force */ fjx3 = faction[j3+6] - tx; fjy3 = faction[j3+7] - ty; fjz3 = faction[j3+8] - tz; /* Load parameters for j atom */ qq = qqOH; /* Calculate table index */ r = rsq21*rinv21; /* Calculate table index */ rt = r*tabscale; n0 = rt; eps = rt-n0; eps2 = eps*eps; nnn = 12*n0; /* Tabulated coulomb interaction */ Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; vcoul = qq*VV; fijC = qq*FF; vctot = vctot + vcoul; fscal = -((fijC)*tabscale)*rinv21; /* Calculate temporary vectorial force */ tx = fscal*dx21; ty = fscal*dy21; tz = fscal*dz21; /* Increment i atom force */ fix2 = fix2 + tx; fiy2 = fiy2 + ty; fiz2 = fiz2 + tz; /* Decrement j atom force */ fjx1 = fjx1 - tx; fjy1 = fjy1 - ty; fjz1 = fjz1 - tz; /* Load parameters for j atom */ qq = qqHH; /* 
Calculate table index */ r = rsq22*rinv22; /* Calculate table index */ rt = r*tabscale; n0 = rt; eps = rt-n0; eps2 = eps*eps; nnn = 12*n0; /* Tabulated coulomb interaction */ Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; vcoul = qq*VV; fijC = qq*FF; vctot = vctot + vcoul; fscal = -((fijC)*tabscale)*rinv22; /* Calculate temporary vectorial force */ tx = fscal*dx22; ty = fscal*dy22; tz = fscal*dz22; /* Increment i atom force */ fix2 = fix2 + tx; fiy2 = fiy2 + ty; fiz2 = fiz2 + tz; /* Decrement j atom force */ fjx2 = fjx2 - tx; fjy2 = fjy2 - ty; fjz2 = fjz2 - tz; /* Load parameters for j atom */ qq = qqHH; /* Calculate table index */ r = rsq23*rinv23; /* Calculate table index */ rt = r*tabscale; n0 = rt; eps = rt-n0; eps2 = eps*eps; nnn = 12*n0; /* Tabulated coulomb interaction */ Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; vcoul = qq*VV; fijC = qq*FF; vctot = vctot + vcoul; fscal = -((fijC)*tabscale)*rinv23; /* Calculate temporary vectorial force */ tx = fscal*dx23; ty = fscal*dy23; tz = fscal*dz23; /* Increment i atom force */ fix2 = fix2 + tx; fiy2 = fiy2 + ty; fiz2 = fiz2 + tz; /* Decrement j atom force */ fjx3 = fjx3 - tx; fjy3 = fjy3 - ty; fjz3 = fjz3 - tz; /* Load parameters for j atom */ qq = qqOH; /* Calculate table index */ r = rsq31*rinv31; /* Calculate table index */ rt = r*tabscale; n0 = rt; eps = rt-n0; eps2 = eps*eps; nnn = 12*n0; /* Tabulated coulomb interaction */ Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; vcoul = qq*VV; fijC = qq*FF; vctot = vctot + vcoul; fscal = -((fijC)*tabscale)*rinv31; /* Calculate temporary vectorial force */ tx = fscal*dx31; ty = fscal*dy31; tz = fscal*dz31; /* Increment i atom force */ fix3 = fix3 + tx; fiy3 = fiy3 + ty; fiz3 = fiz3 
+ tz; /* Decrement j atom force */ faction[j3+0] = fjx1 - tx; faction[j3+1] = fjy1 - ty; faction[j3+2] = fjz1 - tz; /* Load parameters for j atom */ qq = qqHH; /* Calculate table index */ r = rsq32*rinv32; /* Calculate table index */ rt = r*tabscale; n0 = rt; eps = rt-n0; eps2 = eps*eps; nnn = 12*n0; /* Tabulated coulomb interaction */ Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; vcoul = qq*VV; fijC = qq*FF; vctot = vctot + vcoul; fscal = -((fijC)*tabscale)*rinv32; /* Calculate temporary vectorial force */ tx = fscal*dx32; ty = fscal*dy32; tz = fscal*dz32; /* Increment i atom force */ fix3 = fix3 + tx; fiy3 = fiy3 + ty; fiz3 = fiz3 + tz; /* Decrement j atom force */ faction[j3+3] = fjx2 - tx; faction[j3+4] = fjy2 - ty; faction[j3+5] = fjz2 - tz; /* Load parameters for j atom */ qq = qqHH; /* Calculate table index */ r = rsq33*rinv33; /* Calculate table index */ rt = r*tabscale; n0 = rt; eps = rt-n0; eps2 = eps*eps; nnn = 12*n0; /* Tabulated coulomb interaction */ Y = VFtab[nnn]; F = VFtab[nnn+1]; Geps = eps*VFtab[nnn+2]; Heps2 = eps2*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; FF = Fp+Geps+2.0*Heps2; vcoul = qq*VV; fijC = qq*FF; vctot = vctot + vcoul; fscal = -((fijC)*tabscale)*rinv33; /* Calculate temporary vectorial force */ tx = fscal*dx33; ty = fscal*dy33; tz = fscal*dz33; /* Increment i atom force */ fix3 = fix3 + tx; fiy3 = fiy3 + ty; fiz3 = fiz3 + tz; /* Decrement j atom force */ faction[j3+6] = fjx3 - tx; faction[j3+7] = fjy3 - ty; faction[j3+8] = fjz3 - tz; /* Inner loop uses 395 flops/iteration */ } /* Add i forces to mem and shifted force list */ faction[ii3+0] = faction[ii3+0] + fix1; faction[ii3+1] = faction[ii3+1] + fiy1; faction[ii3+2] = faction[ii3+2] + fiz1; faction[ii3+3] = faction[ii3+3] + fix2; faction[ii3+4] = faction[ii3+4] + fiy2; faction[ii3+5] = faction[ii3+5] + fiz2; faction[ii3+6] = faction[ii3+6] + fix3; faction[ii3+7] = faction[ii3+7] 
+ fiy3; faction[ii3+8] = faction[ii3+8] + fiz3; fshift[is3] = fshift[is3]+fix1+fix2+fix3; fshift[is3+1] = fshift[is3+1]+fiy1+fiy2+fiy3; fshift[is3+2] = fshift[is3+2]+fiz1+fiz2+fiz3; /* Add potential energies to the group for this list */ ggid = gid[n]; Vc[ggid] = Vc[ggid] + vctot; Vvdw[ggid] = Vvdw[ggid] + Vvdwtot; /* Increment number of inner iterations */ ninner = ninner + nj1 - nj0; /* Outer loop uses 29 flops/iteration */ } /* Increment number of outer iterations */ nouter = nouter + nn1 - nn0; } while (nn1<nri); /* Write outer/inner iteration count to pointers */ *outeriter = nouter; *inneriter = ninner; }
/*
 * Helper for nb_kernel314nf: tabulated Coulomb energy of one site pair.
 *
 * rsq       squared distance between the two sites
 * rinv      value of invsqrt(rsq), computed by the caller
 * qq        charge product of the pair (facel already folded in by the caller)
 * tabscale  factor converting a distance into table units
 * VFtab     table holding four consecutive coefficients per table point
 *
 * Returns qq*(Y + eps*(F + eps*G + eps*eps*H)), where eps is the fractional
 * part of the scaled distance and Y,F,G,H are the four coefficients of the
 * table point selected by its integer part.
 */
static real nb_kernel314nf_tabcoul(real rsq, real rinv, real qq,
                                   real tabscale, const real * VFtab)
{
    real  r, rt, eps, eps2;
    real  Y, F, Geps, Heps2, Fp, VV;
    int   n0, nnn;

    /* r = r^2 * (1/r), then express it in table units */
    r      = rsq*rinv;
    rt     = r*tabscale;
    n0     = rt;               /* conversion to int picks the table point */
    eps    = rt-n0;
    eps2   = eps*eps;
    nnn    = 4*n0;

    Y      = VFtab[nnn];
    F      = VFtab[nnn+1];
    Geps   = eps*VFtab[nnn+2];
    Heps2  = eps2*VFtab[nnn+3];
    Fp     = F+Geps+Heps2;
    VV     = Y+eps*Fp;

    return qq*VV;
}

/*
 * Gromacs nonbonded kernel nb_kernel314nf
 * Coulomb interaction:     Tabulated
 * VdW interaction:         Lennard-Jones
 * water optimization:      pairs of TIP4P interactions
 * Calculate forces:        no
 *
 * Energy-only kernel for pairs of four-site molecules.  Site 1 of each
 * molecule interacts through Lennard-Jones only; sites 2-4 interact
 * through the tabulated Coulomb potential (3x3 site pairs).
 *
 * Arguments actually used here:
 *   p_nri              number of neighbour lists
 *   iinr[n]            index of the first atom of the i molecule of list n
 *   jindex[n..n+1]     range of entries in jjnr belonging to list n
 *   jjnr[k]            index of the first atom of a j molecule
 *   shift, shiftvec    per-list shift index and the shift vectors (xyz)
 *   gid[n]             slot in Vc/Vvdw that receives the energies of list n
 *   pos                coordinates, three reals per atom
 *   charge, p_facel    charges and the prefactor applied to charge products
 *   type, p_ntype, vdwparam   used once to look up c6/c12 for site 1
 *   p_tabscale, VFtab  Coulomb table and its scale
 *   Vc, Vvdw           energy accumulators (added to, not overwritten)
 *   p_nthreads, count, mtx    work distribution when GMX_THREADS is defined
 *   outeriter, inneriter      receive the outer/inner iteration counts
 * p_krf and p_crf are dereferenced but their values are not used; the
 * remaining arguments are not touched by this kernel.
 */
void nb_kernel314nf(int * p_nri, int * iinr, int * jindex, int * jjnr,
                    int * shift, real * shiftvec, real * fshift, int * gid,
                    real * pos, real * faction, real * charge,
                    real * p_facel, real * p_krf, real * p_crf,
                    real * Vc, int * type, int * p_ntype,
                    real * vdwparam, real * Vvdw,
                    real * p_tabscale, real * VFtab,
                    real * enerd1, real * enerd2, real * enerd3, real * enerd4,
                    int * start, int * end, int * homenr, int * nbsum,
                    real * invsqrta, real * dvda,
                    real * p_gbtabscale, real * GBtab,
                    int * p_nthreads, int * count, void * mtx,
                    int * outeriter, int * inneriter, real * work)
{
    int   nri, ntype, nthreads;
    real  facel, krf, crf, tabscale;
    int   n, k, ii, ii3, is3, jnr, j3, ggid;
    int   nj0, nj1, nn0, nn1;
    int   nouter, ninner;
    int   tj;
    real  shX, shY, shZ;
    real  ix1, iy1, iz1, ix2, iy2, iz2, ix3, iy3, iz3, ix4, iy4, iz4;
    real  jx1, jy1, jz1, jx2, jy2, jz2, jx3, jy3, jz3, jx4, jy4, jz4;
    real  dx, dy, dz;
    real  rsq11;
    real  rsq22, rsq23, rsq24;
    real  rsq32, rsq33, rsq34;
    real  rsq42, rsq43, rsq44;
    real  rinv22, rinv23, rinv24;
    real  rinv32, rinv33, rinv34;
    real  rinv42, rinv43, rinv44;
    real  rinvsq, rinvsix;
    real  Vvdw6, Vvdw12, Vvdwtot;
    real  vctot;
    real  qH, qM, qqMM, qqMH, qqHH;
    real  c6, c12;

    /* Unpack the scalars that arrive by reference */
    nri       = *p_nri;
    ntype     = *p_ntype;
    nthreads  = *p_nthreads;
    facel     = *p_facel;
    krf       = *p_krf;
    crf       = *p_crf;
    tabscale  = *p_tabscale;

    /* Water data is taken once, from the molecule of the first list,
     * and reused for every pair handled by this call.
     */
    ii    = iinr[0];
    qH    = charge[ii+1];
    qM    = charge[ii+3];
    qqMM  = facel*qM*qM;
    qqMH  = facel*qM*qH;
    qqHH  = facel*qH*qH;
    tj    = 2*(ntype+1)*type[ii];
    c6    = vdwparam[tj];
    c12   = vdwparam[tj+1];

    nouter = 0;
    ninner = 0;

    /* Loop over thread workunits */
    do
    {
#ifdef GMX_THREADS
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        nn0    = *count;
        /* Take successively smaller chunks (at least 10 lists) */
        nn1    = nn0+(nri-nn0)/(2*nthreads)+10;
        *count = nn1;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if (nn1 > nri)
        {
            nn1 = nri;
        }
#else
        nn0 = 0;
        nn1 = nri;
#endif

        /* Outer loop over neighbour lists */
        for (n = nn0; n < nn1; n++)
        {
            /* Shift vector of this list */
            is3 = 3*shift[n];
            shX = shiftvec[is3];
            shY = shiftvec[is3+1];
            shZ = shiftvec[is3+2];

            /* Range of neighbours */
            nj0 = jindex[n];
            nj1 = jindex[n+1];

            /* Shifted coordinates of the four i sites */
            ii  = iinr[n];
            ii3 = 3*ii;
            ix1 = shX + pos[ii3+0];
            iy1 = shY + pos[ii3+1];
            iz1 = shZ + pos[ii3+2];
            ix2 = shX + pos[ii3+3];
            iy2 = shY + pos[ii3+4];
            iz2 = shZ + pos[ii3+5];
            ix3 = shX + pos[ii3+6];
            iy3 = shY + pos[ii3+7];
            iz3 = shZ + pos[ii3+8];
            ix4 = shX + pos[ii3+9];
            iy4 = shY + pos[ii3+10];
            iz4 = shZ + pos[ii3+11];

            /* Per-list energy accumulators */
            vctot   = 0;
            Vvdwtot = 0;

            for (k = nj0; k < nj1; k++)
            {
                jnr = jjnr[k];
                j3  = 3*jnr;

                /* Coordinates of the four j sites */
                jx1 = pos[j3+0];
                jy1 = pos[j3+1];
                jz1 = pos[j3+2];
                jx2 = pos[j3+3];
                jy2 = pos[j3+4];
                jz2 = pos[j3+5];
                jx3 = pos[j3+6];
                jy3 = pos[j3+7];
                jz3 = pos[j3+8];
                jx4 = pos[j3+9];
                jy4 = pos[j3+10];
                jz4 = pos[j3+11];

                /* Squared distances: 1-1 for LJ, {2,3,4}x{2,3,4} for Coulomb */
                dx    = ix1 - jx1;
                dy    = iy1 - jy1;
                dz    = iz1 - jz1;
                rsq11 = dx*dx+dy*dy+dz*dz;

                dx    = ix2 - jx2;
                dy    = iy2 - jy2;
                dz    = iz2 - jz2;
                rsq22 = dx*dx+dy*dy+dz*dz;

                dx    = ix2 - jx3;
                dy    = iy2 - jy3;
                dz    = iz2 - jz3;
                rsq23 = dx*dx+dy*dy+dz*dz;

                dx    = ix2 - jx4;
                dy    = iy2 - jy4;
                dz    = iz2 - jz4;
                rsq24 = dx*dx+dy*dy+dz*dz;

                dx    = ix3 - jx2;
                dy    = iy3 - jy2;
                dz    = iz3 - jz2;
                rsq32 = dx*dx+dy*dy+dz*dz;

                dx    = ix3 - jx3;
                dy    = iy3 - jy3;
                dz    = iz3 - jz3;
                rsq33 = dx*dx+dy*dy+dz*dz;

                dx    = ix3 - jx4;
                dy    = iy3 - jy4;
                dz    = iz3 - jz4;
                rsq34 = dx*dx+dy*dy+dz*dz;

                dx    = ix4 - jx2;
                dy    = iy4 - jy2;
                dz    = iz4 - jz2;
                rsq42 = dx*dx+dy*dy+dz*dz;

                dx    = ix4 - jx3;
                dy    = iy4 - jy3;
                dz    = iz4 - jz3;
                rsq43 = dx*dx+dy*dy+dz*dz;

                dx    = ix4 - jx4;
                dy    = iy4 - jy4;
                dz    = iz4 - jz4;
                rsq44 = dx*dx+dy*dy+dz*dz;

                /* 1/r^2 for the LJ pair, 1/r for the Coulomb pairs */
                rinvsq = 1.0/rsq11;
                rinv22 = invsqrt(rsq22);
                rinv23 = invsqrt(rsq23);
                rinv24 = invsqrt(rsq24);
                rinv32 = invsqrt(rsq32);
                rinv33 = invsqrt(rsq33);
                rinv34 = invsqrt(rsq34);
                rinv42 = invsqrt(rsq42);
                rinv43 = invsqrt(rsq43);
                rinv44 = invsqrt(rsq44);

                /* Lennard-Jones between the two site-1 atoms */
                rinvsix = rinvsq*rinvsq*rinvsq;
                Vvdw6   = c6*rinvsix;
                Vvdw12  = c12*rinvsix*rinvsix;
                Vvdwtot = Vvdwtot+Vvdw12-Vvdw6;

                /* Tabulated Coulomb; sites 2 and 3 carry qH, site 4 carries qM */
                vctot += nb_kernel314nf_tabcoul(rsq22, rinv22, qqHH, tabscale, VFtab);
                vctot += nb_kernel314nf_tabcoul(rsq23, rinv23, qqHH, tabscale, VFtab);
                vctot += nb_kernel314nf_tabcoul(rsq24, rinv24, qqMH, tabscale, VFtab);
                vctot += nb_kernel314nf_tabcoul(rsq32, rinv32, qqHH, tabscale, VFtab);
                vctot += nb_kernel314nf_tabcoul(rsq33, rinv33, qqHH, tabscale, VFtab);
                vctot += nb_kernel314nf_tabcoul(rsq34, rinv34, qqMH, tabscale, VFtab);
                vctot += nb_kernel314nf_tabcoul(rsq42, rinv42, qqMH, tabscale, VFtab);
                vctot += nb_kernel314nf_tabcoul(rsq43, rinv43, qqMH, tabscale, VFtab);
                vctot += nb_kernel314nf_tabcoul(rsq44, rinv44, qqMM, tabscale, VFtab);

                /* Inner loop uses 244 flops/iteration */
            }

            /* Add the list energies to the energy group of this list */
            ggid        = gid[n];
            Vc[ggid]   += vctot;
            Vvdw[ggid] += Vvdwtot;

            /* Count inner iterations */
            ninner += nj1 - nj0;

            /* Outer loop uses 14 flops/iteration */
        }

        /* Count outer iterations */
        nouter += nn1 - nn0;
    }
    while (nn1 < nri);

    /* Report the iteration counts to the caller */
    *outeriter = nouter;
    *inneriter = ninner;
}
void nb_kernel301_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z; vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z; vector float vfacel,nul; vector float fsO,fsH1,fsH2,tsc,VVcO,FFcO,VVcH1,FFcH1,VVcH2,FFcH2; vector float vctot,qqO,qqH,iqO,iqH,jq; vector float fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z,fiH2x,fiH2y,fiH2z; vector float tmp1,tmp2,tmp3,tmp4; vector float rinvO,rinvH1,rinvH2,rO,rH1,rH2,rsqO,rsqH1,rsqH2; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); vfacel=load_float_and_splat(p_facel); tsc=load_float_and_splat(p_tabscale); iqO = vec_madd(load_float_and_splat(charge+iinr[0]),vfacel,nul); iqH = vec_madd(load_float_and_splat(charge+iinr[0]+1),vfacel,nul); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; ii = iinr[n]; ii3 = 3*ii; load_1_3atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz, &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z); vctot = nul; fiOx = nul; fiOy = nul; fiOz = nul; fiH1x = nul; fiH1y = nul; fiH1z = nul; fiH2x = nul; fiH2y = nul; 
fiH2z = nul; nj0 = jindex[n]; nj1 = jindex[n+1]; for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d = 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); rO = vec_madd(rsqO,rinvO,nul); rH1 = vec_madd(rsqH1,rinvH1,nul); rH2 = vec_madd(rsqH2,rinvH2,nul); /* load 4 j charges and multiply by iq */ jq=load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd); do_4_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO); do_4_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1); do_4_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); vctot = vec_madd(qqO,VVcO,vctot); fsO = vec_nmsub(qqO,FFcO,nul); fsH1 = vec_nmsub(qqH,FFcH1,nul); fsH2 = vec_nmsub(qqH,FFcH2,nul); vctot = vec_madd(qqH,VVcH1,vctot); fsO = vec_madd(fsO,tsc,nul); fsH1 = vec_madd(fsH1,tsc,nul); fsH2 = vec_madd(fsH2,tsc,nul); vctot = vec_madd(qqH,VVcH2,vctot); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,rinvH1,nul); fsH2 = vec_madd(fsH2,rinvH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = 
vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_3_to_4(dOx,dOy,dOz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-2)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c),nul,&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_element_in_3_vectors(&rsqO,&rsqH1,&rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); zero_highest_element_in_3_vectors(&rinvO,&rinvH1,&rinvH2); rO = vec_madd(rsqO,rinvO,nul); rH1 = vec_madd(rsqH1,rinvH1,nul); rH2 = vec_madd(rsqH2,rinvH2,nul); /* load 3 j charges and multiply by iq */ jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc); do_3_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO); do_3_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1); do_3_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2); 
qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); vctot = vec_madd(qqO,VVcO,vctot); fsO = vec_nmsub(qqO,FFcO,nul); fsH1 = vec_nmsub(qqH,FFcH1,nul); fsH2 = vec_nmsub(qqH,FFcH2,nul); vctot = vec_madd(qqH,VVcH1,vctot); fsO = vec_madd(fsO,tsc,nul); fsH1 = vec_madd(fsH1,tsc,nul); fsH2 = vec_madd(fsH2,tsc,nul); vctot = vec_madd(qqH,VVcH2,vctot); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,rinvH1,nul); fsH2 = vec_madd(fsH2,rinvH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_4_to_3(dOx,dOy,dOz,nul,&tmp1,&tmp2,&tmp3); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); } else if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); 
rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_2_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); zero_highest_2_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2); rO = vec_madd(rsqO,rinvO,nul); rH1 = vec_madd(rsqH1,rinvH1,nul); rH2 = vec_madd(rsqH2,rinvH2,nul); /* load 2 j charges and multiply by iq */ jq=load_2_float(charge+jnra,charge+jnrb); do_2_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO); do_2_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1); do_2_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); vctot = vec_madd(qqO,VVcO,vctot); fsO = vec_nmsub(qqO,FFcO,nul); fsH1 = vec_nmsub(qqH,FFcH1,nul); fsH2 = vec_nmsub(qqH,FFcH2,nul); vctot = vec_madd(qqH,VVcH1,vctot); fsO = vec_madd(fsO,tsc,nul); fsH1 = vec_madd(fsH1,tsc,nul); fsH2 = vec_madd(fsH2,tsc,nul); vctot = vec_madd(qqH,VVcH2,vctot); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,rinvH1,nul); fsH2 = vec_madd(fsH2,rinvH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_3_to_2(dOx,dOy,dOz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); } else if(k<nj1) { jnra = jjnr[k]; j3a = 3*jnra; 
transpose_1_to_3(load_xyz(pos+j3a),&dH2x,&dH2y,&dH2z); dOx = vec_sub(iOx,dH2x); dOy = vec_sub(iOy,dH2y); dOz = vec_sub(iOz,dH2z); dH1x = vec_sub(iH1x,dH2x); dH1y = vec_sub(iH1y,dH2y); dH1z = vec_sub(iH1z,dH2z); dH2x = vec_sub(iH2x,dH2x); dH2y = vec_sub(iH2y,dH2y); dH2z = vec_sub(iH2z,dH2z); rsqO = vec_madd(dOx,dOx,nul); rsqH1 = vec_madd(dH1x,dH1x,nul); rsqH2 = vec_madd(dH2x,dH2x,nul); rsqO = vec_madd(dOy,dOy,rsqO); rsqH1 = vec_madd(dH1y,dH1y,rsqH1); rsqH2 = vec_madd(dH2y,dH2y,rsqH2); rsqO = vec_madd(dOz,dOz,rsqO); rsqH1 = vec_madd(dH1z,dH1z,rsqH1); rsqH2 = vec_madd(dH2z,dH2z,rsqH2); zero_highest_3_elements_in_3_vectors(&rsqO,&rsqH1,&rsqH2); do_3_invsqrt(rsqO,rsqH1,rsqH2,&rinvO,&rinvH1,&rinvH2); zero_highest_3_elements_in_3_vectors(&rinvO,&rinvH1,&rinvH2); rO = vec_madd(rsqO,rinvO,nul); rH1 = vec_madd(rsqH1,rinvH1,nul); rH2 = vec_madd(rsqH2,rinvH2,nul); /* load 1 j charges and multiply by iq */ jq=load_1_float(charge+jnra); do_1_ctable_coul(VFtab,vec_madd(rO,tsc,nul),&VVcO,&FFcO); do_1_ctable_coul(VFtab,vec_madd(rH1,tsc,nul),&VVcH1,&FFcH1); do_1_ctable_coul(VFtab,vec_madd(rH2,tsc,nul),&VVcH2,&FFcH2); qqO = vec_madd(iqO,jq,nul); qqH = vec_madd(iqH,jq,nul); vctot = vec_madd(qqO,VVcO,vctot); fsO = vec_nmsub(qqO,FFcO,nul); fsH1 = vec_nmsub(qqH,FFcH1,nul); fsH2 = vec_nmsub(qqH,FFcH2,nul); vctot = vec_madd(qqH,VVcH1,vctot); fsO = vec_madd(fsO,tsc,nul); fsH1 = vec_madd(fsH1,tsc,nul); fsH2 = vec_madd(fsH2,tsc,nul); vctot = vec_madd(qqH,VVcH2,vctot); fsO = vec_madd(fsO,rinvO,nul); fsH1 = vec_madd(fsH1,rinvH1,nul); fsH2 = vec_madd(fsH2,rinvH2,nul); fiOx = vec_madd(fsO,dOx,fiOx); /* +=fx */ dOx = vec_nmsub(fsO,dOx,nul); /* -fx */ fiOy = vec_madd(fsO,dOy,fiOy); /* +=fy */ dOy = vec_nmsub(fsO,dOy,nul); /* -fy */ fiOz = vec_madd(fsO,dOz,fiOz); /* +=fz */ dOz = vec_nmsub(fsO,dOz,nul); /* -fz */ fiH1x = vec_madd(fsH1,dH1x,fiH1x); /* +=fx */ dOx = vec_nmsub(fsH1,dH1x,dOx); /* -fx */ fiH1y = vec_madd(fsH1,dH1y,fiH1y); /* +=fy */ dOy = vec_nmsub(fsH1,dH1y,dOy); /* -fy */ fiH1z = 
vec_madd(fsH1,dH1z,fiH1z); /* +=fz */ dOz = vec_nmsub(fsH1,dH1z,dOz); /* -fz */ fiH2x = vec_madd(fsH2,dH2x,fiH2x); /* +=fx */ dOx = vec_nmsub(fsH2,dH2x,dOx); /* -fx */ fiH2y = vec_madd(fsH2,dH2y,fiH2y); /* +=fy */ dOy = vec_nmsub(fsH2,dH2y,dOy); /* -fy */ fiH2z = vec_madd(fsH2,dH2z,fiH2z); /* +=fz */ dOz = vec_nmsub(fsH2,dH2z,dOz); /* -fz */ transpose_3_to_1(dOx,dOy,dOz,&tmp1); add_xyz_to_mem(faction+j3a,tmp1); } /* update outer data */ update_i_3atoms_forces(faction+ii3,fshift+is3, fiOx,fiOy,fiOz,fiH1x,fiH1y,fiH1z, fiH2x,fiH2y,fiH2z); add_vector_to_float(Vc+gid[n],vctot); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }
/*
 * Gromacs nonbonded kernel nb_kernel400nf
 * Coulomb interaction:     Generalized-Born
 * VdW interaction:         Not calculated
 * water optimization:      No
 * Calculate forces:        no
 *
 * Energy-only kernel for single-atom i/j pairs.  For every neighbour list
 * the plain Coulomb term qq/r of each pair is summed and added to
 * Vc[gid[n]].  The Generalized-Born table term (vgb) is evaluated from
 * GBtab for every pair as well, but in this kernel it is not added to any
 * accumulator.
 *
 * Arguments actually used here:
 *   p_nri, iinr, jindex, jjnr   neighbour-list description
 *   shift, shiftvec             per-list shift index and shift vectors (xyz)
 *   gid, Vc                     energy slot per list and the accumulator
 *   pos, charge, p_facel        coordinates, charges, charge prefactor
 *   invsqrta                    per-atom factor; the product of the i and j
 *                               entries scales both the GB charge product
 *                               and the table spacing
 *   p_gbtabscale, GBtab         GB table (four coefficients per point)
 *   p_nthreads, count, mtx      work distribution when GMX_THREADS is defined
 *   outeriter, inneriter        receive the outer/inner iteration counts
 * p_ntype, p_krf, p_crf and p_tabscale are dereferenced but their values
 * are not used; the remaining arguments are not touched.
 */
void nb_kernel400nf(int * p_nri, int * iinr, int * jindex, int * jjnr,
                    int * shift, real * shiftvec, real * fshift, int * gid,
                    real * pos, real * faction, real * charge,
                    real * p_facel, real * p_krf, real * p_crf,
                    real * Vc, int * type, int * p_ntype,
                    real * vdwparam, real * Vvdw,
                    real * p_tabscale, real * VFtab,
                    real * enerd1, real * enerd2, real * enerd3, real * enerd4,
                    int * start, int * end, int * homenr, int * nbsum,
                    real * invsqrta, real * dvda,
                    real * p_gbtabscale, real * GBtab,
                    int * p_nthreads, int * count, void * mtx,
                    int * outeriter, int * inneriter, real * work)
{
    int    nri, ntype, nthreads;
    real   facel, krf, crf, tabscale, gbtabscale;
    int    n, k, ii, ii3, is3, jnr, j3, ggid;
    int    nj0, nj1, nn0, nn1;
    int    nouter, ninner;
    int    n0;
    real   shX, shY, shZ;
    real   ix1, iy1, iz1;
    real   dx, dy, dz, rsq, rinv;
    real   iq, isai, isaj, isaprod, gbscale;
    real   qq, qqgb, vcoul, vgb, vctot;
    real   r, rt, eps, eps2;
    real   Y, F, Geps, Heps2, Fp, VV;
    real * tab;

    /* Unpack the scalars that arrive by reference */
    nri        = *p_nri;
    ntype      = *p_ntype;
    nthreads   = *p_nthreads;
    facel      = *p_facel;
    krf        = *p_krf;
    crf        = *p_crf;
    tabscale   = *p_tabscale;
    gbtabscale = *p_gbtabscale;

    nouter = 0;
    ninner = 0;

    /* Loop over thread workunits */
    do
    {
#ifdef GMX_THREADS
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        nn0    = *count;
        /* Take successively smaller chunks (at least 10 lists) */
        nn1    = nn0+(nri-nn0)/(2*nthreads)+10;
        *count = nn1;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if (nn1 > nri)
        {
            nn1 = nri;
        }
#else
        nn0 = 0;
        nn1 = nri;
#endif

        /* Outer loop over neighbour lists */
        for (n = nn0; n < nn1; n++)
        {
            /* Shift vector of this list */
            is3 = 3*shift[n];
            shX = shiftvec[is3];
            shY = shiftvec[is3+1];
            shZ = shiftvec[is3+2];

            /* Range of neighbours */
            nj0 = jindex[n];
            nj1 = jindex[n+1];

            /* Shifted i coordinates and i parameters */
            ii   = iinr[n];
            ii3  = 3*ii;
            ix1  = shX + pos[ii3+0];
            iy1  = shY + pos[ii3+1];
            iz1  = shZ + pos[ii3+2];
            iq   = facel*charge[ii];
            isai = invsqrta[ii];

            /* Per-list energy accumulator */
            vctot = 0;

            for (k = nj0; k < nj1; k++)
            {
                jnr = jjnr[k];
                j3  = 3*jnr;

                /* Distance between i and j */
                dx   = ix1 - pos[j3+0];
                dy   = iy1 - pos[j3+1];
                dz   = iz1 - pos[j3+2];
                rsq  = dx*dx+dy*dy+dz*dz;
                rinv = invsqrt(rsq);

                /* Pair parameters */
                isaj    = invsqrta[jnr];
                isaprod = isai*isaj;
                qq      = iq*charge[jnr];
                vcoul   = qq*rinv;
                qqgb    = isaprod*(-qq);
                gbscale = isaprod*gbtabscale;

                /* Tabulated Generalized-Born interaction */
                r    = rsq*rinv;
                rt   = r*gbscale;
                n0   = rt;
                eps  = rt-n0;
                eps2 = eps*eps;
                tab  = GBtab + 4*n0;

                Y     = tab[0];
                F     = tab[1];
                Geps  = eps*tab[2];
                Heps2 = eps2*tab[3];
                Fp    = F+Geps+Heps2;
                VV    = Y+eps*Fp;
                /* vgb is evaluated but not accumulated by this kernel */
                vgb   = qqgb*VV;

                vctot += vcoul;

                /* Inner loop uses 29 flops/iteration */
            }

            /* Add the list energy to the energy group of this list */
            ggid      = gid[n];
            Vc[ggid] += vctot;

            /* Count inner iterations */
            ninner += nj1 - nj0;

            /* Outer loop uses 5 flops/iteration */
        }

        /* Count outer iterations */
        nouter += nn1 - nn0;
    }
    while (nn1 < nri);

    /* Report the iteration counts to the caller */
    *outeriter = nouter;
    *inneriter = ninner;
}
/*
 * Helper for nb_kernel231nf: evaluate one table entry.
 *
 * tab points at four consecutive coefficients Y,F,G,H; eps is the
 * fractional table offset and eps2 its square.  Returns
 * Y + eps*(F + eps*G + eps2*H).
 */
static real nb_kernel231nf_tabval(const real * tab, real eps, real eps2)
{
    real  Y, F, Geps, Heps2, Fp;

    Y     = tab[0];
    F     = tab[1];
    Geps  = eps*tab[2];
    Heps2 = eps2*tab[3];
    Fp    = F+Geps+Heps2;

    return Y+eps*Fp;
}

/*
 * Gromacs nonbonded kernel nb_kernel231nf
 * Coulomb interaction:     Reaction field
 * VdW interaction:         Tabulated
 * water optimization:      SPC/TIP3P - other atoms
 * Calculate forces:        no
 *
 * Energy-only kernel for a three-site i molecule against single j atoms.
 * All three i sites interact with the j atom through reaction-field
 * Coulomb, qq*(1/r + krf*r^2 - crf); only site 1 has a tabulated VdW
 * interaction with it.
 *
 * Arguments actually used here:
 *   p_nri, iinr, jindex, jjnr   neighbour-list description
 *   shift, shiftvec             per-list shift index and shift vectors (xyz)
 *   gid, Vc, Vvdw               energy slot per list and the accumulators
 *   pos, charge, p_facel        coordinates, charges, charge prefactor
 *   p_krf, p_crf                reaction-field constants
 *   type, p_ntype, vdwparam     c6/c12 lookup for the (site 1, j) pair
 *   p_tabscale, VFtab           VdW table: eight coefficients per point,
 *                               dispersion first, repulsion second
 *   p_nthreads, count, mtx      work distribution when GMX_THREADS is defined
 *   outeriter, inneriter        receive the outer/inner iteration counts
 * The remaining arguments are not touched by this kernel.
 */
void nb_kernel231nf(int * p_nri, int * iinr, int * jindex, int * jjnr,
                    int * shift, real * shiftvec, real * fshift, int * gid,
                    real * pos, real * faction, real * charge,
                    real * p_facel, real * p_krf, real * p_crf,
                    real * Vc, int * type, int * p_ntype,
                    real * vdwparam, real * Vvdw,
                    real * p_tabscale, real * VFtab,
                    real * enerd1, real * enerd2, real * enerd3, real * enerd4,
                    int * start, int * end, int * homenr, int * nbsum,
                    real * invsqrta, real * dvda,
                    real * p_gbtabscale, real * GBtab,
                    int * p_nthreads, int * count, void * mtx,
                    int * outeriter, int * inneriter, real * work)
{
    int    nri, ntype, nthreads;
    real   facel, krf, crf, tabscale;
    int    n, k, ii, ii3, is3, jnr, j3, ggid;
    int    nj0, nj1, nn0, nn1;
    int    nouter, ninner;
    int    nti, tj, n0;
    real   shX, shY, shZ;
    real   ix1, iy1, iz1, ix2, iy2, iz2, ix3, iy3, iz3;
    real   jx1, jy1, jz1;
    real   dx, dy, dz;
    real   rsq11, rsq21, rsq31;
    real   rinv11, rinv21, rinv31;
    real   qO, qH, jq, qqO, qqH;
    real   c6, c12;
    real   krsq, vcoul, vctot;
    real   r, rt, eps, eps2;
    real   Vvdw6, Vvdw12, Vvdwtot;
    real * tab;

    /* Unpack the scalars that arrive by reference */
    nri       = *p_nri;
    ntype     = *p_ntype;
    nthreads  = *p_nthreads;
    facel     = *p_facel;
    krf       = *p_krf;
    crf       = *p_crf;
    tabscale  = *p_tabscale;

    /* Water data is taken once, from the molecule of the first list */
    ii   = iinr[0];
    qO   = facel*charge[ii];
    qH   = facel*charge[ii+1];
    nti  = 2*ntype*type[ii];

    nouter = 0;
    ninner = 0;

    /* Loop over thread workunits */
    do
    {
#ifdef GMX_THREADS
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        nn0    = *count;
        /* Take successively smaller chunks (at least 10 lists) */
        nn1    = nn0+(nri-nn0)/(2*nthreads)+10;
        *count = nn1;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if (nn1 > nri)
        {
            nn1 = nri;
        }
#else
        nn0 = 0;
        nn1 = nri;
#endif

        /* Outer loop over neighbour lists */
        for (n = nn0; n < nn1; n++)
        {
            /* Shift vector of this list */
            is3 = 3*shift[n];
            shX = shiftvec[is3];
            shY = shiftvec[is3+1];
            shZ = shiftvec[is3+2];

            /* Range of neighbours */
            nj0 = jindex[n];
            nj1 = jindex[n+1];

            /* Shifted coordinates of the three i sites */
            ii  = iinr[n];
            ii3 = 3*ii;
            ix1 = shX + pos[ii3+0];
            iy1 = shY + pos[ii3+1];
            iz1 = shZ + pos[ii3+2];
            ix2 = shX + pos[ii3+3];
            iy2 = shY + pos[ii3+4];
            iz2 = shZ + pos[ii3+5];
            ix3 = shX + pos[ii3+6];
            iy3 = shY + pos[ii3+7];
            iz3 = shZ + pos[ii3+8];

            /* Per-list energy accumulators */
            vctot   = 0;
            Vvdwtot = 0;

            for (k = nj0; k < nj1; k++)
            {
                jnr = jjnr[k];
                j3  = 3*jnr;

                jx1 = pos[j3+0];
                jy1 = pos[j3+1];
                jz1 = pos[j3+2];

                /* Squared distances from each i site to the j atom */
                dx    = ix1 - jx1;
                dy    = iy1 - jy1;
                dz    = iz1 - jz1;
                rsq11 = dx*dx+dy*dy+dz*dz;

                dx    = ix2 - jx1;
                dy    = iy2 - jy1;
                dz    = iz2 - jz1;
                rsq21 = dx*dx+dy*dy+dz*dz;

                dx    = ix3 - jx1;
                dy    = iy3 - jy1;
                dz    = iz3 - jz1;
                rsq31 = dx*dx+dy*dy+dz*dz;

                rinv11 = invsqrt(rsq11);
                rinv21 = invsqrt(rsq21);
                rinv31 = invsqrt(rsq31);

                /* Pair parameters */
                jq  = charge[jnr+0];
                qqO = qO*jq;
                tj  = nti+2*type[jnr];
                c6  = vdwparam[tj];
                c12 = vdwparam[tj+1];

                /* Site 1: reaction-field Coulomb */
                krsq   = krf*rsq11;
                vcoul  = qqO*(rinv11+krsq-crf);
                vctot += vcoul;

                /* Site 1: tabulated VdW, dispersion then repulsion */
                r    = rsq11*rinv11;
                rt   = r*tabscale;
                n0   = rt;
                eps  = rt-n0;
                eps2 = eps*eps;
                tab  = VFtab + 8*n0;

                Vvdw6   = c6*nb_kernel231nf_tabval(tab, eps, eps2);
                Vvdw12  = c12*nb_kernel231nf_tabval(tab+4, eps, eps2);
                Vvdwtot = Vvdwtot+ Vvdw6 + Vvdw12;

                /* Sites 2 and 3 share the same charge product */
                qqH = qH*jq;

                krsq   = krf*rsq21;
                vcoul  = qqH*(rinv21+krsq-crf);
                vctot += vcoul;

                krsq   = krf*rsq31;
                vcoul  = qqH*(rinv31+krsq-crf);
                vctot += vcoul;

                /* Inner loop uses 76 flops/iteration */
            }

            /* Add the list energies to the energy group of this list */
            ggid        = gid[n];
            Vc[ggid]   += vctot;
            Vvdw[ggid] += Vvdwtot;

            /* Count inner iterations */
            ninner += nj1 - nj0;

            /* Outer loop uses 11 flops/iteration */
        }

        /* Count outer iterations */
        nouter += nn1 - nn0;
    }
    while (nn1 < nri);

    /* Report the iteration counts to the caller */
    *outeriter = nouter;
    *inneriter = ninner;
}
/*
 * Gromacs nonbonded kernel nb_kernel300
 * Coulomb interaction:     Tabulated
 * VdW interaction:         Not calculated
 * water optimization:      No
 * Calculate forces:        yes
 *
 * Tabulated Coulomb energies and forces for single-atom i/j pairs.
 * For every neighbour list the pair forces are subtracted from faction at
 * the j atoms, the summed i force is added to faction at the i atom and to
 * fshift at the list's shift index, and the summed energy is added to
 * Vc[gid[n]].
 *
 * If enerd1 is non-NULL, the Coulomb energy of every pair is additionally
 * added to enerd1[index], where index is computed from start[], nbsum[]
 * and *homenr with the lower-numbered atom of the pair selecting the
 * start[]*(*homenr) - nbsum[] part and the other atom the final offset.
 *
 * Arguments actually used here:
 *   p_nri, iinr, jindex, jjnr   neighbour-list description
 *   shift, shiftvec, fshift     per-list shift index, shift vectors (xyz),
 *                               shift-force accumulator
 *   gid, Vc                     energy slot per list and the accumulator
 *   pos, faction                coordinates and force accumulator (xyz)
 *   charge, p_facel             charges and charge prefactor
 *   p_tabscale, VFtab           Coulomb table (four coefficients per point)
 *   enerd1, start, homenr, nbsum   optional per-pair energy bookkeeping
 *   p_nthreads, count, mtx      work distribution when GMX_THREADS is defined
 *   outeriter, inneriter        receive the outer/inner iteration counts
 * p_ntype, p_krf and p_crf are dereferenced but their values are not used;
 * the remaining arguments are not touched by this kernel.
 */
void nb_kernel300(int * p_nri, int * iinr, int * jindex, int * jjnr,
                  int * shift, real * shiftvec, real * fshift, int * gid,
                  real * pos, real * faction, real * charge,
                  real * p_facel, real * p_krf, real * p_crf,
                  real * Vc, int * type, int * p_ntype,
                  real * vdwparam, real * Vvdw,
                  real * p_tabscale, real * VFtab,
                  real * enerd1, real * enerd2, real * enerd3, real * enerd4,
                  int * start, int * end, int * homenr, int * nbsum,
                  real * invsqrta, real * dvda,
                  real * p_gbtabscale, real * GBtab,
                  int * p_nthreads, int * count, void * mtx,
                  int * outeriter, int * inneriter, real * work)
{
    int           nri,ntype,nthreads;
    real          facel,krf,crf,tabscale;
    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
    int           nn0,nn1,nouter,ninner;
    real          shX,shY,shZ;
    real          fscal,tx,ty,tz;
    real          iq;
    real          qq,vcoul,vctot;
    real          r,rt,eps,eps2;
    int           n0,nnn;
    real          Y,F,Geps,Heps2,Fp,VV;
    real          FF;
    real          fijC;
    real          ix1,iy1,iz1,fix1,fiy1,fiz1;
    real          jx1,jy1,jz1;
    real          dx11,dy11,dz11,rsq11,rinv11;
    int           index;

    nri              = *p_nri;
    ntype            = *p_ntype;
    nthreads         = *p_nthreads;
    facel            = *p_facel;
    krf              = *p_krf;
    crf              = *p_crf;
    tabscale         = *p_tabscale;

    /* Reset outer and inner iteration counters */
    nouter           = 0;
    ninner           = 0;

    /* Loop over thread workunits */
    do
    {
#ifdef GMX_THREADS
        gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
        nn0              = *count;
        /* Take successively smaller chunks (at least 10 lists) */
        nn1              = nn0+(nri-nn0)/(2*nthreads)+10;
        *count           = nn1;
        gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
        if(nn1>nri) nn1=nri;
#else
        nn0 = 0;
        nn1 = nri;
#endif
        /* Start outer loop over neighborlists */
        for(n=nn0; (n<nn1); n++)
        {
            /* Load shift vector for this list */
            is3              = 3*shift[n];
            shX              = shiftvec[is3];
            shY              = shiftvec[is3+1];
            shZ              = shiftvec[is3+2];

            /* Load limits for loop over neighbors */
            nj0              = jindex[n];
            nj1              = jindex[n+1];

            /* Get outer coordinate index */
            ii               = iinr[n];
            ii3              = 3*ii;

            /* Load i atom data, add shift vector */
            ix1              = shX + pos[ii3+0];
            iy1              = shY + pos[ii3+1];
            iz1              = shZ + pos[ii3+2];

            /* Load parameters for i atom */
            iq               = facel*charge[ii];

            /* Zero the potential energy for this list */
            vctot            = 0;

            /* Clear i atom forces */
            fix1             = 0;
            fiy1             = 0;
            fiz1             = 0;

            for(k=nj0; (k<nj1); k++)
            {
                /* Get j neighbor index, and coordinate index */
                jnr              = jjnr[k];
                j3               = 3*jnr;

                /* load j atom coordinates */
                jx1              = pos[j3+0];
                jy1              = pos[j3+1];
                jz1              = pos[j3+2];

                /* Calculate distance */
                dx11             = ix1 - jx1;
                dy11             = iy1 - jy1;
                dz11             = iz1 - jz1;
                rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;

                /* Calculate 1/r and 1/r2 */
                rinv11           = invsqrt(rsq11);

                /* Load parameters for j atom */
                qq               = iq*charge[jnr];

                /* Calculate table index */
                r                = rsq11*rinv11;
                rt               = r*tabscale;
                n0               = rt;
                eps              = rt-n0;
                eps2             = eps*eps;
                nnn              = 4*n0;

                /* Tabulated coulomb interaction */
                Y                = VFtab[nnn];
                F                = VFtab[nnn+1];
                Geps             = eps*VFtab[nnn+2];
                Heps2            = eps2*VFtab[nnn+3];
                Fp               = F+Geps+Heps2;
                VV               = Y+eps*Fp;
                FF               = Fp+Geps+2.0*Heps2;
                vcoul            = qq*VV;
                fijC             = qq*FF;
                vctot            = vctot + vcoul;
                fscal            = -((fijC)*tabscale)*rinv11;

                /* Optional per-pair energy bookkeeping.  The pair energy is
                 * added directly; going through the running total
                 * (subtract vctot before, add vctot after) needs two
                 * read-modify-writes and loses precision by cancellation
                 * once vctot is much larger than vcoul.
                 */
                if(enerd1)
                {
                    if(ii<jnr)
                    {
                        index = start[ii]*(*homenr) - nbsum[start[ii]] + start[jnr];
                    }
                    else
                    {
                        index = start[jnr]*(*homenr) - nbsum[start[jnr]] + start[ii];
                    }
                    enerd1[index]    = enerd1[index] + vcoul;
                }

                /* Calculate temporary vectorial force */
                tx               = fscal*dx11;
                ty               = fscal*dy11;
                tz               = fscal*dz11;

                /* Increment i atom force */
                fix1             = fix1 + tx;
                fiy1             = fiy1 + ty;
                fiz1             = fiz1 + tz;

                /* Decrement j atom force */
                faction[j3+0]    = faction[j3+0] - tx;
                faction[j3+1]    = faction[j3+1] - ty;
                faction[j3+2]    = faction[j3+2] - tz;

                /* Inner loop uses 42 flops/iteration */
            }

            /* Add i forces to mem and shifted force list */
            faction[ii3+0]   = faction[ii3+0] + fix1;
            faction[ii3+1]   = faction[ii3+1] + fiy1;
            faction[ii3+2]   = faction[ii3+2] + fiz1;
            fshift[is3]      = fshift[is3]+fix1;
            fshift[is3+1]    = fshift[is3+1]+fiy1;
            fshift[is3+2]    = fshift[is3+2]+fiz1;

            /* Add potential energies to the group for this list */
            ggid             = gid[n];
            Vc[ggid]         = Vc[ggid] + vctot;

            /* Increment number of inner iterations */
            ninner           = ninner + nj1 - nj0;

            /* Outer loop uses 11 flops/iteration */
        }

        /* Increment number of outer iterations */
        nouter           = nouter + nn1 - nn0;
    }
    while (nn1<nri);

    /* Write outer/inner iteration count to pointers */
    *outeriter       = nouter;
    *inneriter       = ninner;
}
void nb_kernel400_ppc_altivec (int * p_nri, int iinr[], int jindex[], int jjnr[], int shift[], float shiftvec[], float fshift[], int gid[], float pos[], float faction[], float charge[], float * p_facel, float * p_krf, float * p_crf, float Vc[], int type[], int * p_ntype, float vdwparam[], float Vvdw[], float * p_tabscale, float VFtab[], float invsqrta[], float dvda[], float * p_gbtabscale, float GBtab[], int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, float * work) { vector float ix,iy,iz,shvec; vector float vfacel,fs,nul; vector float dx,dy,dz; vector float vctot,qq,iq; vector float fix,fiy,fiz; vector float tmp1,tmp2,tmp3,tmp4; vector float rinv,r,rsq,VVc,FFc; vector float isai,isaj,isaprod,gbtsc,dvdasum,dvdaj,dvdatmp,gbscale,half; int n,k,ii,is3,ii3,nj0,nj1; int jnra,jnrb,jnrc,jnrd; int j3a,j3b,j3c,j3d; int nri, ntype, nouter, ninner; #ifdef GMX_THREADS int nn0, nn1; #endif nouter = 0; ninner = 0; nri = *p_nri; ntype = *p_ntype; nul=vec_zero(); half=vec_half(); vfacel=load_float_and_splat(p_facel); gbtsc=load_float_and_splat(p_gbtabscale); #ifdef GMX_THREADS nthreads = *p_nthreads; do { gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx); nn0 = *count; nn1 = nn0+(nri-nn0)/(2*nthreads)+3; *count = nn1; gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx); if(nn1>nri) nn1=nri; for(n=nn0; (n<nn1); n++) { #if 0 } /* maintain correct indentation even with conditional left braces */ #endif #else /* without gmx_threads */ for(n=0;n<nri;n++) { #endif is3 = 3*shift[n]; shvec = load_xyz(shiftvec+is3); ii = iinr[n]; ii3 = 3*ii; ix = load_xyz(pos+ii3); vctot = nul; dvdasum = nul; fix = nul; fiy = nul; fiz = nul; ix = vec_add(ix,shvec); nj0 = jindex[n]; nj1 = jindex[n+1]; splat_xyz_to_vectors(ix,&ix,&iy,&iz); iq = vec_madd(load_float_and_splat(charge+ii),vfacel,nul); isai = load_float_and_splat(invsqrta+ii); for(k=nj0; k<(nj1-3); k+=4) { jnra = jjnr[k]; jnrb = jjnr[k+1]; jnrc = jjnr[k+2]; jnrd = jjnr[k+3]; j3a = 3*jnra; j3b = 3*jnrb; j3c = 3*jnrc; j3d 
= 3*jnrd; transpose_4_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b), load_xyz(pos+j3c), load_xyz(pos+j3d),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); rinv = do_invsqrt(rsq); r = vec_madd(rinv,rsq,nul); /* load 1/sqrt(a2) and multiply with 1/sqrt(a1) */ isaj = load_4_float(invsqrta+jnra,invsqrta+jnrb, invsqrta+jnrc,invsqrta+jnrd); isaprod = vec_madd(isai,isaj,nul); /* load 4 j charges and multiply by iq and 1/sqrt(a1*a2) */ qq = vec_madd(load_4_float(charge+jnra,charge+jnrb, charge+jnrc,charge+jnrd),iq,nul); qq = vec_madd(isaprod,qq,nul); gbscale = vec_madd(isaprod,gbtsc,nul); do_4_ctable_coul(GBtab,vec_madd(r,gbscale,nul),&VVc,&FFc); dvdaj = load_4_float(dvda+jnra,dvda+jnrb, dvda+jnrc,dvda+jnrd); fs = vec_madd(qq,FFc,nul); fs = vec_madd(fs,gbscale,nul); vctot = vec_madd(qq,VVc,vctot); dvdatmp = vec_madd(fs,r,nul); dvdatmp = vec_madd(qq,VVc,dvdatmp); dvdasum = vec_sub(dvdasum,dvdatmp); dvdaj = vec_sub(dvdaj,dvdatmp); store_4_float(dvdaj,dvda+jnra,dvda+jnrb,dvda+jnrc,dvda+jnrd); fs = vec_nmsub(fs,rinv,nul); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); add_xyz_to_mem(faction+j3c,tmp3); add_xyz_to_mem(faction+j3d,tmp4); } if(k<(nj1-1)) { jnra = jjnr[k]; jnrb = jjnr[k+1]; j3a = 3*jnra; j3b = 3*jnrb; transpose_2_to_3(load_xyz(pos+j3a), load_xyz(pos+j3b),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_2_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_2_elements_in_vector(&rinv); r = vec_madd(rinv,rsq,nul); /* load 1/sqrt(a2) and 
multiply with 1/sqrt(a1) */ isaj = load_2_float(invsqrta+jnra,invsqrta+jnrb); isaprod = vec_madd(isai,isaj,nul); /* load 2 j charges and multiply by iq and 1/sqrt(a1*a2) */ qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul); qq = vec_madd(isaprod,qq,nul); gbscale = vec_madd(isaprod,gbtsc,nul); do_2_ctable_coul(GBtab,vec_madd(r,gbscale,nul),&VVc,&FFc); dvdaj = load_2_float(dvda+jnra,dvda+jnrb); fs = vec_madd(qq,FFc,nul); fs = vec_madd(fs,gbscale,nul); vctot = vec_madd(qq,VVc,vctot); dvdatmp = vec_madd(fs,r,nul); dvdatmp = vec_madd(qq,VVc,dvdatmp); dvdasum = vec_sub(dvdasum,dvdatmp); dvdaj = vec_sub(dvdaj,dvdatmp); store_2_float(dvdaj,dvda+jnra,dvda+jnrb); fs = vec_nmsub(fs,rinv,nul); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2); add_xyz_to_mem(faction+j3a,tmp1); add_xyz_to_mem(faction+j3b,tmp2); k += 2; } if((nj1-nj0) & 0x1) { jnra = jjnr[k]; j3a = 3*jnra; transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz); dx = vec_sub(ix,dx); dy = vec_sub(iy,dy); dz = vec_sub(iz,dz); rsq = vec_madd(dx,dx,nul); rsq = vec_madd(dy,dy,rsq); rsq = vec_madd(dz,dz,rsq); zero_highest_3_elements_in_vector(&rsq); rinv = do_invsqrt(rsq); zero_highest_3_elements_in_vector(&rinv); r = vec_madd(rinv,rsq,nul); /* load 1/sqrt(a2) and multiply with 1/sqrt(a1) */ isaj = load_1_float(invsqrta+jnra); isaprod = vec_madd(isai,isaj,nul); /* load 1 j charge and multiply by iq and 1/sqrt(a1*a2) */ qq = vec_madd(load_1_float(charge+jnra),iq,nul); qq = vec_madd(isaprod,qq,nul); gbscale = vec_madd(isaprod,gbtsc,nul); do_1_ctable_coul(GBtab,vec_madd(r,gbscale,nul),&VVc,&FFc); dvdaj = load_1_float(dvda+jnra); fs = vec_madd(qq,FFc,nul); fs = vec_madd(fs,gbscale,nul); vctot = vec_madd(qq,VVc,vctot); dvdatmp = vec_madd(fs,r,nul); dvdatmp = vec_madd(qq,VVc,dvdatmp); dvdasum = 
vec_sub(dvdasum,dvdatmp); dvdaj = vec_sub(dvdaj,dvdatmp); store_1_float(dvdaj,dvda+jnra); fs = vec_nmsub(fs,rinv,nul); fix = vec_madd(fs,dx,fix); /* +=fx */ fiy = vec_madd(fs,dy,fiy); /* +=fy */ fiz = vec_madd(fs,dz,fiz); /* +=fz */ dx = vec_nmsub(dx,fs,nul); /* -fx */ dy = vec_nmsub(dy,fs,nul); /* -fy */ dz = vec_nmsub(dz,fs,nul); /* -fz */ transpose_3_to_1(dx,dy,dz,&tmp1); add_xyz_to_mem(faction+j3a,tmp1); } /* update outer data */ transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4); tmp1 = vec_add(tmp1,tmp3); tmp2 = vec_add(tmp2,tmp4); tmp1 = vec_add(tmp1,tmp2); add_xyz_to_mem(faction+ii3,tmp1); add_xyz_to_mem(fshift+is3,tmp1); add_vector_to_float(Vc+gid[n],vctot); add_vector_to_float(dvda+ii,dvdasum); ninner += nj1 - nj0; } #ifdef GMX_THREADS nouter += nn1 - nn0; } while (nn1<nri); #else nouter = nri; #endif *outeriter = nouter; *inneriter = ninner; }