real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_pd(fr->epsfac); charge = mdatoms->chargeA;
real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256d one_sixth = _mm256_set1_pd(1.0/6.0); __m256d one_twelfth = _mm256_set1_pd(1.0/12.0); __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr;
__m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; real * vdwioffsetptr2; __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; real * vdwioffsetptr3; __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_pd(fr->epsfac); charge = mdatoms->chargeA;
// Scalar-by-vector division: o1 is broadcast to every lane and divided,
// lane by lane, by the packed doubles held in o2.
BI_FORCE_INLINE inline avx_double operator/(const double& o1, const avx_double& o2)
{
    const __m256d numerator = _mm256_set1_pd(o1);

    avx_double quotient;
    quotient.packed = _mm256_div_pd(numerator, o2.packed);
    return quotient;
}
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256d one_sixth = _mm256_set1_pd(1.0/6.0); __m256d one_twelfth = _mm256_set1_pd(1.0/12.0); __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid;
real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256d one_sixth = _mm256_set1_pd(1.0/6.0); __m256d one_twelfth = _mm256_set1_pd(1.0/12.0); __m128i ewitab; __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3; real *ewtab; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr;
__m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_pd(fr->epsfac); charge = mdatoms->chargeA;
__m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; real * vdwioffsetptr2; __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; real * vdwioffsetptr3; __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_pd(fr->epsfac); charge = mdatoms->chargeA; krf = _mm256_set1_pd(fr->ic->k_rf);
real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256d one_sixth = _mm256_set1_pd(1.0/6.0); __m256d one_twelfth = _mm256_set1_pd(1.0/12.0); __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid;
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i gbitab; __m256d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp; __m256d minushalf = _mm256_set1_pd(-0.5); real *invsqrta,*dvda,*gbtab; __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr;
real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; real * vdwgridioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256d one_sixth = _mm256_set1_pd(1.0/6.0); __m256d one_twelfth = _mm256_set1_pd(1.0/12.0); __m256d c6grid_00; real *vdwgridparam; __m256d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald; __m256d one_half = _mm256_set1_pd(0.5); __m256d minus_one = _mm256_set1_pd(-1.0); __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri;
__m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; real * vdwioffsetptr1; __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; real * vdwioffsetptr2; __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_pd(fr->epsfac); charge = mdatoms->chargeA; krf = _mm256_set1_pd(fr->ic->k_rf);
real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_pd(fr->epsfac); charge = mdatoms->chargeA; krf = _mm256_set1_pd(fr->ic->k_rf);
real * vdwioffsetptr1; __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; real * vdwioffsetptr2; __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256d one_sixth = _mm256_set1_pd(1.0/6.0); __m256d one_twelfth = _mm256_set1_pd(1.0/12.0); __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid;
void CalculateBasisComponents(const MDoubleArray& weights, const BaryCoords& coords, const MIntArray& triangleVertices, const MPointArray& points, const MFloatVectorArray& normals, const MIntArray& sampleIds, double* alignedStorage, MPoint& origin, MVector& up, MVector& normal) { // Start with the recreated point and normal using the barycentric coordinates of the hit point. unsigned int hitIndex = weights.length()-1; #ifdef __AVX__ __m256d originV = Dot4<MPoint>(coords[0], coords[1], coords[2], 0.0, points[triangleVertices[0]], points[triangleVertices[1]], points[triangleVertices[2]], MPoint::origin); __m256d hitNormalV = Dot4<MVector>(coords[0], coords[1], coords[2], 0.0, normals[triangleVertices[0]], normals[triangleVertices[1]], normals[triangleVertices[2]], MVector::zero); __m256d hitWeightV = _mm256_set1_pd(weights[hitIndex]); // Create the barycentric point and normal. __m256d normalV = _mm256_mul_pd(hitNormalV, hitWeightV); // Then use the weighted adjacent data. for (unsigned int j = 0; j < hitIndex; j += 4) { __m256d tempNormal = Dot4<MVector>(weights[j], weights[j+1], weights[j+2], weights[j+3], normals[sampleIds[j]], normals[sampleIds[j+1]], normals[sampleIds[j+2]], normals[sampleIds[j+3]]); normalV = _mm256_add_pd(tempNormal, normalV); } _mm256_store_pd(alignedStorage, originV); origin.x = alignedStorage[0]; origin.y = alignedStorage[1]; origin.z = alignedStorage[2]; _mm256_store_pd(alignedStorage, normalV); normal.x = alignedStorage[0]; normal.y = alignedStorage[1]; normal.z = alignedStorage[2]; // Calculate the up vector const MPoint& pt1 = points[triangleVertices[0]]; const MPoint& pt2 = points[triangleVertices[1]]; __m256d p1 = _mm256_set_pd(pt1.w, pt1.z, pt1.y, pt1.x); __m256d p2 = _mm256_set_pd(pt2.w, pt2.z, pt2.y, pt2.x); p1 = _mm256_add_pd(p1, p2); __m256d half = _mm256_set_pd(0.5, 0.5, 0.5, 0.5); p1 = _mm256_mul_pd(p1, half); __m256d upV = _mm256_sub_pd(p1, originV); _mm256_store_pd(alignedStorage, upV); up.x = alignedStorage[0]; up.y = 
alignedStorage[1]; up.z = alignedStorage[2]; #else MVector hitNormal; // Create the barycentric point and normal. for (int i = 0; i < 3; ++i) { origin += points[triangleVertices[i]] * coords[i]; hitNormal += MVector(normals[triangleVertices[i]]) * coords[i]; } // Use crawl data to calculate normal normal = hitNormal * weights[hitIndex]; for (unsigned int j = 0; j < hitIndex; j++) { normal += MVector(normals[sampleIds[j]]) * weights[j]; } // Calculate the up vector // The triangle vertices are sorted by decreasing barycentric coordinates so the first two are // the two closest vertices in the triangle. up = ((points[triangleVertices[0]] + points[triangleVertices[1]]) * 0.5) - origin; #endif normal.normalize(); GetValidUp(weights, points, sampleIds, origin, normal, up); }
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i gbitab; __m256d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp; __m256d minushalf = _mm256_set1_pd(-0.5); real *invsqrta,*dvda,*gbtab; int nvdwtype; __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256d one_sixth = _mm256_set1_pd(1.0/6.0); __m256d one_twelfth = _mm256_set1_pd(1.0/12.0); __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0);
static real probabilities_avx_hernquist(const AstronomyParameters* ap, const StreamConstants* sc, const real* RESTRICT sg_dx, const real* RESTRICT r_point, const real* RESTRICT qw_r3_N, LBTrig lbt, real gPrime, real reff_xr_rp3, real* RESTRICT streamTmps) { double bg_prob, dotted, xyz_norm; int i, j, k, convolve, nStreams; MW_ALIGN_V(64) double psgt[256], psgf[256], xyzstr[256]; MW_ALIGN_V(64) double xs[256], ys[256], zs[256]; const __m256d REF_XR = _mm256_set1_pd(reff_xr_rp3); const __m256d COSBL = _mm256_set1_pd(lbt.lCosBCos); const __m256d SINB = _mm256_set1_pd(lbt.bSin); const __m256d SINCOSBL = _mm256_set1_pd(lbt.lSinBCos); const __m256d SUNR0 = _mm256_set1_pd(ap->sun_r0); const __m256d R0 = _mm256_set1_pd(ap->r0); const __m256d QV_RECIP = _mm256_set1_pd(ap->q_inv); __m256d RI, QI; ssp_m256 xyz0, xyz1, xyz2, tmp0, tmp1, tmp2, PROD, PBXV, BGP; //xyz0, 1, 2 = x, y, z BGP.d = _mm256_setzero_pd(); convolve = ap->convolve; nStreams = ap->number_streams;
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256d one_sixth = _mm256_set1_pd(1.0/6.0); __m256d one_twelfth = _mm256_set1_pd(1.0/12.0); __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; real rswitch_scalar,d_scalar; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr;
/* Evaluate 2^x for four packed doubles.
 *
 * x is split into the integer produced by _mm256_cvtpd_epi32 (used to build the
 * power-of-two factor) and the remainder frac = x - round(x).  The remainder is
 * handled with the rational expression
 *
 *     1 + 2 * N / (Q(frac^2) - N),   N = frac * P(frac^2)
 *
 * where P and Q are the quadratic polynomials given by the constants below
 * (Q has leading coefficient 1).  The power-of-two factor is built by adding the
 * bias 1023 to the integer part and shifting it 52 bits left inside each 64-bit
 * lane.
 *
 * Lanes where |x| exceeds 1022 have the power-of-two factor masked to zero
 * before the final multiplication.
 */
static inline __m256d
gmx_mm256_exp2_pd(__m256d x)
{
    /* Largest |x| for which the power-of-two factor is kept */
    const __m256d max_abs_arg   = _mm256_set1_pd(1022.0);
    /* Bias added to the integer part before it is shifted into place */
    const __m128i exponent_bias = _mm_set1_epi32(1023);

    /* Numerator polynomial coefficients */
    const __m256d P2            = _mm256_set1_pd(2.30933477057345225087e-2);
    const __m256d P1            = _mm256_set1_pd(2.02020656693165307700e1);
    const __m256d P0            = _mm256_set1_pd(1.51390680115615096133e3);
    /* Denominator polynomial coefficients (Q2 == 1.0) */
    const __m256d Q1            = _mm256_set1_pd(2.33184211722314911771e2);
    const __m256d Q0            = _mm256_set1_pd(4.36821166879210612817e3);
    const __m256d one           = _mm256_set1_pd(1.0);
    const __m256d two           = _mm256_set1_pd(2.0);

    __m128i       biased_lo, biased_hi;
    __m256i       pow2_bits;
    __m256d       in_range, pow2;
    __m256d       nearest, frac, frac2;
    __m256d       numer, denom, ratio;

    /* Integer part, once as 32-bit integers (plus bias) and once as doubles */
    biased_lo = _mm_add_epi32(_mm256_cvtpd_epi32(x), exponent_bias);
    nearest   = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);

    /* The four 32-bit values must each be shifted 52 bits left within a 64-bit
     * lane, which needs two 128-bit registers: elements 0/1 stay in biased_lo,
     * elements 2/3 go to biased_hi. Each value is duplicated into the upper half
     * of its 64-bit lane; that copy is shifted out and replaced with zeros.
     */
    biased_hi = _mm_slli_epi64(_mm_shuffle_epi32(biased_lo, _MM_SHUFFLE(3, 3, 2, 2)), 52);
    biased_lo = _mm_slli_epi64(_mm_shuffle_epi32(biased_lo, _MM_SHUFFLE(1, 1, 0, 0)), 52);
    pow2_bits = _mm256_insertf128_si256(_mm256_castsi128_si256(biased_lo), biased_hi, 0x1);

    /* Zero the power-of-two factor wherever |x| is above the limit */
    in_range  = _mm256_cmp_pd(max_abs_arg, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
    pow2      = _mm256_and_pd(in_range, _mm256_castsi256_pd(pow2_bits));

    /* Remainder and its square */
    frac      = _mm256_sub_pd(x, nearest);
    frac2     = _mm256_mul_pd(frac, frac);

    /* numer = frac * ((P2*frac2 + P1)*frac2 + P0) */
    numer     = _mm256_add_pd(_mm256_mul_pd(P2, frac2), P1);
    numer     = _mm256_add_pd(_mm256_mul_pd(numer, frac2), P0);
    numer     = _mm256_mul_pd(numer, frac);

    /* denom = (frac2 + Q1)*frac2 + Q0 */
    denom     = _mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(frac2, Q1), frac2), Q0);

    ratio     = _mm256_mul_pd(numer, gmx_mm256_inv_pd(_mm256_sub_pd(denom, numer)));

    /* (1 + 2*ratio) scaled by the power-of-two factor */
    return _mm256_mul_pd(_mm256_add_pd(one, _mm256_mul_pd(two, ratio)), pow2);
}
__m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i ewitab; __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3; real *ewtab; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_pd(fr->epsfac); charge = mdatoms->chargeA;
real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256d one_sixth = _mm256_set1_pd(1.0/6.0); __m256d one_twelfth = _mm256_set1_pd(1.0/12.0); __m128i ewitab; __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3; real *ewtab; __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; real rswitch_scalar,d_scalar; __m256d dummy_mask,cutoff_mask; __m128 tmpmask0,tmpmask1; __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) ); __m256d one = _mm256_set1_pd(1.0); __m256d two = _mm256_set1_pd(2.0); x = xx[0]; f = ff[0];
Color3 sampleFourier3(float * const coeffs[3], const double *recip, size_t nCoeffs, Float sample, Float &pdf, Float &phi) { bool flip = false; if (sample < 0.5f) { sample *= 2.0f; } else { sample = 1.0f - 2.0f * (sample - 0.5f); flip = true; } int iterations = 0; double a = 0.0, c = math::Pi_d, coeff0 = coeffs[0][0], y = coeff0*math::Pi_d*sample, deriv = 0.0, b = 0.5 * math::Pi_d, cosB = 0, sinB = 1; if (nCoeffs > 10 && sample != 0 && sample != 1) { float stddev = std::sqrt(2.0f / 3.0f * std::log(coeffs[0][1] / coeffs[0][2])); if (std::isfinite(stddev)) { b = std::min(c, (double) math::normal_quantile(0.5f + sample / 2) * stddev); cosB = std::cos(b); sinB = std::sqrt(1 - cosB * cosB); } } #if FOURIER_SCALAR != 1 __m256d factorB_prev, factorB_cur; #endif while (true) { #if FOURIER_SCALAR == 1 double cosB_prev = cosB, sinB_prev = -sinB, sinB_cur = 0.0, cosB_cur = 1.0, value = coeff0 * b; deriv = coeff0; for (size_t j=1; j<nCoeffs; ++j) { double sinB_next = 2.0*cosB*sinB_cur - sinB_prev, cosB_next = 2.0*cosB*cosB_cur - cosB_prev, coeff = (double) coeffs[0][j]; value += coeff * recip[j] * sinB_next; deriv += coeff * cosB_next; sinB_prev = sinB_cur; sinB_cur = sinB_next; cosB_prev = cosB_cur; cosB_cur = cosB_next; } #else initializeRecurrence(cosB, factorB_prev, factorB_cur); __m256d sinB_prev = _mm256_set1_pd(-sinB), sinB_cur = _mm256_set1_pd(0.0), cosB_prev = _mm256_set1_pd(cosB), cosB_cur = _mm256_set1_pd(1.0), value_vec = _mm256_set_sd(coeff0 * b), deriv_vec = _mm256_set_sd(coeff0); for (size_t j=1; j<nCoeffs; j+=4) { __m128 coeff_vec_f = _mm_load_ps(coeffs[0]+j); __m256d recip_vec = _mm256_load_pd(recip+j); __m256d coeff_vec = _mm256_cvtps_pd(coeff_vec_f); __m256d sinB_next = _mm256_add_pd( _mm256_mul_pd(factorB_prev, sinB_prev), _mm256_mul_pd(factorB_cur, sinB_cur)); __m256d cosB_next = _mm256_add_pd( _mm256_mul_pd(factorB_prev, cosB_prev), _mm256_mul_pd(factorB_cur, cosB_cur)); value_vec = _mm256_add_pd(value_vec, _mm256_mul_pd( _mm256_mul_pd(recip_vec, 
coeff_vec), sinB_next)); deriv_vec = _mm256_add_pd(deriv_vec, _mm256_mul_pd(coeff_vec, cosB_next)); sinB_prev = _mm256_splat2_pd(sinB_next); cosB_prev = _mm256_splat2_pd(cosB_next); sinB_cur = _mm256_splat3_pd(sinB_next); cosB_cur = _mm256_splat3_pd(cosB_next); } double value = simd::hadd(value_vec); deriv = simd::hadd(deriv_vec); #endif value -= y; if (std::abs(value) <= 1e-5 * coeff0 || ++iterations > 20) break; else if (value > 0.0) c = b; else a = b; b -= value / deriv; if (!(b >= a && b <= c)) b = 0.5f * (a + c); cosB = std::cos(b); sinB = std::sqrt(1-cosB*cosB); } double Y = deriv; if (flip) b = 2.0*math::Pi_d - b; pdf = (Float) (math::InvTwoPi_d * Y / coeff0); phi = (Float) b; #if FOURIER_SCALAR == 1 double cosB_prev = cosB, cosB_cur = 1.0; double R = coeffs[1][0]; double B = coeffs[2][0]; for (size_t j=1; j<nCoeffs; ++j) { double cosB_next = 2.0*cosB*cosB_cur - cosB_prev, coeffR = (double) coeffs[1][j], coeffB = (double) coeffs[2][j]; R += coeffR * cosB_next; B += coeffB * cosB_next; cosB_prev = cosB_cur; cosB_cur = cosB_next; } #else __m256d cosB_prev = _mm256_set1_pd(cosB), cosB_cur = _mm256_set1_pd(1.0), R_vec = _mm256_set_sd(coeffs[1][0]), B_vec = _mm256_set_sd(coeffs[2][0]); for (size_t j=1; j<nCoeffs; j+=4) { __m128 coeff_R_vec_f = _mm_load_ps(coeffs[1]+j); __m128 coeff_B_vec_f = _mm_load_ps(coeffs[2]+j); __m256d coeff_R_vec = _mm256_cvtps_pd(coeff_R_vec_f); __m256d coeff_B_vec = _mm256_cvtps_pd(coeff_B_vec_f); __m256d cosB_next = _mm256_add_pd( _mm256_mul_pd(factorB_prev, cosB_prev), _mm256_mul_pd(factorB_cur, cosB_cur)); R_vec = _mm256_add_pd(R_vec, _mm256_mul_pd(coeff_R_vec, cosB_next)); B_vec = _mm256_add_pd(B_vec, _mm256_mul_pd(coeff_B_vec, cosB_next)); cosB_prev = _mm256_splat2_pd(cosB_next); cosB_cur = _mm256_splat3_pd(cosB_next); } double R = simd::hadd(R_vec); double B = simd::hadd(B_vec); #endif double G = 1.39829 * Y - 0.100913 * B - 0.297375 * R; return Color3((Float) R, (Float) G, (Float) B) * (2 * math::Pi) * (Float) (coeff0 / Y); }
// Vector-by-scalar multiplication: o2 is broadcast to every lane and
// multiplied, lane by lane, with the packed doubles held in o1.
BI_FORCE_INLINE inline avx_double operator*(const avx_double& o1, const double& o2)
{
    const __m256d factor = _mm256_set1_pd(o2);

    avx_double product;
    product.packed = _mm256_mul_pd(o1.packed, factor);
    return product;
}