real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
    real             *vftab;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    real *           vdwioffsetptr2;
    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    real *           vdwioffsetptr3;
    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
示例#4
0
/// Scalar-by-vector division: broadcasts `o1` into every lane of a 256-bit
/// double vector and divides it lane-wise by `o2.packed`.
///
/// @param o1 scalar numerator, replicated across all lanes
/// @param o2 packed denominator
/// @return   an avx_double whose `packed` member holds the lane-wise quotients
BI_FORCE_INLINE inline avx_double operator/(const double& o1,
    const avx_double& o2) {
  avx_double quotient;
  const __m256d numerator = _mm256_set1_pd(o1);
  quotient.packed = _mm256_div_pd(numerator, o2.packed);
  return quotient;
}
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m128i          ewitab;
    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
    real             *ewtab;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
    real             *vftab;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    real *           vdwioffsetptr2;
    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    real *           vdwioffsetptr3;
    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    krf              = _mm256_set1_pd(fr->ic->k_rf);
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128i          gbitab;
    __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
    __m256d          minushalf = _mm256_set1_pd(-0.5);
    real             *invsqrta,*dvda,*gbtab;
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
    real             *vftab;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    real *           vdwgridioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m256d           c6grid_00;
    real             *vdwgridparam;
    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
    __m256d           one_half  = _mm256_set1_pd(0.5);
    __m256d           minus_one = _mm256_set1_pd(-1.0);
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    real *           vdwgridioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m256d           c6grid_00;
    real             *vdwgridparam;
    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
    __m256d           one_half  = _mm256_set1_pd(0.5);
    __m256d           minus_one = _mm256_set1_pd(-1.0);
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    real *           vdwioffsetptr1;
    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    real *           vdwioffsetptr2;
    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    krf              = _mm256_set1_pd(fr->ic->k_rf);
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    krf              = _mm256_set1_pd(fr->ic->k_rf);
    real *           vdwioffsetptr1;
    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    real *           vdwioffsetptr2;
    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
示例#17
0
void CalculateBasisComponents(const MDoubleArray& weights, const BaryCoords& coords,
                              const MIntArray& triangleVertices, const MPointArray& points,
                              const MFloatVectorArray& normals, const MIntArray& sampleIds,
                              double* alignedStorage,
                              MPoint& origin, MVector& up, MVector& normal) {
  // Start with the recreated point and normal using the barycentric coordinates of the hit point.
  unsigned int hitIndex = weights.length()-1;
#ifdef __AVX__
  __m256d originV = Dot4<MPoint>(coords[0], coords[1], coords[2], 0.0,
                                points[triangleVertices[0]], points[triangleVertices[1]],
                                points[triangleVertices[2]], MPoint::origin);
  __m256d hitNormalV = Dot4<MVector>(coords[0], coords[1], coords[2], 0.0,
                                normals[triangleVertices[0]], normals[triangleVertices[1]],
                                normals[triangleVertices[2]], MVector::zero);
  __m256d hitWeightV = _mm256_set1_pd(weights[hitIndex]);
  // Create the barycentric point and normal.
  __m256d normalV = _mm256_mul_pd(hitNormalV, hitWeightV);
  // Then use the weighted adjacent data.
  for (unsigned int j = 0; j < hitIndex; j += 4) {
    __m256d tempNormal = Dot4<MVector>(weights[j], weights[j+1], weights[j+2], weights[j+3],
                                       normals[sampleIds[j]], normals[sampleIds[j+1]],
                                       normals[sampleIds[j+2]], normals[sampleIds[j+3]]);
    normalV = _mm256_add_pd(tempNormal, normalV);
  }

  _mm256_store_pd(alignedStorage, originV);
  origin.x = alignedStorage[0];
  origin.y = alignedStorage[1];
  origin.z = alignedStorage[2];
  _mm256_store_pd(alignedStorage, normalV);
  normal.x = alignedStorage[0];
  normal.y = alignedStorage[1];
  normal.z = alignedStorage[2];

  // Calculate the up vector
  const MPoint& pt1 = points[triangleVertices[0]];
  const MPoint& pt2 = points[triangleVertices[1]];
  __m256d p1 = _mm256_set_pd(pt1.w, pt1.z, pt1.y, pt1.x);
  __m256d p2 = _mm256_set_pd(pt2.w, pt2.z, pt2.y, pt2.x);
  p1 = _mm256_add_pd(p1, p2);
  __m256d half = _mm256_set_pd(0.5, 0.5, 0.5, 0.5);
  p1 = _mm256_mul_pd(p1, half);
  __m256d upV = _mm256_sub_pd(p1, originV);
  _mm256_store_pd(alignedStorage, upV);
  up.x = alignedStorage[0];
  up.y = alignedStorage[1];
  up.z = alignedStorage[2];
#else
  MVector hitNormal;
  // Create the barycentric point and normal.
  for (int i = 0; i < 3; ++i) {
    origin += points[triangleVertices[i]] * coords[i];
    hitNormal += MVector(normals[triangleVertices[i]]) * coords[i];
  }
  // Use crawl data to calculate normal
  normal = hitNormal * weights[hitIndex];
  for (unsigned int j = 0; j < hitIndex; j++) {
    normal += MVector(normals[sampleIds[j]]) * weights[j];
  }

  // Calculate the up vector
  // The triangle vertices are sorted by decreasing barycentric coordinates so the first two are
  // the two closest vertices in the triangle.
  up = ((points[triangleVertices[0]] + points[triangleVertices[1]]) * 0.5) - origin;
#endif
  normal.normalize();
  GetValidUp(weights, points, sampleIds, origin, normal, up);
}
 int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 real             rcutoff_scalar;
 real             *shiftvec,*fshift,*x,*f;
 real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 real             scratch[4*DIM];
 __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 real *           vdwioffsetptr0;
 __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
 real             *charge;
 __m128i          gbitab;
 __m256d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
 __m256d          minushalf = _mm256_set1_pd(-0.5);
 real             *invsqrta,*dvda,*gbtab;
 int              nvdwtype;
 __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 int              *vdwtype;
 real             *vdwparam;
 __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
 __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
 __m128i          vfitab;
 __m128i          ifour       = _mm_set1_epi32(4);
 __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 real             *vftab;
 __m256d          dummy_mask,cutoff_mask;
 __m128           tmpmask0,tmpmask1;
 __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
 __m256d          one     = _mm256_set1_pd(1.0);
static real probabilities_avx_hernquist(const AstronomyParameters* ap,
                              const StreamConstants* sc,
                              const real* RESTRICT sg_dx,
                              const real* RESTRICT r_point,
                              const real* RESTRICT qw_r3_N,
                              LBTrig lbt,
                              real gPrime,
                              real reff_xr_rp3,
                              real* RESTRICT streamTmps)
{
    double bg_prob, dotted, xyz_norm;
    int i, j, k, convolve, nStreams;
    MW_ALIGN_V(64) double psgt[256], psgf[256], xyzstr[256];
    MW_ALIGN_V(64) double xs[256], ys[256], zs[256];

    const __m256d REF_XR = _mm256_set1_pd(reff_xr_rp3);

    const __m256d COSBL    = _mm256_set1_pd(lbt.lCosBCos);
    const __m256d SINB     = _mm256_set1_pd(lbt.bSin);
    const __m256d SINCOSBL = _mm256_set1_pd(lbt.lSinBCos);
    const __m256d SUNR0    = _mm256_set1_pd(ap->sun_r0);
    const __m256d R0       = _mm256_set1_pd(ap->r0);
    const __m256d QV_RECIP = _mm256_set1_pd(ap->q_inv);
    __m256d RI, QI;
    ssp_m256 xyz0, xyz1, xyz2, tmp0, tmp1, tmp2, PROD, PBXV, BGP;
    //xyz0, 1, 2 = x, y, z
    BGP.d = _mm256_setzero_pd();

    convolve = ap->convolve;
    nStreams = ap->number_streams;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
/*
 * Evaluate 2^x for the four packed doubles in x.
 *
 * Each lane is split as x = n + z, where n is x rounded to the nearest
 * integer. 2^n is built directly by placing the biased exponent (n + 1023)
 * into the exponent field of a double, and 2^z is obtained from the rational
 * expression
 *
 *     2^z ~= 1 + 2 * z*P(z^2) / (Q(z^2) - z*P(z^2))
 *
 * with P of degree 2 and Q monic of degree 2 in z^2.
 *
 * Lanes whose |x| exceeds 1022 have the 2^n factor masked to all-zero bits,
 * so for finite out-of-range input the returned lane is zero instead of an
 * overflowed or denormal value.
 *
 * NOTE(review): the integer conversion follows the current MXCSR rounding
 * mode while the double rounding is forced to nearest; the two agree under
 * the default rounding mode - confirm no caller changes it.
 */
static inline __m256d gmx_mm256_exp2_pd(__m256d x)
{
    /* Largest |x| for which x + 1023 is still a valid biased exponent of a normal double. */
    const __m256d max_abs_arg = _mm256_set1_pd(1022.0);
    const __m128i exp_bias    = _mm_set1_epi32(1023);

    /* Numerator polynomial P(z^2) = (p2*z^2 + p1)*z^2 + p0 */
    const __m256d p2          = _mm256_set1_pd(2.30933477057345225087e-2);
    const __m256d p1          = _mm256_set1_pd(2.02020656693165307700e1);
    const __m256d p0          = _mm256_set1_pd(1.51390680115615096133e3);
    /* Denominator polynomial Q(z^2) = (z^2 + q1)*z^2 + q0; the leading coefficient is 1.0 */
    const __m256d q1          = _mm256_set1_pd(2.33184211722314911771e2);
    const __m256d q0          = _mm256_set1_pd(4.36821166879210612817e3);
    const __m256d c_one       = _mm256_set1_pd(1.0);
    const __m256d c_two       = _mm256_set1_pd(2.0);

    /* Integer part of x: four 32-bit integers (already biased) and the same value as doubles. */
    const __m128i biased      = _mm_add_epi32(_mm256_cvtpd_epi32(x), exp_bias);
    const __m256d nearest     = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);

    /* Move each 32-bit biased exponent to bit 52 of its own 64-bit lane.
     * The four integers live in one 128-bit register, so they are first
     * duplicated pairwise into two registers (elements 0/1 and 2/3); the
     * duplicate in the upper half of every 64-bit lane is shifted out.
     */
    const __m128i lanes01     = _mm_slli_epi64(_mm_shuffle_epi32(biased, _MM_SHUFFLE(1, 1, 0, 0)), 52);
    const __m128i lanes23     = _mm_slli_epi64(_mm_shuffle_epi32(biased, _MM_SHUFFLE(3, 3, 2, 2)), 52);
    const __m256i exp_bits    = _mm256_insertf128_si256(_mm256_castsi128_si256(lanes01), lanes23, 0x1);

    /* 2^n, or all-zero bits where |x| is above the limit (ordered compare, so NaN lanes are masked too). */
    const __m256d in_range    = _mm256_cmp_pd(max_abs_arg, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
    const __m256d pow2_int    = _mm256_and_pd(in_range, _mm256_castsi256_pd(exp_bits));

    /* Remainder z = x - n and its square. */
    const __m256d frac        = _mm256_sub_pd(x, nearest);
    const __m256d frac_sq     = _mm256_mul_pd(frac, frac);

    /* numer = z * P(z^2), denom = Q(z^2) */
    const __m256d p_upper     = _mm256_add_pd(_mm256_mul_pd(p2, frac_sq), p1);
    const __m256d q_upper     = _mm256_add_pd(frac_sq, q1);
    const __m256d p_full      = _mm256_add_pd(_mm256_mul_pd(p_upper, frac_sq), p0);
    const __m256d denom       = _mm256_add_pd(_mm256_mul_pd(q_upper, frac_sq), q0);
    const __m256d numer       = _mm256_mul_pd(p_full, frac);

    /* 2^z = 1 + 2 * numer / (denom - numer); gmx_mm256_inv_pd is a helper
     * defined elsewhere (presumably a packed reciprocal). */
    const __m256d ratio       = _mm256_mul_pd(numer, gmx_mm256_inv_pd(_mm256_sub_pd(denom, numer)));
    const __m256d pow2_frac   = _mm256_add_pd(c_one, _mm256_mul_pd(c_two, ratio));

    return _mm256_mul_pd(pow2_frac, pow2_int);
}
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128i          ewitab;
    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
    real             *ewtab;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m128i          ewitab;
    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
    real             *ewtab;
    __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
    real             *vftab;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
示例#25
0
/**
 * Samples an angle from the cosine series stored in coeffs[0] and evaluates
 * the three series coeffs[0..2] at that angle.
 *
 * The function looks for the angle b in [0, pi] at which
 *
 *     coeffs[0][0]*b + sum_{j>=1} coeffs[0][j] * recip[j] * sin(j*b)
 *
 * equals coeffs[0][0] * pi * sample, using Newton steps that fall back to
 * bisection whenever a step leaves the current bracket [a, c].
 * (recip[j] is presumably 1/j, which would make the expression the integral
 * of the cosine series - confirm against the caller.)
 *
 * \param coeffs   Three coefficient arrays. coeffs[0] drives the sampling and
 *                 its value at the sampled angle is used as "Y"; coeffs[1]
 *                 and coeffs[2] are accumulated as "R" and "B".
 * \param recip    Per-index factors multiplied into the sine terms.
 * \param nCoeffs  Number of coefficients in each array.
 * \param sample   Input variate (presumably uniform in [0, 1] - confirm).
 *                 Values below 0.5 are stretched by 2; the rest are mirrored
 *                 and the resulting angle is reflected to 2*pi - b.
 * \param pdf      Output: InvTwoPi * Y / coeffs[0][0].
 * \param phi      Output: the sampled angle.
 * \return         Color3(R, G, B) multiplied by 2*pi * coeffs[0][0] / Y, where
 *                 G is a fixed linear combination of Y, B and R.
 *
 * NOTE(review): the non-scalar path loads four coefficients at a time starting
 * at index j with the aligned loads _mm_load_ps / _mm256_load_pd, so the
 * arrays presumably have to be laid out (aligned and padded) for that - confirm.
 */
Color3 sampleFourier3(float * const coeffs[3], const double *recip, size_t nCoeffs,
        Float sample, Float &pdf, Float &phi) {
    /* Fold the sample onto one half of the domain; 'flip' remembers that the
       result has to be mirrored at the end. */
    bool flip = false;
    if (sample < 0.5f) {
        sample *= 2.0f;
    } else {
        sample = 1.0f - 2.0f * (sample - 0.5f);
        flip = true;
    }

    int iterations = 0;

    /* a, c  : current bracket of the root
       y     : target value of the integrated series
       b     : current estimate (starts at the bracket midpoint pi/2)
       deriv : series coeffs[0] evaluated at b (derivative of the integral) */
    double a = 0.0,
           c = math::Pi_d,
           coeff0 = coeffs[0][0],
           y = coeff0*math::Pi_d*sample,
           deriv = 0.0,
           b = 0.5 * math::Pi_d,
           cosB = 0,
           sinB = 1;

    /* With more than 10 coefficients, replace the midpoint start by a guess
       built from the ratio coeffs[0][1] / coeffs[0][2] and
       math::normal_quantile; the guess is only used when it is finite.
       NOTE(review): looks like a Gaussian-shaped initial guess - confirm. */
    if (nCoeffs > 10 && sample != 0 && sample != 1) {
        float stddev = std::sqrt(2.0f / 3.0f * std::log(coeffs[0][1] / coeffs[0][2]));
        if (std::isfinite(stddev)) {
            b = std::min(c, (double) math::normal_quantile(0.5f + sample / 2) * stddev);
            cosB = std::cos(b);
            sinB = std::sqrt(1 - cosB * cosB);
        }
    }

    /* Recurrence factors of the vector path; declared outside the loop
       because the R/B evaluation after the loop reuses the values set in
       the final iteration. */
    #if FOURIER_SCALAR != 1
        __m256d factorB_prev, factorB_cur;
    #endif

    while (true) {
        #if FOURIER_SCALAR == 1
            /* Scalar path: sin(j*b) and cos(j*b) are generated with the
               three-term recurrence t_{j+1} = 2*cos(b)*t_j - t_{j-1},
               seeded so that the first generated terms are sin(b) and cos(b). */
            double cosB_prev = cosB,
                   sinB_prev = -sinB,
                   sinB_cur  = 0.0,
                   cosB_cur  = 1.0,
                   value     = coeff0 * b;

            deriv = coeff0;

            for (size_t j=1; j<nCoeffs; ++j) {
                double sinB_next = 2.0*cosB*sinB_cur - sinB_prev,
                       cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
                       coeff     = (double) coeffs[0][j];

                value += coeff * recip[j] * sinB_next;
                deriv += coeff * cosB_next;

                sinB_prev = sinB_cur; sinB_cur = sinB_next;
                cosB_prev = cosB_cur; cosB_cur = cosB_next;
            }
        #else
            /* Vector path: four series terms per iteration. The helpers
               initializeRecurrence, _mm256_set_sd, _mm256_splat2_pd,
               _mm256_splat3_pd and simd::hadd are defined elsewhere;
               presumably they set up a four-wide form of the same recurrence,
               broadcast lanes 2 and 3, and sum the lanes - confirm. */
            initializeRecurrence(cosB, factorB_prev, factorB_cur);

            __m256d
                sinB_prev  = _mm256_set1_pd(-sinB),
                sinB_cur   = _mm256_set1_pd(0.0),
                cosB_prev  = _mm256_set1_pd(cosB),
                cosB_cur   = _mm256_set1_pd(1.0),
                value_vec  = _mm256_set_sd(coeff0 * b),
                deriv_vec  = _mm256_set_sd(coeff0);

            for (size_t j=1; j<nCoeffs; j+=4) {
                __m128 coeff_vec_f = _mm_load_ps(coeffs[0]+j);
                __m256d recip_vec  = _mm256_load_pd(recip+j);
                __m256d coeff_vec  = _mm256_cvtps_pd(coeff_vec_f);

                __m256d sinB_next = _mm256_add_pd(
                        _mm256_mul_pd(factorB_prev, sinB_prev),
                        _mm256_mul_pd(factorB_cur, sinB_cur));

                __m256d cosB_next = _mm256_add_pd(
                        _mm256_mul_pd(factorB_prev, cosB_prev),
                        _mm256_mul_pd(factorB_cur, cosB_cur));

                value_vec = _mm256_add_pd(value_vec, _mm256_mul_pd(
                    _mm256_mul_pd(recip_vec, coeff_vec), sinB_next));
                deriv_vec = _mm256_add_pd(deriv_vec, _mm256_mul_pd(coeff_vec, cosB_next));

                sinB_prev = _mm256_splat2_pd(sinB_next);
                cosB_prev = _mm256_splat2_pd(cosB_next);
                sinB_cur  = _mm256_splat3_pd(sinB_next);
                cosB_cur  = _mm256_splat3_pd(cosB_next);
            }

            double value = simd::hadd(value_vec);
            deriv = simd::hadd(deriv_vec);
        #endif

        /* Residual of the equation being solved. */
        value -= y;

        /* Stop once the residual is within 1e-5 * coeff0 or the iteration
           counter exceeds 20; otherwise shrink the bracket towards the side
           that still contains the root. */
        if (std::abs(value) <= 1e-5 * coeff0 || ++iterations > 20)
            break;
        else if (value > 0.0)
            c = b;
        else
            a = b;

        /* Newton step. */
        b -= value / deriv;

        /* Fall back to bisection when the step leaves [a, c]; the negated
           form also catches a NaN result of the division. */
        if (!(b >= a && b <= c))
            b = 0.5f * (a + c);

        cosB = std::cos(b);
        sinB = std::sqrt(1-cosB*cosB);
    }

    /* 'deriv' holds the coeffs[0] series evaluated at the final b. */
    double Y = deriv;
    if (flip)
        b = 2.0*math::Pi_d - b;

    pdf = (Float) (math::InvTwoPi_d * Y / coeff0);
    phi = (Float) b;

    /* Evaluate the cosine series of coeffs[1] (R) and coeffs[2] (B) at the
       sampled angle. cosB is still the cosine of the unmirrored b. */
    #if FOURIER_SCALAR == 1
        double cosB_prev = cosB,
               cosB_cur  = 1.0;

        double R = coeffs[1][0];
        double B = coeffs[2][0];

        for (size_t j=1; j<nCoeffs; ++j) {
            double cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
                   coeffR    = (double) coeffs[1][j],
                   coeffB    = (double) coeffs[2][j];

            R += coeffR * cosB_next;
            B += coeffB * cosB_next;

            cosB_prev = cosB_cur; cosB_cur = cosB_next;
        }
    #else
        /* Reuses factorB_prev / factorB_cur from the last loop iteration,
           which were initialized with the current cosB. */
        __m256d
            cosB_prev  = _mm256_set1_pd(cosB),
            cosB_cur   = _mm256_set1_pd(1.0),
            R_vec  = _mm256_set_sd(coeffs[1][0]),
            B_vec  = _mm256_set_sd(coeffs[2][0]);

        for (size_t j=1; j<nCoeffs; j+=4) {
            __m128 coeff_R_vec_f = _mm_load_ps(coeffs[1]+j);
            __m128 coeff_B_vec_f = _mm_load_ps(coeffs[2]+j);
            __m256d coeff_R_vec  = _mm256_cvtps_pd(coeff_R_vec_f);
            __m256d coeff_B_vec  = _mm256_cvtps_pd(coeff_B_vec_f);

            __m256d cosB_next = _mm256_add_pd(
                    _mm256_mul_pd(factorB_prev, cosB_prev),
                    _mm256_mul_pd(factorB_cur, cosB_cur));

            R_vec = _mm256_add_pd(R_vec, _mm256_mul_pd(coeff_R_vec, cosB_next));
            B_vec = _mm256_add_pd(B_vec, _mm256_mul_pd(coeff_B_vec, cosB_next));

            cosB_prev = _mm256_splat2_pd(cosB_next);
            cosB_cur  = _mm256_splat3_pd(cosB_next);
        }

        double R = simd::hadd(R_vec);
        double B = simd::hadd(B_vec);
    #endif

    /* G is reconstructed from Y, B and R with fixed weights
       (presumably inverting a luminance formula - confirm), and the color
       is scaled by 2*pi * coeff0 / Y. */
    double G = 1.39829 * Y - 0.100913 * B - 0.297375 * R;
    return Color3((Float) R, (Float) G, (Float) B)
        * (2 * math::Pi) * (Float) (coeff0 / Y);
}
示例#26
0
/// Multiplies every lane of the packed value \c o1 by the scalar \c o2.
BI_FORCE_INLINE inline avx_double operator*(const avx_double& o1,
    const double& o2) {
  // Broadcast the scalar to all lanes, then do one packed multiply.
  const __m256d factor = _mm256_set1_pd(o2);
  avx_double product;
  product.packed = _mm256_mul_pd(o1.packed, factor);
  return product;
}
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
    __m128i          ewitab;
    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
    real             *ewtab;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;