Example #1
 // Assumed: the pair indices are compile-time template parameters;
 // the enclosing template declaration is not part of the original snippet.
 template<int lower_i0, int lower_i1, int upper_i0, int upper_i1>
 BOOST_FORCEINLINE
 __m256i shuffle(__m256i const lower, __m256i const upper)
 {
   return _mm256_castpd_si256(
     _mm256_shuffle_pd( _mm256_castsi256_pd(lower), _mm256_castsi256_pd(upper)
                      , _MM_SHUFFLE(upper_i1, upper_i0, lower_i1, lower_i0)
                      )
   );
 }
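The two casts compile to no instructions; they only retag the register so 64-bit integer data can ride through the double-precision shuffle unit. A self-contained sketch of the same cast/shuffle/cast round trip, with a hard-coded selector instead of the template parameters (0x5 picks element 1 of each 128-bit lane from lower and element 0 of each lane from upper):

#include <immintrin.h>
#include <stdio.h>

int main()
{
    __m256i lower = _mm256_set_epi64x(3, 2, 1, 0);  /* elements 0..3 */
    __m256i upper = _mm256_set_epi64x(7, 6, 5, 4);  /* elements 4..7 */

    /* Reinterpret as doubles, shuffle 64-bit lanes, reinterpret back. */
    __m256i r = _mm256_castpd_si256(
        _mm256_shuffle_pd(_mm256_castsi256_pd(lower),
                          _mm256_castsi256_pd(upper), 0x5));

    long long out[4];
    _mm256_storeu_si256((__m256i*)out, r);
    printf("%lld %lld %lld %lld\n", out[0], out[1], out[2], out[3]);  /* 1 4 3 6 */
    return 0;
}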
Example #2
#include <immintrin.h>

/* Compile probe: some older gcc versions declare _mm256_maskload_pd
 * with a __m256d mask instead of the documented __m256i, so the mask
 * must be cast when the bug is present. */
int main()
{
    __m256d a;
    __m256i mask;
    double  d[4] = {1, 2, 3, 4};

    a    = _mm256_setzero_pd();
    mask = _mm256_castpd_si256(a);

#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
    a = _mm256_maskload_pd(d, _mm256_castsi256_pd(mask));
#else
    a = _mm256_maskload_pd(d, mask);
#endif
    return 0;
}
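On the affected compilers the intrinsic expects a __m256d mask, so the cast above is what makes the probe compile both ways. A small wrapper in the same spirit keeps the workaround in one place; the wrapper name gmx_maskload_pd is hypothetical, only the macro comes from the snippet:

#include <immintrin.h>

/* Hypothetical wrapper: hides the mask-type difference behind one call. */
static inline __m256d gmx_maskload_pd(const double *p, __m256i mask)
{
#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
    /* Buggy compilers expect a __m256d mask, so reinterpret the bits. */
    return _mm256_maskload_pd(p, _mm256_castsi256_pd(mask));
#else
    return _mm256_maskload_pd(p, mask);
#endif
}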
Example #3
    /* Excerpt from a GROMACS AVX-256 double-precision nonbonded kernel.
     * Only the local declarations and the neighbour-list/parameter setup
     * are shown; xx, ff, nlist, fr and mdatoms come from the elided
     * kernel signature, and `real` is the GROMACS precision typedef. */
    int              nri;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
    real             scratch[4*DIM];
    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256d          dummy_mask,cutoff_mask;
    __m128           tmpmask0,tmpmask1;
    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
    __m256d          one     = _mm256_set1_pd(1.0);
    __m256d          two     = _mm256_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
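The declarations above only set the stage; cutoff_mask and rcutoff2 are used later to suppress interactions beyond the cutoff. A minimal sketch of that masking step, assuming only the intrinsics shown here (the helper name apply_cutoff is hypothetical, and this is not the full GROMACS inner loop):

#include <immintrin.h>

/* Zero the scalar force for pairs whose squared distance exceeds the
 * squared cutoff; the comparison yields all-ones or all-zero lanes. */
static inline __m256d apply_cutoff(__m256d fscal, __m256d rsq00, __m256d rcutoff2)
{
    __m256d cutoff_mask = _mm256_cmp_pd(rsq00, rcutoff2, _CMP_LT_OQ);
    return _mm256_and_pd(fscal, cutoff_mask);  /* AND keeps in-range lanes */
}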
Example #4
 inline vector4d operator~(const vector4d& rhs)
 {
     return _mm256_xor_pd(rhs, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
 }
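AVX has no NOT instruction for packed doubles, so the complement is an XOR against an all-ones register, which _mm256_set1_epi32(-1) produces. A quick standalone check of the idiom, with the vector4d wrapper replaced by the raw intrinsics:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main()
{
    /* ~0.0 should give the all-ones pattern in every 64-bit lane. */
    __m256d x = _mm256_setzero_pd();
    __m256d y = _mm256_xor_pd(x, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));

    uint64_t bits[4];
    memcpy(bits, &y, sizeof bits);
    printf("%016llx\n", (unsigned long long)bits[0]);  /* ffffffffffffffff */
    return 0;
}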
Example #5
 inline vector4db::vector4db(bool b0, bool b1, bool b2, bool b3)
     : m_value(_mm256_castsi256_pd(
               _mm256_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1,
                                 -(int)b2, -(int)b2, -(int)b3, -(int)b3)))
 {
 }
Example #6
 inline vector4db(bool b)
     : m_value(_mm256_castsi256_pd(_mm256_set1_epi32(-(int)b)))
 {
 }
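Both constructors rely on the same trick: negating a bool yields 0 or -1, i.e. an all-zero or all-one 32-bit lane, and writing each flag twice fills both 32-bit halves of the corresponding 64-bit double lane. A standalone check of the pattern outside the vector4db wrapper:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main()
{
    bool b0 = true, b1 = false, b2 = false, b3 = true;
    /* -(int)b is 0x00000000 or 0xFFFFFFFF; each flag is duplicated so
     * both halves of a 64-bit lane agree. */
    __m256d m = _mm256_castsi256_pd(
        _mm256_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1,
                          -(int)b2, -(int)b2, -(int)b3, -(int)b3));

    uint64_t bits[4];
    memcpy(bits, &m, sizeof bits);
    for (int i = 0; i < 4; ++i)
        printf("lane %d: %016llx\n", i, (unsigned long long)bits[i]);
    /* Expect lanes 0 and 3 all ones, lanes 1 and 2 zero. */
    return 0;
}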
Example #7
static inline __m256d gmx_mm256_exp2_pd(__m256d x)
{
    /* Lower bound: we do not allow arguments that would lead to an IEEE fp representation exponent smaller than -1022 (the double-precision limit). */
    const __m256d arglimit = _mm256_set1_pd(1022.0);
    const __m128i expbase  = _mm_set1_epi32(1023);

    const __m256d P2       = _mm256_set1_pd(2.30933477057345225087e-2);
    const __m256d P1       = _mm256_set1_pd(2.02020656693165307700e1);
    const __m256d P0       = _mm256_set1_pd(1.51390680115615096133e3);
    /* Q2 == 1.0 */
    const __m256d Q1       = _mm256_set1_pd(2.33184211722314911771e2);
    const __m256d Q0       = _mm256_set1_pd(4.36821166879210612817e3);
    const __m256d one      = _mm256_set1_pd(1.0);
    const __m256d two      = _mm256_set1_pd(2.0);

    __m256d       valuemask;
    __m256i       iexppart;
    __m128i       iexppart128a, iexppart128b;
    __m256d       fexppart;
    __m256d       intpart;
    __m256d       z, z2;
    __m256d       PolyP, PolyQ;

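    /* Integer exponent of the result; cvtpd_epi32 rounds using the current
     * MXCSR mode (round-to-nearest by default, matching intpart below). */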
    iexppart128a  = _mm256_cvtpd_epi32(x);
    intpart       = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);

    /* Add exponent bias */
    iexppart128a   = _mm_add_epi32(iexppart128a, expbase);

    /* We now want to shift the exponent 52 positions left, but to achieve this we need
     * to separate the 128-bit register data into two registers (four 64-bit values do
     * not fit in 128 bits), shift them, and then merge the results into a single __m256d.
     * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
     * It doesn't matter what we put in the 2nd/4th position, since that data will be
     * shifted out and replaced with zeros.
     */
    iexppart128b   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
    iexppart128a   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));

    iexppart128b   = _mm_slli_epi64(iexppart128b, 52);
    iexppart128a   = _mm_slli_epi64(iexppart128a, 52);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);

    valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
    fexppart  = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));

    z         = _mm256_sub_pd(x, intpart);

    z2        = _mm256_mul_pd(z, z);

    PolyP     = _mm256_mul_pd(P2, z2);
    PolyP     = _mm256_add_pd(PolyP, P1);
    PolyQ     = _mm256_add_pd(z2, Q1);
    PolyP     = _mm256_mul_pd(PolyP, z2);
    PolyQ     = _mm256_mul_pd(PolyQ, z2);
    PolyP     = _mm256_add_pd(PolyP, P0);
    PolyQ     = _mm256_add_pd(PolyQ, Q0);
    PolyP     = _mm256_mul_pd(PolyP, z);

    z         = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
    z         = _mm256_add_pd(one, _mm256_mul_pd(two, z));

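    /* Scale by 2^intpart via the assembled exponent bits; fexppart is zero
     * where |x| exceeded arglimit, so out-of-range arguments return 0. */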
    z         = _mm256_mul_pd(z, fexppart);

    return z;
}
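A quick way to sanity-check the routine is to compare each lane against libm's exp2(); gmx_mm256_abs_pd and gmx_mm256_inv_pd are helpers from the same GROMACS math header and are assumed to be in scope:

#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main()
{
    double in[4] = {-4.0, -0.5, 0.75, 10.0};
    double out[4];

    _mm256_storeu_pd(out, gmx_mm256_exp2_pd(_mm256_loadu_pd(in)));

    for (int i = 0; i < 4; ++i)
        printf("exp2(%g): simd=%.17g  libm=%.17g\n", in[i], out[i], exp2(in[i]));
    return 0;
}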