inline void AES_reduced_opt(int128 &u) {
    /* Round key initialization */
    __m128i roundkey[AES_ROUNDS + 1];
    for (unsigned i = 0; i < AES_ROUNDS + 1; ++i) {
        roundkey[i] = _mm_set_epi64x(subkeys64[i][1], subkeys64[i][0]);
    }

    __m128i acc0 = _mm_set_epi64x(u.i1, u.i0);
    acc0 = _mm_xor_si128(acc0, roundkey[0]);
    for (unsigned j = 0; j < AES_ROUNDS; ++j) {
        acc0 = _mm_aesenc_si128(acc0, roundkey[j + 1]);
    }

    u.i0 = _mm_extract_epi64(acc0, 0);
    u.i1 = _mm_extract_epi64(acc0, 1);
}
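/*
 * A minimal usage sketch for AES_reduced_opt, assuming int128 is a struct
 * with uint64_t members i0/i1 and that AES_ROUNDS and subkeys64[][] are
 * defined elsewhere in this translation unit. The state round-trips
 * through _mm_set_epi64x on entry and _mm_extract_epi64 on exit; the
 * input values below are illustrative only.
 */
#include <cstdio>

int main() {
    int128 block;
    block.i0 = 0x0123456789abcdefULL;  /* low 64 bits of the state  */
    block.i1 = 0xfedcba9876543210ULL;  /* high 64 bits of the state */
    AES_reduced_opt(block);
    std::printf("%016llx%016llx\n",
                (unsigned long long)block.i1,
                (unsigned long long)block.i0);
    return 0;
}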
void FillBlock(__m128i* state, const uint8_t *ref_block, uint8_t *next_block,
               const uint64_t* Sbox) {
    __m128i block_XY[ARGON2_QWORDS_IN_BLOCK];

    /* Load the reference block and XOR it into the running state */
    for (uint32_t i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
        block_XY[i] = _mm_load_si128((__m128i *) ref_block);
        ref_block += 16;
    }
    for (uint32_t i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
        block_XY[i] = state[i] = _mm_xor_si128(state[i], block_XY[i]);
    }

    uint64_t x = 0;
    if (Sbox != NULL) {
        /* S-box-hardened variant: 96 rounds of multiply/add/xor mixing,
           seeded from the first and last 64-bit words of the block */
        x = _mm_extract_epi64(block_XY[0], 0)
            ^ _mm_extract_epi64(block_XY[ARGON2_QWORDS_IN_BLOCK - 1], 1);
        for (int i = 0; i < 6 * 16; ++i) {
            uint32_t x1 = x >> 32;
            uint32_t x2 = x & 0xFFFFFFFF;
            uint64_t y = Sbox[x1 & ARGON2_SBOX_MASK];
            uint64_t z = Sbox[(x2 & ARGON2_SBOX_MASK) + ARGON2_SBOX_SIZE / 2];
            x = (uint64_t) x1 * (uint64_t) x2;
            x += y;
            x ^= z;
        }
    }
static inline void arr_store_col(
        int *col, __m128i vH, int32_t t, int32_t seglen) {
    col[0*seglen+t] = (int64_t)_mm_extract_epi64(vH, 0);
    col[1*seglen+t] = (int64_t)_mm_extract_epi64(vH, 1);
}
static inline void arr_store_si128(
        int *array, __m128i vH, int32_t t, int32_t seglen,
        int32_t d, int32_t dlen) {
    array[(0*seglen+t)*dlen + d] = (int64_t)_mm_extract_epi64(vH, 0);
    array[(1*seglen+t)*dlen + d] = (int64_t)_mm_extract_epi64(vH, 1);
}
static inline void arr_store_si128(
        int *array, vec128i vWH, int32_t i, int32_t s1Len,
        int32_t j, int32_t s2Len) {
    if (0 <= i+0 && i+0 < s1Len && 0 <= j-0 && j-0 < s2Len) {
        array[1LL*(i+0)*s2Len + (j-0)] = (int64_t)_mm_extract_epi64(vWH, 1);
    }
    if (0 <= i+1 && i+1 < s1Len && 0 <= j-1 && j-1 < s2Len) {
        array[1LL*(i+1)*s2Len + (j-1)] = (int64_t)_mm_extract_epi64(vWH, 0);
    }
}
static inline void arr_store_rowcol(
        int *row, int *col, vec128i vWH, int32_t i, int32_t s1Len,
        int32_t j, int32_t s2Len) {
    if (i+0 == s1Len-1 && 0 <= j-0 && j-0 < s2Len) {
        row[j-0] = (int64_t)_mm_extract_epi64(vWH, 1);
    }
    if (j-0 == s2Len-1 && 0 <= i+0 && i+0 < s1Len) {
        col[(i+0)] = (int64_t)_mm_extract_epi64(vWH, 1);
    }
    if (i+1 == s1Len-1 && 0 <= j-1 && j-1 < s2Len) {
        row[j-1] = (int64_t)_mm_extract_epi64(vWH, 0);
    }
    if (j-1 == s2Len-1 && 0 <= i+1 && i+1 < s1Len) {
        col[(i+1)] = (int64_t)_mm_extract_epi64(vWH, 0);
    }
}
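/*
 * A minimal sketch (not part of the helpers above) of the lane order these
 * stores rely on: _mm_set_epi64x takes the high lane first, and
 * _mm_extract_epi64 with index 1 reads that high lane back. In the
 * anti-diagonal helpers above, lane 1 therefore holds the element with the
 * smaller row index i.
 */
#include <assert.h>
#include <smmintrin.h>  /* SSE4.1: _mm_extract_epi64 */

static void lane_order_demo(void) {
    __m128i v = _mm_set_epi64x(/* high lane */ 11, /* low lane */ 22);
    assert(_mm_extract_epi64(v, 0) == 22);
    assert(_mm_extract_epi64(v, 1) == 11);
}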
TEST (void)
{
  union
    {
      __m128i x;
      long long ll[2];
    } val1;
  long long res[2];
  int masks[2];
  int i;

  val1.ll[0] = 0x0807060504030201LL;
  val1.ll[1] = 0x100F0E0D0C0B0A09LL;

  /* msk0 and msk1 are compile-time constants supplied by the test
     harness; _mm_extract_epi64 requires an immediate index.  */
  res[0] = _mm_extract_epi64 (val1.x, msk0);
  res[1] = _mm_extract_epi64 (val1.x, msk1);

  masks[0] = msk0;
  masks[1] = msk1;

  for (i = 0; i < 2; i++)
    if (res[i] != val1.ll[masks[i]])
      abort ();
}
void fb_slvn_low(dig_t *c, const dig_t *a) {
    int i;
    dig_t *p, u0, u1, u2, u3;
    void *tab = fb_poly_get_slv();
    __m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2,
            r0, r1, t0, t1, perm;

    perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200);
    mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
    mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0);
    mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);
    sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100);
    sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400);

    t0 = _mm_load_si128((__m128i *)a);
    t1 = _mm_load_si128((__m128i *)(a + 2));
    r0 = r1 = _mm_setzero_si128();

    m0 = _mm_shuffle_epi8(t1, perm);
    m1 = _mm_and_si128(m0, mask0);
    m2 = _mm_and_si128(m0, mask1);
    m2 = _mm_srli_epi64(m2, 4);
    m2 = _mm_shuffle_epi8(sqrt1, m2);
    m1 = _mm_shuffle_epi8(sqrt0, m1);
    m1 = _mm_xor_si128(m1, m2);
    m2 = _mm_slli_si128(m1, 8);
    m1 = _mm_and_si128(m1, mask2);
    m1 = _mm_slli_epi64(m1, 4);
    m1 = _mm_xor_si128(m1, m2);
    t0 = _mm_xor_si128(t0, m1);
    r0 = _mm_xor_si128(r0, m1);

    m0 = _mm_and_si128(t0, mask2);
    m0 = _mm_shuffle_epi8(m0, perm);
    m1 = _mm_and_si128(m0, mask0);
    m2 = _mm_and_si128(m0, mask1);
    m2 = _mm_srli_epi64(m2, 4);
    m2 = _mm_shuffle_epi8(sqrt1, m2);
    m1 = _mm_shuffle_epi8(sqrt0, m1);
    m1 = _mm_xor_si128(m1, m2);
    m2 = _mm_srli_si128(m1, 8);
    m1 = _mm_andnot_si128(mask2, m1);
    m2 = _mm_slli_epi64(m2, 4);
    m1 = _mm_xor_si128(m1, m2);
    t0 = _mm_xor_si128(t0, m1);
    r0 = _mm_xor_si128(r0, m1);

    m1 = _mm_srli_si128(t0, 4);
    m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF));
    m0 = _mm_shuffle_epi8(m1, perm);
    m1 = _mm_and_si128(m0, mask0);
    m2 = _mm_and_si128(m0, mask1);
    m2 = _mm_srli_epi64(m2, 4);
    m2 = _mm_shuffle_epi8(sqrt1, m2);
    m1 = _mm_shuffle_epi8(sqrt0, m1);
    m1 = _mm_xor_si128(m1, m2);
    m2 = _mm_slli_si128(m1, 8);
    m1 = _mm_slli_epi64(m1, 4);
    m1 = _mm_xor_si128(m1, m2);
    m1 = _mm_srli_si128(m1, 6);
    t0 = _mm_xor_si128(t0, m1);
    r0 = _mm_xor_si128(r0, m1);

    m1 = _mm_srli_si128(t0, 2);
    m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF));
    m0 = _mm_shuffle_epi8(m1, perm);
    m1 = _mm_and_si128(m0, mask0);
    m2 = _mm_and_si128(m0, mask1);
    m2 = _mm_srli_epi64(m2, 4);
    m2 = _mm_shuffle_epi8(sqrt1, m2);
    m1 = _mm_shuffle_epi8(sqrt0, m1);
    m1 = _mm_xor_si128(m1, m2);
    m2 = _mm_slli_si128(m1, 8);
    m1 = _mm_slli_epi64(m1, 4);
    m1 = _mm_xor_si128(m1, m2);
    m1 = _mm_srli_si128(m1, 7);
    t0 = _mm_xor_si128(t0, m1);
    r0 = _mm_xor_si128(r0, m1);

    m1 = _mm_srli_si128(t0, 1);
    m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55));
    m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
    m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33));
    m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2));
    m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F));
    m1 = _mm_slli_epi64(m1, 4);
    t0 = _mm_xor_si128(t0, m1);
    r0 = _mm_xor_si128(r0, m1);

    m1 = _mm_srli_epi64(t0, 4);
    m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5));
    m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
    m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3));
    m1 = _mm_slli_epi64(m1, 2);
    t0 = _mm_xor_si128(t0, m1);
    r0 = _mm_xor_si128(r0, m1);

    m1 = _mm_srli_epi64(t0, 2);
    m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1));
    m1 = _mm_slli_epi64(m1, 1);
    t0 = _mm_xor_si128(t0, m1);
    r0 = _mm_xor_si128(r0, m1);

    sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000);
    sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000);

    m1 = _mm_and_si128(t0, mask0);
    m2 = _mm_and_si128(t0, mask1);
    m3 = _mm_and_si128(t1, mask0);
    m4 = _mm_and_si128(t1, mask1);
    m2 = _mm_srli_epi64(m2, 4);
    m4 = _mm_srli_epi64(m4, 4);
    m2 = _mm_shuffle_epi8(sqrt1, m2);
    m1 = _mm_shuffle_epi8(sqrt0, m1);
    m4 = _mm_shuffle_epi8(sqrt1, m4);
    m3 = _mm_shuffle_epi8(sqrt0, m3);
    m1 = _mm_or_si128(m1, m2);
    m3 = _mm_or_si128(m3, m4);

#ifndef __PCLMUL__
    align dig_t x[2];
    _mm_store_si128((__m128i *)x, m1);
    u0 = x[0];
    u1 = x[1];
    _mm_store_si128((__m128i *)x, m3);
    u2 = x[0];
    u3 = x[1];
#else
    u0 = _mm_extract_epi64(m1, 0);
    u1 = _mm_extract_epi64(m1, 1);
    u2 = _mm_extract_epi64(m3, 0);
    u3 = _mm_extract_epi64(m3, 1);
#endif

    for (i = 0; i < 8; i++) {
        p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st));
        r0 = _mm_xor_si128(r0, *(__m128i *)(p));
        r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
        u0 >>= 8;
        p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st));
        r0 = _mm_xor_si128(r0, *(__m128i *)(p));
        r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
        u1 >>= 8;
        p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st));
        r0 = _mm_xor_si128(r0, *(__m128i *)(p));
        r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
        u2 >>= 8;
        p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0x0F)) * sizeof(fb_st));
        r0 = _mm_xor_si128(r0, *(__m128i *)(p));
        r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
        u3 >>= 8;
    }

    _mm_store_si128((__m128i *)c, r0);
    _mm_store_si128((__m128i *)(c + 2), r1);
}
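/*
 * A standalone sketch of the two 64-bit extraction strategies guarded by
 * the #ifdef above: an aligned memory round-trip that works on any SSE2
 * target, and the direct SSE4.1 _mm_extract_epi64. Both yield the same
 * lanes. The function name is illustrative; the GCC aligned attribute
 * stands in for RELIC's align macro.
 */
#include <assert.h>
#include <stdint.h>
#include <smmintrin.h>

static void extract_both_ways(__m128i v) {
    uint64_t x[2] __attribute__((aligned(16)));
    _mm_store_si128((__m128i *)x, v);  /* SSE2-only fallback */
    assert(x[0] == (uint64_t)_mm_extract_epi64(v, 0));  /* SSE4.1 direct */
    assert(x[1] == (uint64_t)_mm_extract_epi64(v, 1));
}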
static inline __m128d my_invrsq_pd(__m128d x)
{
    const __m128d three = (const __m128d) {3.0f, 3.0f};
    const __m128d half  = (const __m128d) {0.5f, 0.5f};

    __m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
    __m128d t1 = _mm_cvtps_pd(t);               /* Convert back to double precision */

    /* First Newton-Raphson step, accuracy is now 24 bits */
    __m128d t2 = _mm_mul_pd(half, _mm_mul_pd(t1, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t1, t1)))));

    /* Return second Newton-Raphson step, accuracy 48 bits */
    return (__m128d) _mm_mul_pd(half, _mm_mul_pd(t2, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t2, t2)))));
}

/* To extract single integers from a __m128i datatype. Note that despite
 * the name this returns the imm-th 32-bit element; that is what the table
 * lookups below need, because _mm_cvttpd_epi32 packs its two results into
 * the low 64 bits of the register. */
#define _mm_extract_epi64(x, imm) \
    _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))

void nb_kernel400_x86_64_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr,
                              int * shift, double * shiftvec, double * fshift,
                              int * gid, double * pos, double * faction,
                              double * charge, double * p_facel, double * p_krf,
                              double * p_crf, double * Vc, int * type,
                              int * p_ntype, double * vdwparam, double * Vvdw,
                              double * p_tabscale, double * VFtab,
                              double * invsqrta, double * dvda,
                              double * p_gbtabscale, double * GBtab,
                              int * p_nthreads, int * count, void * mtx,
                              int * outeriter, int * inneriter, double * work)
{
    int    nri, ntype, nthreads, offset;
    int    n, ii, is3, ii3, k, nj0, nj1, jnr1, jnr2, j13, j23, ggid;
    double facel, krf, crf, tabscl, gbtabscl, vct, vgbt;
    double shX, shY, shZ, isai_d, dva;
    gmx_gbdata_t *gbdata;
    float *gpol;

    __m128d ix, iy, iz, jx, jy, jz;
    __m128d dx, dy, dz, t1, t2, t3;
    __m128d fix, fiy, fiz, rsq11, rinv, r, fscal, rt, eps, eps2;
    __m128d q, iq, qq, isai, isaj, isaprod, vcoul, gbscale, dvdai, dvdaj;
    __m128d Y, F, G, H, Fp, VV, FF, vgb, fijC, dvdatmp, dvdasum, vctot, vgbtot, n0d;
    __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
    __m128d fac, tabscale, gbtabscale;
    __m128i n0, nnn;

    const __m128d neg   = {-1.0f, -1.0f};
    const __m128d zero  = {0.0f, 0.0f};
    const __m128d half  = {0.5f, 0.5f};
    const __m128d two   = {2.0f, 2.0f};
    const __m128d three = {3.0f, 3.0f};

    gbdata = (gmx_gbdata_t *)work;
    gpol   = gbdata->gpol;

    nri      = *p_nri;
    ntype    = *p_ntype;
    nthreads = *p_nthreads;
    facel    = (*p_facel) * (1.0 - (1.0 / gbdata->gb_epsilon_solvent));
    krf      = *p_krf;
    crf      = *p_crf;
    tabscl   = *p_tabscale;
    gbtabscl = *p_gbtabscale;
    nj1      = 0;

    /* Splat variables */
    fac        = _mm_load1_pd(&facel);
    tabscale   = _mm_load1_pd(&tabscl);
    gbtabscale = _mm_load1_pd(&gbtabscl);

    /* Keep compiler happy */
    dvdatmp = _mm_setzero_pd();
    vgb     = _mm_setzero_pd();
    dvdaj   = _mm_setzero_pd();
    isaj    = _mm_setzero_pd();
    vcoul   = _mm_setzero_pd();
    t1      = _mm_setzero_pd();
    t2      = _mm_setzero_pd();
    t3      = _mm_setzero_pd();

    jnr1 = jnr2 = 0;
    j13  = j23  = 0;

    for (n = 0; n < nri; n++)
    {
        is3 = 3 * shift[n];
        shX = shiftvec[is3];
        shY = shiftvec[is3 + 1];
        shZ = shiftvec[is3 + 2];

        nj0    = jindex[n];
        nj1    = jindex[n + 1];
        offset = (nj1 - nj0) % 2;

        ii  = iinr[n];
        ii3 = ii * 3;

        ix = _mm_set1_pd(shX + pos[ii3 + 0]);
        iy = _mm_set1_pd(shY + pos[ii3 + 1]);
        iz = _mm_set1_pd(shZ + pos[ii3 + 2]);

        q  = _mm_set1_pd(charge[ii]);
        iq = _mm_mul_pd(fac, q);

        isai_d = invsqrta[ii];
        isai   = _mm_load1_pd(&isai_d);

        fix = _mm_setzero_pd();
        fiy = _mm_setzero_pd();
        fiz = _mm_setzero_pd();

        dvdasum = _mm_setzero_pd();
        vctot   = _mm_setzero_pd();
        vgbtot  = _mm_setzero_pd();

        for (k = nj0; k < nj1 - offset; k += 2)
        {
            jnr1 = jjnr[k];
            jnr2 = jjnr[k + 1];
            j13  = jnr1 * 3;
            j23  = jnr2 * 3;

            /* Load coordinates */
            xmm1 = _mm_loadu_pd(pos + j13);     /* x1 y1 */
            xmm2 = _mm_loadu_pd(pos + j23);     /* x2 y2 */
            xmm5 = _mm_load_sd(pos + j13 + 2);  /* z1 -  */
            xmm6 = _mm_load_sd(pos + j23 + 2);  /* z2 -  */

            /* transpose */
            jx = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(0, 0));
            jy = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(1, 1));
            jz = _mm_shuffle_pd(xmm5, xmm6, _MM_SHUFFLE2(0, 0));

            /* distances */
            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);
            rsq11 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy)),
                               _mm_mul_pd(dz, dz));
            rinv  = my_invrsq_pd(rsq11);

            /* Load invsqrta */
            isaj    = _mm_loadl_pd(isaj, invsqrta + jnr1);
            isaj    = _mm_loadh_pd(isaj, invsqrta + jnr2);
            isaprod = _mm_mul_pd(isai, isaj);

            /* Load charges */
            q  = _mm_loadl_pd(q, charge + jnr1);
            q  = _mm_loadh_pd(q, charge + jnr2);
            qq = _mm_mul_pd(iq, q);

            vcoul   = _mm_mul_pd(qq, rinv);
            fscal   = _mm_mul_pd(vcoul, rinv);
            qq      = _mm_mul_pd(isaprod, qq);
            qq      = _mm_mul_pd(qq, neg);
            gbscale = _mm_mul_pd(isaprod, gbtabscale);

            /* Load dvdaj */
            dvdaj = _mm_loadl_pd(dvdaj, dvda + jnr1);
            dvdaj = _mm_loadh_pd(dvdaj, dvda + jnr2);

            r    = _mm_mul_pd(rsq11, rinv);
            rt   = _mm_mul_pd(r, gbscale);
            n0   = _mm_cvttpd_epi32(rt);
            n0d  = _mm_cvtepi32_pd(n0);
            eps  = _mm_sub_pd(rt, n0d);
            eps2 = _mm_mul_pd(eps, eps);

            /* table indices: 4 doubles (Y,F,G,H) per table point */
            nnn  = _mm_slli_epi64(n0, 2);

            xmm1 = _mm_load_pd(GBtab + (_mm_extract_epi64(nnn, 0)));     /* Y1 F1 */
            xmm2 = _mm_load_pd(GBtab + (_mm_extract_epi64(nnn, 1)));     /* Y2 F2 */
            xmm3 = _mm_load_pd(GBtab + (_mm_extract_epi64(nnn, 0)) + 2); /* G1 H1 */
            xmm4 = _mm_load_pd(GBtab + (_mm_extract_epi64(nnn, 1)) + 2); /* G2 H2 */

            Y = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(0, 0)); /* Y1 Y2 */
            F = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(1, 1)); /* F1 F2 */
            G = _mm_shuffle_pd(xmm3, xmm4, _MM_SHUFFLE2(0, 0)); /* G1 G2 */
            H = _mm_shuffle_pd(xmm3, xmm4, _MM_SHUFFLE2(1, 1)); /* H1 H2 */

            G  = _mm_mul_pd(G, eps);
            H  = _mm_mul_pd(H, eps2);
            Fp = _mm_add_pd(F, G);
            Fp = _mm_add_pd(Fp, H);
            VV = _mm_mul_pd(Fp, eps);
            VV = _mm_add_pd(Y, VV);
            H  = _mm_mul_pd(two, H);
            FF = _mm_add_pd(Fp, G);
            FF = _mm_add_pd(FF, H);

            vgb  = _mm_mul_pd(qq, VV);
            fijC = _mm_mul_pd(qq, FF);
            fijC = _mm_mul_pd(fijC, gbscale);

            dvdatmp = _mm_mul_pd(fijC, r);
            dvdatmp = _mm_add_pd(vgb, dvdatmp);
            dvdatmp = _mm_mul_pd(dvdatmp, neg);
            dvdatmp = _mm_mul_pd(dvdatmp, half);
            dvdasum = _mm_add_pd(dvdasum, dvdatmp);

            xmm1  = _mm_mul_pd(dvdatmp, isaj);
            xmm1  = _mm_mul_pd(xmm1, isaj);
            dvdaj = _mm_add_pd(dvdaj, xmm1);

            /* store dvda */
            _mm_storel_pd(dvda + jnr1, dvdaj);
            _mm_storeh_pd(dvda + jnr2, dvdaj);

            vctot  = _mm_add_pd(vctot, vcoul);
            vgbtot = _mm_add_pd(vgbtot, vgb);

            fscal = _mm_sub_pd(fijC, fscal);
            fscal = _mm_mul_pd(fscal, neg);
            fscal = _mm_mul_pd(fscal, rinv);

            /* calculate partial force terms */
            t1 = _mm_mul_pd(fscal, dx);
            t2 = _mm_mul_pd(fscal, dy);
            t3 = _mm_mul_pd(fscal, dz);

            /* update the i force */
            fix = _mm_add_pd(fix, t1);
            fiy = _mm_add_pd(fiy, t2);
            fiz = _mm_add_pd(fiz, t3);

            /* accumulate forces from memory */
            xmm1 = _mm_loadu_pd(faction + j13);     /* fx1 fy1 */
            xmm2 = _mm_loadu_pd(faction + j23);     /* fx2 fy2 */
            xmm5 = _mm_load1_pd(faction + j13 + 2); /* fz1 fz1 */
            xmm6 = _mm_load1_pd(faction + j23 + 2); /* fz2 fz2 */

            /* transpose */
            xmm7 = _mm_shuffle_pd(xmm5, xmm6, _MM_SHUFFLE2(0, 0)); /* fz1 fz2 */
            xmm5 = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(0, 0)); /* fx1 fx2 */
            xmm6 = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(1, 1)); /* fy1 fy2 */

            /* subtract partial forces */
            xmm5 = _mm_sub_pd(xmm5, t1);
            xmm6 = _mm_sub_pd(xmm6, t2);
            xmm7 = _mm_sub_pd(xmm7, t3);

            xmm1 = _mm_shuffle_pd(xmm5, xmm6, _MM_SHUFFLE2(0, 0)); /* fx1 fy1 */
            xmm2 = _mm_shuffle_pd(xmm5, xmm6, _MM_SHUFFLE2(1, 1)); /* fx2 fy2 */

            /* store fx and fy */
            _mm_storeu_pd(faction + j13, xmm1);
            _mm_storeu_pd(faction + j23, xmm2);

            /* .. then fz */
            _mm_storel_pd(faction + j13 + 2, xmm7);
            _mm_storeh_pd(faction + j23 + 2, xmm7);
        }

        /* In double precision, offset can only be either 0 or 1 */
        if (offset != 0)
        {
            jnr1 = jjnr[k];
            j13  = jnr1 * 3;

            jx = _mm_load_sd(pos + j13);
            jy = _mm_load_sd(pos + j13 + 1);
            jz = _mm_load_sd(pos + j13 + 2);

            isaj    = _mm_load_sd(invsqrta + jnr1);
            isaprod = _mm_mul_sd(isai, isaj);
            dvdaj   = _mm_load_sd(dvda + jnr1);
            q       = _mm_load_sd(charge + jnr1);
            qq      = _mm_mul_sd(iq, q);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);
            rsq11 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy)),
                               _mm_mul_pd(dz, dz));
            rinv  = my_invrsq_pd(rsq11);

            vcoul   = _mm_mul_sd(qq, rinv);
            fscal   = _mm_mul_sd(vcoul, rinv);
            qq      = _mm_mul_sd(isaprod, qq);
            qq      = _mm_mul_sd(qq, neg);
            gbscale = _mm_mul_sd(isaprod, gbtabscale);

            r    = _mm_mul_sd(rsq11, rinv);
            rt   = _mm_mul_sd(r, gbscale);
            n0   = _mm_cvttpd_epi32(rt);
            n0d  = _mm_cvtepi32_pd(n0);
            eps  = _mm_sub_sd(rt, n0d);
            eps2 = _mm_mul_sd(eps, eps);

            nnn  = _mm_slli_epi64(n0, 2);

            xmm1 = _mm_load_pd(GBtab + (_mm_extract_epi64(nnn, 0)));
            xmm2 = _mm_load_pd(GBtab + (_mm_extract_epi64(nnn, 1)));
            xmm3 = _mm_load_pd(GBtab + (_mm_extract_epi64(nnn, 0)) + 2);
            xmm4 = _mm_load_pd(GBtab + (_mm_extract_epi64(nnn, 1)) + 2);

            Y = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(0, 0));
            F = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(1, 1));
            G = _mm_shuffle_pd(xmm3, xmm4, _MM_SHUFFLE2(0, 0));
            H = _mm_shuffle_pd(xmm3, xmm4, _MM_SHUFFLE2(1, 1));

            G  = _mm_mul_sd(G, eps);
            H  = _mm_mul_sd(H, eps2);
            Fp = _mm_add_sd(F, G);
            Fp = _mm_add_sd(Fp, H);
            VV = _mm_mul_sd(Fp, eps);
            VV = _mm_add_sd(Y, VV);
            H  = _mm_mul_sd(two, H);
            FF = _mm_add_sd(Fp, G);
            FF = _mm_add_sd(FF, H);

            vgb  = _mm_mul_sd(qq, VV);
            fijC = _mm_mul_sd(qq, FF);
            fijC = _mm_mul_sd(fijC, gbscale);

            dvdatmp = _mm_mul_sd(fijC, r);
            dvdatmp = _mm_add_sd(vgb, dvdatmp);
            dvdatmp = _mm_mul_sd(dvdatmp, neg);
            dvdatmp = _mm_mul_sd(dvdatmp, half);
            dvdasum = _mm_add_sd(dvdasum, dvdatmp);

            xmm1  = _mm_mul_sd(dvdatmp, isaj);
            xmm1  = _mm_mul_sd(xmm1, isaj);
            dvdaj = _mm_add_sd(dvdaj, xmm1);

            /* store dvda */
            _mm_storel_pd(dvda + jnr1, dvdaj);

            vctot  = _mm_add_sd(vctot, vcoul);
            vgbtot = _mm_add_sd(vgbtot, vgb);

            fscal = _mm_sub_sd(fijC, fscal);
            fscal = _mm_mul_sd(fscal, neg);
            fscal = _mm_mul_sd(fscal, rinv);

            /* calculate partial force terms */
            t1 = _mm_mul_sd(fscal, dx);
            t2 = _mm_mul_sd(fscal, dy);
            t3 = _mm_mul_sd(fscal, dz);

            /* update the i force */
            fix = _mm_add_sd(fix, t1);
            fiy = _mm_add_sd(fiy, t2);
            fiz = _mm_add_sd(fiz, t3);

            /* accumulate forces from memory */
            xmm5 = _mm_load_sd(faction + j13);     /* fx */
            xmm6 = _mm_load_sd(faction + j13 + 1); /* fy */
            xmm7 = _mm_load_sd(faction + j13 + 2); /* fz */

            /* subtract partial forces */
            xmm5 = _mm_sub_sd(xmm5, t1);
            xmm6 = _mm_sub_sd(xmm6, t2);
            xmm7 = _mm_sub_sd(xmm7, t3);

            /* store forces */
            _mm_store_sd(faction + j13, xmm5);
            _mm_store_sd(faction + j13 + 1, xmm6);
            _mm_store_sd(faction + j13 + 2, xmm7);
        }

        /* fix/fiy/fiz now contain four partial terms, that all should be
         * added to the i particle forces
         */
        t1 = _mm_unpacklo_pd(t1, fix);
        t2 = _mm_unpacklo_pd(t2, fiy);
        t3 = _mm_unpacklo_pd(t3, fiz);

        fix = _mm_add_pd(fix, t1);
        fiy = _mm_add_pd(fiy, t2);
        fiz = _mm_add_pd(fiz, t3);

        fix = _mm_shuffle_pd(fix, fix, _MM_SHUFFLE2(1, 1));
        fiy = _mm_shuffle_pd(fiy, fiy, _MM_SHUFFLE2(1, 1));
        fiz = _mm_shuffle_pd(fiz, fiz, _MM_SHUFFLE2(1, 1));

        /* Load i forces from memory */
        xmm1 = _mm_load_sd(faction + ii3);
        xmm2 = _mm_load_sd(faction + ii3 + 1);
        xmm3 = _mm_load_sd(faction + ii3 + 2);

        /* Add to i force */
        fix = _mm_add_sd(fix, xmm1);
        fiy = _mm_add_sd(fiy, xmm2);
        fiz = _mm_add_sd(fiz, xmm3);

        /* store i forces to memory */
        _mm_store_sd(faction + ii3, fix);
        _mm_store_sd(faction + ii3 + 1, fiy);
        _mm_store_sd(faction + ii3 + 2, fiz);

        /* now do dvda */
        dvdatmp = _mm_unpacklo_pd(dvdatmp, dvdasum);
        dvdasum = _mm_add_pd(dvdasum, dvdatmp);
        _mm_storeh_pd(&dva, dvdasum);
        dvda[ii] = dvda[ii] + dva * isai_d * isai_d;

        ggid = gid[n];

        /* Coulomb potential */
        vcoul = _mm_unpacklo_pd(vcoul, vctot);
        vctot = _mm_add_pd(vctot, vcoul);
        _mm_storeh_pd(&vct, vctot);
        Vc[ggid] = Vc[ggid] + vct;

        /* GB potential */
        vgb    = _mm_unpacklo_pd(vgb, vgbtot);
        vgbtot = _mm_add_pd(vgbtot, vgb);
        _mm_storeh_pd(&vgbt, vgbtot);
        gpol[ggid] = gpol[ggid] + vgbt;
    }

    *outeriter = nri;
    *inneriter = nj1;
}
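/*
 * A minimal sketch of what the _mm_extract_epi64 macro above actually
 * computes: despite its name it returns the imm-th *32-bit* integer,
 * which is what the kernel needs because _mm_cvttpd_epi32 packs its two
 * results into the low 64 bits of the register. The macro and function
 * names and values here are illustrative.
 */
#include <assert.h>
#include <emmintrin.h>

#define extract_epi32_lane(x, imm) \
    _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))

static void table_index_demo(void) {
    /* _mm_set_pd puts 3.2 in lane 0 and 7.9 in lane 1; truncation
       yields the packed 32-bit integers {3, 7, 0, 0} */
    __m128i n0 = _mm_cvttpd_epi32(_mm_set_pd(7.9, 3.2));
    assert(extract_epi32_lane(n0, 0) == 3);
    assert(extract_epi32_lane(n0, 1) == 7);
}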
(A0), ((simd_<ints64_<A0>, tag::sse_>))
);

namespace nt2 { namespace ext {
  template<class Dummy>
  struct call< tag::first_(tag::simd_<tag::ints64_, tag::sse_>)
             , tag::sse4_1_, Dummy> : callable
  {
    template<class Sig> struct result;
    template<class This, class A0>
    struct result<This(A0)>
      : meta::scalar_of<typename meta::strip<A0>::type> {};

    NT2_FUNCTOR_CALL(1)
    {
      typedef typename meta::scalar_of<A0>::type type;
      type z = {_mm_extract_epi64(a0, 0)};
      return z;
    }
  };
} }

#endif
#endif

// /////////////////////////////////////////////////////////////////////////////
// End of first.hpp
// /////////////////////////////////////////////////////////////////////////////
long long test_extract_epi64(__m128i x) {
  // CHECK-LABEL: test_extract_epi64
  // CHECK: extractelement <2 x i64> %{{.*}}, i32 1
  // CHECK-ASM: pextrq
  return _mm_extract_epi64(x, 1);
}
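// A small usage sketch (not part of the Clang test above): the index
// argument of _mm_extract_epi64 must be a compile-time constant, since it
// becomes the immediate operand of pextrq; a runtime index has to go
// through memory instead. swap_lanes is an illustrative name.
#include <smmintrin.h>

static __m128i swap_lanes(__m128i x) {
  // Both indices are immediates, so this compiles to two pextrq plus a
  // rebuild of the vector.
  long long lo = _mm_extract_epi64(x, 0);
  long long hi = _mm_extract_epi64(x, 1);
  return _mm_set_epi64x(lo, hi);  // high lane gets old lo: lanes swapped
}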