void
test4bit (void)
{
  d1 = _mm_round_pd (d2, k4);        /* { dg-error "the last argument must be a 4-bit immediate" } */
  d1 = _mm_round_sd (d2, d3, k4);    /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ps (a2, k4);        /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ss (a2, a2, k4);    /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_blend_ps (a2, a3, k4);    /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_blend_pd (e2, e3, k4); /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_round_pd (e2, k4);     /* { dg-error "the last argument must be a 4-bit immediate" } */
  b1 = _mm256_round_ps (b2, k4);     /* { dg-error "the last argument must be a 4-bit immediate" } */
}
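/* For contrast, a minimal sketch (not part of the testsuite file above, compile
   with -mavx) of the same intrinsics called with a valid 4-bit immediate.  The
   rounding-control macros from <immintrin.h> expand to small compile-time
   constants in the 0..15 range, so no diagnostic is emitted.  */
#include <immintrin.h>

__m256d
round_nearest256 (__m256d v)
{
  /* 0x08 == _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, a valid immediate.  */
  return _mm256_round_pd (v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m128d
round_down128 (__m128d v)
{
  return _mm_round_pd (v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}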
/*!
 * \brief Round up each value of the vector and return the result
 */
ETL_STATIC_INLINE(avx_simd_double) round_up(avx_simd_double x) {
    return _mm256_round_pd(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
}
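/*
 * A minimal, standalone usage sketch (not from the ETL sources) showing the
 * same intrinsic and rounding control that round_up() applies to x.value,
 * here on a raw __m256d: four doubles are rounded toward +infinity in one call.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d v = _mm256_set_pd(3.5, -2.25, 7.0, 1.5);   /* lanes: 1.5, 7.0, -2.25, 3.5 */
    __m256d up = _mm256_round_pd(v, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
    double out[4];
    _mm256_storeu_pd(out, up);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* prints: 2 7 -2 4 */
    return 0;
}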
void ntt_transform(poly out, const poly o)
{
  int s, pos = 0, offset;
  __m256d vt, vo0, vo10, vo11, vo20, vo21, vo22, vo23, vc, vp, vpinv, neg2, neg4;
  __m256d vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;

  vpinv = _mm256_set_pd(PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE,
                        PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE);
  vp = _mm256_set_pd(8383489., 8383489., 8383489., 8383489.);

  bitrev(out);

  vo10 = _mm256_load_pd(o+pos);
  vo20 = _mm256_load_pd(o+pos+4);
  neg2 = _mm256_load_pd(_neg2);
  neg4 = _mm256_load_pd(_neg4);

  // m = 2, m = 4, m = 8 (3 levels merged)
  for (s = 0; s < POLY_DEG; s += 8)
  {
    // No multiplication with omega required, respective value is 1
    vx0 = _mm256_load_pd(out+s);
    vt = _mm256_mul_pd(vx0, neg2);
    vx0 = _mm256_hadd_pd(vx0, vt);

    vx1 = _mm256_load_pd(out+s+4);
    vt = _mm256_mul_pd(vx1, neg2);
    vx1 = _mm256_hadd_pd(vx1, vt);

    vx0 = _mm256_mul_pd(vx0, vo10);
    vc = _mm256_mul_pd(vx0, vpinv);
    vc = _mm256_round_pd(vc, 0x08);
    vc = _mm256_mul_pd(vc, vp);
    vx0 = _mm256_sub_pd(vx0, vc);
    vt = _mm256_permute2f128_pd(vx0, vx0, 0x01); // now contains x2,x3,x0,x1
    vx0 = _mm256_mul_pd(vx0, neg4);
    vx0 = _mm256_add_pd(vx0, vt);

    vx1 = _mm256_mul_pd(vx1, vo10);
    vc = _mm256_mul_pd(vx1, vpinv);
    vc = _mm256_round_pd(vc, 0x08);
    vc = _mm256_mul_pd(vc, vp);
    vx1 = _mm256_sub_pd(vx1, vc);
    vt = _mm256_permute2f128_pd(vx1, vx1, 0x01); // now contains x2,x3,x0,x1
    vx1 = _mm256_mul_pd(vx1, neg4);
    vx1 = _mm256_add_pd(vx1, vt);

    vt = _mm256_mul_pd(vx1, vo20);
    vc = _mm256_mul_pd(vt, vpinv);
    vc = _mm256_round_pd(vc, 0x08);
    vc = _mm256_mul_pd(vc, vp);
    vt = _mm256_sub_pd(vt, vc);

    vx1 = _mm256_sub_pd(vx0, vt);
    _mm256_store_pd(out+s+4, vx1);
    vx0 = _mm256_add_pd(vx0, vt);
    _mm256_store_pd(out+s+0, vx0);
  }

  pos += 8;

  // m = 16, m = 32, m = 64 (3 levels merged)
  for (offset = 0; offset < 8; offset += 4)
  {
    vo0  = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+8);
    vo11 = _mm256_load_pd(o+pos+offset+16);

    for (s = 0; s < POLY_DEG; s += 64)
    {
      vx1 = _mm256_load_pd(out+offset+s+8);
      vt = _mm256_mul_pd(vx1, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);
      // _mm256_store_pd(out+offset+s+8, vx1);
      vx0 = _mm256_add_pd(vx0, vt);
      // _mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+24);
      vt = _mm256_mul_pd(vx3, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      vx2 = _mm256_load_pd(out+offset+s+16);
      vx3 = _mm256_sub_pd(vx2, vt);
      // _mm256_store_pd(out+offset+s+24, vx3);
      vx2 = _mm256_add_pd(vx2, vt);
      // _mm256_store_pd(out+offset+s+16, vx2);

      vx5 = _mm256_load_pd(out+offset+s+40);
      vt = _mm256_mul_pd(vx5, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      vx4 = _mm256_load_pd(out+offset+s+32);
      vx5 = _mm256_sub_pd(vx4, vt);
      // _mm256_store_pd(out+offset+s+40, vx5);
      vx4 = _mm256_add_pd(vx4, vt);
      // _mm256_store_pd(out+offset+s+32, vx4);

      vx7 = _mm256_load_pd(out+offset+s+56);
      vt = _mm256_mul_pd(vx7, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      vx6 = _mm256_load_pd(out+offset+s+48);
      vx7 = _mm256_sub_pd(vx6, vt);
      // _mm256_store_pd(out+offset+s+56, vx7);
      vx6 = _mm256_add_pd(vx6, vt);
      // _mm256_store_pd(out+offset+s+48, vx6);

      // vx2 = _mm256_load_pd(out+offset+s+16);
      vt = _mm256_mul_pd(vx2, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);
      // _mm256_store_pd(out+offset+s+16, vx2);
      vx0 = _mm256_add_pd(vx0, vt);
      // _mm256_store_pd(out+offset+s+0, vx0);

      // vx6 = _mm256_load_pd(out+offset+s+48);
      vt = _mm256_mul_pd(vx6, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx4 = _mm256_load_pd(out+offset+s+32);
      vx6 = _mm256_sub_pd(vx4, vt);
      // _mm256_store_pd(out+offset+s+48, vx6);
      vx4 = _mm256_add_pd(vx4, vt);
      // _mm256_store_pd(out+offset+s+32, vx4);

      // vx3 = _mm256_load_pd(out+offset+s+24);
      vt = _mm256_mul_pd(vx3, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx1 = _mm256_load_pd(out+offset+s+8);
      vx3 = _mm256_sub_pd(vx1, vt);
      // _mm256_store_pd(out+offset+s+24, vx3);
      vx1 = _mm256_add_pd(vx1, vt);
      // _mm256_store_pd(out+offset+s+8, vx1);

      // vx7 = _mm256_load_pd(out+offset+s+56);
      vt = _mm256_mul_pd(vx7, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx5 = _mm256_load_pd(out+offset+s+40);
      vx7 = _mm256_sub_pd(vx5, vt);
      // _mm256_store_pd(out+offset+s+56, vx7);
      vx5 = _mm256_add_pd(vx5, vt);
      // _mm256_store_pd(out+offset+s+40, vx5);

      // vx4 = _mm256_load_pd(out+offset+s+32);
      vo20 = _mm256_load_pd(o+pos+offset+24);
      vt = _mm256_mul_pd(vx4, vo20);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+32, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      // vx5 = _mm256_load_pd(out+offset+s+40);
      vo21 = _mm256_load_pd(o+pos+offset+32);
      vt = _mm256_mul_pd(vx5, vo21);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx1 = _mm256_load_pd(out+offset+s+8);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+40, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+8, vx1);

      // vx6 = _mm256_load_pd(out+offset+s+48);
      vo22 = _mm256_load_pd(o+pos+offset+40);
      vt = _mm256_mul_pd(vx6, vo22);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx2 = _mm256_load_pd(out+offset+s+16);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+48, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+16, vx2);

      // vx7 = _mm256_load_pd(out+offset+s+56);
      vo23 = _mm256_load_pd(o+pos+offset+48);
      vt = _mm256_mul_pd(vx7, vo23);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx3 = _mm256_load_pd(out+offset+s+24);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+56, vx7);
      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+24, vx3);
    }
  }

  pos += 56;

  // m = 128, m = 256, m = 512 (3 levels merged)
  for (offset = 0; offset < 64; offset += 4)
  {
    vo0  = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+64);
    vo11 = _mm256_load_pd(o+pos+offset+128);

    for (s = 0; s < POLY_DEG; s += 512)
    {
      vx1 = _mm256_load_pd(out+offset+s+64);
      vt = _mm256_mul_pd(vx1, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);
      // _mm256_store_pd(out+offset+s+64, vx1);
      vx0 = _mm256_add_pd(vx0, vt);
      // _mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+192);
      vt = _mm256_mul_pd(vx3, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      vx2 = _mm256_load_pd(out+offset+s+128);
      vx3 = _mm256_sub_pd(vx2, vt);
      // _mm256_store_pd(out+offset+s+192, vx3);
      vx2 = _mm256_add_pd(vx2, vt);
      // _mm256_store_pd(out+offset+s+128, vx2);

      vx5 = _mm256_load_pd(out+offset+s+320);
      vt = _mm256_mul_pd(vx5, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      vx4 = _mm256_load_pd(out+offset+s+256);
      vx5 = _mm256_sub_pd(vx4, vt);
      // _mm256_store_pd(out+offset+s+320, vx5);
      vx4 = _mm256_add_pd(vx4, vt);
      // _mm256_store_pd(out+offset+s+256, vx4);

      vx7 = _mm256_load_pd(out+offset+s+448);
      vt = _mm256_mul_pd(vx7, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      vx6 = _mm256_load_pd(out+offset+s+384);
      vx7 = _mm256_sub_pd(vx6, vt);
      // _mm256_store_pd(out+offset+s+448, vx7);
      vx6 = _mm256_add_pd(vx6, vt);
      // _mm256_store_pd(out+offset+s+384, vx6);

      // vx2 = _mm256_load_pd(out+offset+s+128);
      vt = _mm256_mul_pd(vx2, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);
      // _mm256_store_pd(out+offset+s+128, vx2);
      vx0 = _mm256_add_pd(vx0, vt);
      // _mm256_store_pd(out+offset+s+0, vx0);

      // vx3 = _mm256_load_pd(out+offset+s+192);
      vt = _mm256_mul_pd(vx3, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx1 = _mm256_load_pd(out+offset+s+64);
      vx3 = _mm256_sub_pd(vx1, vt);
      // _mm256_store_pd(out+offset+s+192, vx3);
      vx1 = _mm256_add_pd(vx1, vt);
      // _mm256_store_pd(out+offset+s+64, vx1);

      // vx6 = _mm256_load_pd(out+offset+s+384);
      vt = _mm256_mul_pd(vx6, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx4 = _mm256_load_pd(out+offset+s+256);
      vx6 = _mm256_sub_pd(vx4, vt);
      // _mm256_store_pd(out+offset+s+384, vx6);
      vx4 = _mm256_add_pd(vx4, vt);
      // _mm256_store_pd(out+offset+s+256, vx4);

      // vx7 = _mm256_load_pd(out+offset+s+448);
      vt = _mm256_mul_pd(vx7, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx5 = _mm256_load_pd(out+offset+s+320);
      vx7 = _mm256_sub_pd(vx5, vt);
      // _mm256_store_pd(out+offset+s+448, vx7);
      vx5 = _mm256_add_pd(vx5, vt);
      // _mm256_store_pd(out+offset+s+320, vx5);

      // vx4 = _mm256_load_pd(out+offset+s+256);
      vo20 = _mm256_load_pd(o+pos+offset+192);
      vt = _mm256_mul_pd(vx4, vo20);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+256, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      // vx5 = _mm256_load_pd(out+offset+s+320);
      vo21 = _mm256_load_pd(o+pos+offset+256);
      vt = _mm256_mul_pd(vx5, vo21);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx1 = _mm256_load_pd(out+offset+s+64);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+320, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+64, vx1);

      // vx6 = _mm256_load_pd(out+offset+s+384);
      vo22 = _mm256_load_pd(o+pos+offset+320);
      vt = _mm256_mul_pd(vx6, vo22);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx2 = _mm256_load_pd(out+offset+s+128);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+384, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+128, vx2);

      // vx7 = _mm256_load_pd(out+offset+s+448);
      vo23 = _mm256_load_pd(o+pos+offset+384);
      vt = _mm256_mul_pd(vx7, vo23);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc, 0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt, vc);
      // vx3 = _mm256_load_pd(out+offset+s+192);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+448, vx7);
      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+192, vx3);
    }
  }
}
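/*
 * ntt_transform() repeats one idiom after every multiplication by a twiddle
 * factor:
 *   vc = _mm256_mul_pd(vt, vpinv);   // t * (1/p), approximately t/p
 *   vc = _mm256_round_pd(vc, 0x08);  // round to nearest (0x08 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
 *   vc = _mm256_mul_pd(vc, vp);      // round(t/p) * p
 *   vt = _mm256_sub_pd(vt, vc);      // t - round(t/p) * p
 * i.e. a floating-point modular reduction by the modulus p = 8383489 that keeps
 * each lane roughly in (-p/2, p/2).  A scalar sketch of the same reduction
 * (illustration only, not part of the source; p_inv stands in for
 * PARAM_APPROX_P_INVERSE):
 */
#include <math.h>

static double reduce_mod_p(double t)
{
  const double p = 8383489.0;
  const double p_inv = 1.0 / p;        /* stand-in for PARAM_APPROX_P_INVERSE */
  double c = nearbyint(t * p_inv);     /* scalar analogue of _mm256_round_pd(.., 0x08) */
  return t - c * p;                    /* centered representative of t mod p */
}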
static inline __m256d
gmx_mm256_exp2_pd(__m256d x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp
     * representation exponent smaller than -1022 (the double-precision limit). */
    const __m256d arglimit = _mm256_set1_pd(1022.0);
    const __m128i expbase  = _mm_set1_epi32(1023);

    const __m256d P2 = _mm256_set1_pd(2.30933477057345225087e-2);
    const __m256d P1 = _mm256_set1_pd(2.02020656693165307700e1);
    const __m256d P0 = _mm256_set1_pd(1.51390680115615096133e3);
    /* Q2 == 1.0 */
    const __m256d Q1 = _mm256_set1_pd(2.33184211722314911771e2);
    const __m256d Q0 = _mm256_set1_pd(4.36821166879210612817e3);
    const __m256d one = _mm256_set1_pd(1.0);
    const __m256d two = _mm256_set1_pd(2.0);

    __m256d valuemask;
    __m256i iexppart;
    __m128i iexppart128a, iexppart128b;
    __m256d fexppart;
    __m256d intpart;
    __m256d z, z2;
    __m256d PolyP, PolyQ;

    iexppart128a = _mm256_cvtpd_epi32(x);
    intpart      = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);

    /* Add exponent bias */
    iexppart128a = _mm_add_epi32(iexppart128a, expbase);

    /* We now want to shift the exponent 52 positions left, but to achieve this we need
     * to separate the 128-bit register data into two registers (4x64-bit > 128bit),
     * shift them, and then merge into a single __m256d.
     * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
     * It doesn't matter what we put in the 2nd/4th position, since that data will be
     * shifted out and replaced with zeros.
     */
    iexppart128b = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
    iexppart128a = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));

    iexppart128b = _mm_slli_epi64(iexppart128b, 52);
    iexppart128a = _mm_slli_epi64(iexppart128a, 52);

    iexppart = _mm256_castsi128_si256(iexppart128a);
    iexppart = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);

    valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
    fexppart  = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));

    z  = _mm256_sub_pd(x, intpart);
    z2 = _mm256_mul_pd(z, z);

    PolyP = _mm256_mul_pd(P2, z2);
    PolyP = _mm256_add_pd(PolyP, P1);
    PolyQ = _mm256_add_pd(z2, Q1);
    PolyP = _mm256_mul_pd(PolyP, z2);
    PolyQ = _mm256_mul_pd(PolyQ, z2);
    PolyP = _mm256_add_pd(PolyP, P0);
    PolyQ = _mm256_add_pd(PolyQ, Q0);
    PolyP = _mm256_mul_pd(PolyP, z);

    z = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
    z = _mm256_add_pd(one, _mm256_mul_pd(two, z));

    z = _mm256_mul_pd(z, fexppart);

    return z;
}
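/*
 * A minimal test sketch (not part of the GROMACS sources): it assumes the
 * helpers referenced above (gmx_mm256_abs_pd, gmx_mm256_inv_pd) are declared
 * in the same header, and simply compares the vectorized 2^x against the
 * libm exp2() reference for a few arguments.  Compile with -mavx.
 */
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    double in[4] = { -3.5, 0.0, 1.25, 10.0 };
    double out[4];

    __m256d x = _mm256_loadu_pd(in);
    _mm256_storeu_pd(out, gmx_mm256_exp2_pd(x));

    for (int i = 0; i < 4; i++)
    {
        printf("exp2(%g): simd=%.12g libm=%.12g\n", in[i], out[i], exp2(in[i]));
    }
    return 0;
}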