mul(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
    // Packed complex multiply: two complex<double> values per 256-bit register.
    // NOTE(review): the return type is declared outside this chunk.
    //lhs = [x1.real, x1.img, x2.real, x2.img]
    //rhs = [y1.real, y1.img, y2.real, y2.img]

    //ymm1 = [y1.real, y1.real, y2.real, y2.real]
    __m256d ymm1 = _mm256_movedup_pd(rhs.value);

    //ymm2 = [x1.img, x1.real, x2.img, x2.real]  (swap within each 128-bit lane)
    __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

    //ymm3 = [y1.imag, y1.imag, y2.imag, y2.imag]
    __m256d ymm3 = _mm256_permute_pd(rhs.value, 0b1111);

    //ymm4 = ymm2 * ymm3 = [x.img*y.img, x.real*y.img] per complex pair
    __m256d ymm4 = _mm256_mul_pd(ymm2, ymm3);

    //result = [(lhs * ymm1) -+ ymm4];
    // fmaddsub subtracts in even lanes and adds in odd lanes, giving:
    //   even lanes: x.real*y.real - x.img*y.img   (real part)
    //   odd  lanes: x.img*y.real  + x.real*y.img  (imaginary part)
#ifdef __FMA__
    return _mm256_fmaddsub_pd(lhs.value, ymm1, ymm4);
#elif defined(__FMA4__)
    return _mm256_maddsub_pd(lhs.value, ymm1, ymm4);
#else
    // No fused multiply-add available: multiply, then alternating sub/add.
    __m256d tmp = _mm256_mul_pd(lhs.value, ymm1);
    return _mm256_addsub_pd(tmp, ymm4);
#endif
}
div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
    // Packed complex divide: x / y = (x * conj(y)) / |y|^2, two complex<double>
    // values per 256-bit register.
    // NOTE(review): the return type is declared outside this chunk.
    //lhs = [x1.real, x1.img, x2.real, x2.img]
    //rhs = [y1.real, y1.img, y2.real, y2.img]

    //ymm0 = [y1.real, y1.real, y2.real, y2.real]
    __m256d ymm0 = _mm256_movedup_pd(rhs.value);

    //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
    __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);

    //ymm2 = [x1.img, x1.real, x2.img, x2.real]  (swap within each 128-bit lane)
    __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

    //ymm4 = [x.img * y.img, x.real * y.img] per complex pair
    __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);

    //ymm5 = subadd((lhs * ymm0), ymm4)
    // fmsubadd adds in even lanes and subtracts in odd lanes, giving the
    // numerator x * conj(y):
    //   even lanes: x.real*y.real + x.img*y.img   (real part)
    //   odd  lanes: x.img*y.real  - x.real*y.img  (imaginary part)
#ifdef __FMA__
    __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
#else
    // addsub(t1, -ymm4) == fmsubadd(lhs, ymm0, ymm4):
    // even lanes t1 - (-ymm4) = t1 + ymm4, odd lanes t1 + (-ymm4) = t1 - ymm4.
    __m256d t1 = _mm256_mul_pd(lhs.value, ymm0);
    __m256d t2 = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
    __m256d ymm5 = _mm256_addsub_pd(t1, t2);
#endif

    //ymm3 = [y.imag^2, y.imag^2]
    __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);

    //ymm0 = (ymm0 * ymm0 + ymm3) = |y|^2 duplicated into both lanes of each pair
#ifdef __FMA__
    ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
#else
    __m256d t3 = _mm256_mul_pd(ymm0, ymm0);
    ymm0 = _mm256_add_pd(t3, ymm3);
#endif

    //result = ymm5 / ymm0
    return _mm256_div_pd(ymm5, ymm0);
}
inline float64x4_t mat4_mul_vec4(const float64x4_t ymm[4], const float64x4_t ymm_v) { float64x4_t perm0 = _mm256_permute_pd(ymm_v, 0x0); // x x y y float64x4_t perm1 = _mm256_permute_pd(ymm_v, 0xF); // z z w w float64x4_t bcast0 = _mm256_broadcast_pd(&_mm256_extractf128_pd(perm0, 0)); // x x x x float64x4_t bcast1 = _mm256_broadcast_pd(&_mm256_extractf128_pd(perm0, 1)); // y y y y float64x4_t bcast2 = _mm256_broadcast_pd(&_mm256_extractf128_pd(perm1, 0)); // z z z z float64x4_t bcast3 = _mm256_broadcast_pd(&_mm256_extractf128_pd(perm1, 1)); // w w w w float64x4_t mul0 = _mm256_mul_pd(ymm[0], bcast0); float64x4_t mul1 = _mm256_mul_pd(ymm[1], bcast1); float64x4_t mul2 = _mm256_mul_pd(ymm[2], bcast2); float64x4_t mul3 = _mm256_mul_pd(ymm[3], bcast3); float64x4_t add0 = _mm256_add_pd(mul0, mul1); float64x4_t add1 = _mm256_add_pd(mul2, mul3); float64x4_t add2 = _mm256_add_pd(add0, add1); return add2; }
// Slide the 8-element window {*v0, v1} left by W-1 = 3 positions: on return,
// *v0 = { v0[3], v1[0], v1[1], v1[2] }.
// Worked example traced below: v0 = {1,2,3,4}, v1 = {5,6,7,8} -> {4,5,6,7}.
inline void rotate_left_wm1(F64vec4 *v0, const F64vec4 v1) {
    // {1.0, 2.0, 3.0, 4.0};
    // {5.0, 6.0, 7.0, 8.0};
    const __m128d hiv0 = _mm256_extractf128_pd(*v0, 1); // {3.0, 4.0}
    const __m128d phiv0 = _mm_permute_pd(hiv0, 0x1); // {4.0, 3.0}
    // Mask 0x1 only swaps within the LOW lane, so this is {6.0, 5.0, 7.0, 7.0}
    // (not {6,5,8,7}); the stale element never reaches the final blend.
    const __m256d shufv1 = _mm256_permute_pd(v1, 0x1);
    const __m128d shufv1_lo = _mm256_extractf128_pd(shufv1, 0); // {6.0, 5.0}
    const __m128d shufv1_hi = _mm256_extractf128_pd(shufv1, 1); // {7.0, 7.0}
    const __m128d v1_blend = _mm_blend_pd(shufv1_lo, shufv1_hi, 0x2); // blend {6.0, 7.0};
    const __m256d inserted = _mm256_insertf128_pd(shufv1, v1_blend, 1); // insert {6.0, 5.0, 6.0, 7.0};
    // Element 0 comes from phiv0; the cast leaves its upper 128 bits undefined,
    // but mask 0xE reads only element 0 from that operand, so this is safe.
    const __m256d blended = _mm256_blend_pd(_mm256_castpd128_pd256(phiv0), inserted, 0xE); // {4.0, 5.0, 6.0, 7.0}
    *v0 = blended;
}
/*
 * Unconjugated complex dot product (BLAS zdotu) over arrays of structures:
 * dx and dy hold N complex doubles as interleaved [re, im] pairs.
 * Writes res[0] = Re(sum x*y), res[1] = Im(sum x*y) and returns res[0].
 *
 * NOTE(review): ix/iy look like BLAS increments but are unused — this kernel
 * assumes unit stride; confirm at call sites. Only the first 2*(N/2) complex
 * elements are processed (no scalar tail for odd N), matching the original.
 *
 * Fixes over the previous version:
 *  - the function was declared `double` but had no return statement (UB if
 *    the caller uses the value); it now returns the real part, res[0];
 *  - the do/while ran at least one iteration, reading out of bounds for
 *    N < 2; a for loop runs zero iterations and leaves res = {0, 0};
 *  - accumulators are read back with _mm256_storeu_pd instead of casting
 *    the __m256d's address to double*.
 */
double zdotu_aos(const int N, const double* dx, const int ix,
                 const double* dy, const int iy, double* res)
{
    __m256d acc_re = _mm256_setzero_pd(); // [xr*yr, xi*yi, ...] partial sums
    __m256d acc_im = _mm256_setzero_pd(); // [xr*yi, xi*yr, ...] partial sums

    // Each iteration consumes two complex numbers (4 doubles) per array.
    for (int ii = 0; ii < N / 2; ii++) {
        __m256d x = _mm256_loadu_pd(dx + 4 * ii);
        __m256d y = _mm256_loadu_pd(dy + 4 * ii);
        acc_re = _mm256_fmadd_pd(y, x, acc_re);
        // Swap re/im within each complex pair to form the cross products.
        __m256d yswap = _mm256_permute_pd(y, 0x5);
        acc_im = _mm256_fmadd_pd(yswap, x, acc_im);
    }

    double re[4], im[4];
    _mm256_storeu_pd(re, acc_re);
    _mm256_storeu_pd(im, acc_im);

    // Horizontal reduction: real part alternates sign (xr*yr - xi*yi),
    // imaginary part is a plain sum (xr*yi + xi*yr).
    res[0] = re[0] - re[1] + re[2] - re[3];
    res[1] = im[0] + im[1] + im[2] + im[3];
    return res[0];
}
/*
 * Demo of _mm256_permute_pd / _mm_permute_pd: prints the vectors before and
 * after permutation. Output format is unchanged from the original.
 *
 * Fix: the previous version read elements via the .m256d_f64 / .m128d_f64
 * union members, which exist only in MSVC's definition of __m256d/__m128d.
 * Elements are now read back through _mm256_storeu_pd / _mm_storeu_pd,
 * which is portable across GCC, Clang and MSVC.
 */
int main(void) {
    double buf4[4];
    double buf2[2];

    //_mm256_permute_pd
    __m256d da = _mm256_setr_pd(1, 2, 3, 4);
    _mm256_storeu_pd(buf4, da);
    printf("da: ");
    for (size_t i = 0; i < sizeof buf4 / sizeof buf4[0]; i++) printf("%5.1f ", buf4[i]);
    printf("\n");

    // Mask 0x02: low lane -> [a0, a1] (bit1 set picks a1... see note), high
    // lane -> [a2, a2]; store and print the permuted vector.
    __m256d dc = _mm256_permute_pd(da, 0x02);
    _mm256_storeu_pd(buf4, dc);
    printf("dc: ");
    for (size_t i = 0; i < sizeof buf4 / sizeof buf4[0]; i++) printf("%5.1f ", buf4[i]);
    printf("\n\n");

    //_mm_permute_pd
    __m128d fa = _mm_setr_pd(1, 2);
    _mm_storeu_pd(buf2, fa);
    printf("fa: ");
    for (size_t i = 0; i < sizeof buf2 / sizeof buf2[0]; i++) printf("%5.1f ", buf2[i]);
    printf("\n");

    // Mask 0x01 swaps the two elements.
    __m128d fc = _mm_permute_pd(fa, 0x01);
    _mm_storeu_pd(buf2, fc);
    printf("fc: ");
    for (size_t i = 0; i < sizeof buf2 / sizeof buf2[0]; i++) printf("%5.1f ", buf2[i]);
    printf("\n");

    return 0;
}
// Codegen test: imm8 = 5 = 0b0101 selects, within each 128-bit lane, the high
// element then the low element — i.e. it swaps the pair in each lane, which
// lowers to the IR shufflevector mask <1, 0, 3, 2>.
// The `CHECK:` comment below is a FileCheck directive and must not be edited.
__m256d test_mm256_permute_pd(__m256d a) {
    // Check if the mask is correct
    // CHECK: shufflevector{{.*}}<i32 1, i32 0, i32 3, i32 2>
    return _mm256_permute_pd(a, 5);
}