mul(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
    //lhs = [x1.real, x1.img, x2.real, x2.img, ...]
    //rhs = [y1.real, y1.img, y2.real, y2.img, ...]

    //ymm1 = [y1.real, y1.real, y2.real, y2.real, ...]
    __m256 ymm1 = _mm256_moveldup_ps(rhs.value);

    //ymm2 = [x1.img, x1.real, x2.img, x2.real, ...]
    __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

    //ymm3 = [y1.imag, y1.imag, y2.imag, y2.imag, ...]
    __m256 ymm3 = _mm256_movehdup_ps(rhs.value);

    //ymm4 = ymm2 * ymm3
    __m256 ymm4 = _mm256_mul_ps(ymm2, ymm3);

    //result = [(lhs * ymm1) -+ ymm4]
#ifdef __FMA__
    return _mm256_fmaddsub_ps(lhs.value, ymm1, ymm4);
#elif defined(__FMA4__)
    return _mm256_maddsub_ps(lhs.value, ymm1, ymm4);
#else
    __m256 tmp = _mm256_mul_ps(lhs.value, ymm1);
    return _mm256_addsub_ps(tmp, ymm4);
#endif
}
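// Note: per complex lane this realizes the textbook product
//   (a + bi)(c + di) = (ac - bd) + (ad + bc)i
// with lhs = [a, b, ...] and rhs = [c, d, ...]:
//   ymm1 = [c, c, ...], ymm2 = [b, a, ...], ymm3 = [d, d, ...], ymm4 = [b*d, a*d, ...]
// and fmaddsub(lhs, ymm1, ymm4) = [a*c - b*d, b*c + a*d, ...].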
/****************************************************************
 * This technique for efficient SIMD complex-complex multiplication
 * was found at https://software.intel.com/file/1000
 *****************************************************************/
inline __m256 avx_multiply_float_complex_(const __m256& vecA, const __m256& vecB) {
    __m256 vec1 = _mm256_moveldup_ps(vecB);   // [b.real, b.real, ...]
    __m256 vec2 = _mm256_movehdup_ps(vecB);   // [b.imag, b.imag, ...]
    vec1 = _mm256_mul_ps(vecA, vec1);         // [a.real*b.real, a.imag*b.real, ...]
    vec2 = _mm256_mul_ps(vecA, vec2);         // [a.real*b.imag, a.imag*b.imag, ...]
    vec2 = _mm256_permute_ps(vec2, 0xB1);     // swap pairs: [a.imag*b.imag, a.real*b.imag, ...]
    return _mm256_addsub_ps(vec1, vec2);      // [a.real*b.real - a.imag*b.imag, a.imag*b.real + a.real*b.imag, ...]
}
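// --------------------------------------------------------------------------
// Illustrative check (added, not part of the original source): feeds four
// std::complex<float> pairs through avx_multiply_float_complex_ and compares
// the result with scalar complex multiplication. The helper name
// check_complex_mul_, the test values, and the tolerance are assumptions
// made for this sketch; the includes may duplicate ones already in the file.
// --------------------------------------------------------------------------
#include <immintrin.h>
#include <complex>
#include <cmath>

inline bool check_complex_mul_() {
    alignas(32) float a[8] = {1.f, 2.f, -3.f, 0.5f, 0.f, 1.f, 4.f, -4.f};
    alignas(32) float b[8] = {0.5f, -1.f, 2.f, 2.f, 1.f, 1.f, -0.25f, 3.f};
    alignas(32) float r[8];

    __m256 va = _mm256_load_ps(a);
    __m256 vb = _mm256_load_ps(b);
    _mm256_store_ps(r, avx_multiply_float_complex_(va, vb));

    for (int i = 0; i < 4; ++i) {
        std::complex<float> expected =
            std::complex<float>(a[2 * i], a[2 * i + 1]) *
            std::complex<float>(b[2 * i], b[2 * i + 1]);
        if (std::abs(r[2 * i] - expected.real()) > 1e-5f ||
            std::abs(r[2 * i + 1] - expected.imag()) > 1e-5f) {
            return false;
        }
    }
    return true;
}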
div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
    //lhs = [x1.real, x1.img, x2.real, x2.img, ...]
    //rhs = [y1.real, y1.img, y2.real, y2.img, ...]

    //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
    __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

    //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag, ...]
    __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

    //ymm2 = [x1.img, x1.real, x2.img, x2.real, ...]
    __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

    //ymm4 = [x.img * y.img, x.real * y.img, ...]
    __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

    //ymm5 = subadd((lhs * ymm0), ymm4)
#ifdef __FMA__
    __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
    __m256 t1 = _mm256_mul_ps(lhs.value, ymm0);
    __m256 t2 = _mm256_sub_ps(_mm256_set1_ps(0.0f), ymm4);
    __m256 ymm5 = _mm256_addsub_ps(t1, t2);
#endif

    //ymm3 = [y.imag^2, y.imag^2, ...]
    __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

    //ymm0 = ymm0 * ymm0 + ymm3
#ifdef __FMA__
    ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
    __m256 t3 = _mm256_mul_ps(ymm0, ymm0);
    ymm0 = _mm256_add_ps(t3, ymm3);
#endif

    //result = ymm5 / ymm0
    return _mm256_div_ps(ymm5, ymm0);
}
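// Note: this is the standard quotient x / y = (x * conj(y)) / |y|^2.
// With x = a + bi and y = c + di, ymm5 holds the numerator [a*c + b*d, b*c - a*d, ...]
// and ymm0 the broadcast denominator [c^2 + d^2, c^2 + d^2, ...], so the final
// division yields [(a*c + b*d) / (c^2 + d^2), (b*c - a*d) / (c^2 + d^2), ...].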
// PRE:  all vectors aligned,
//       imag_c = [i1, i1, ..., i4, i4]
//       vec    = [v1r, v1i, ..., v4r, v4i]
// Component-wise multiplication by a purely imaginary constant.
// POST: returns [-i1*v1i, i1*v1r, ..., -i4*v4i, i4*v4r]
inline __m256 avx_multiply_float_imag_(const __m256& imag_c, const __m256& vec) {
    static const __m256 zero = _mm256_setzero_ps();
    __m256 vec1 = _mm256_mul_ps(imag_c, vec);   // [i*vr, i*vi, ...]
    vec1 = _mm256_permute_ps(vec1, 0xB1);       // swap pairs: [i*vi, i*vr, ...]
    return _mm256_addsub_ps(zero, vec1);        // [-i*vi, +i*vr, ...]
}
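// --------------------------------------------------------------------------
// Illustrative usage (added, not part of the original source): scaling four
// packed complex floats by a purely imaginary constant c*i. The helper name
// scale_by_imag_ is an assumption made for this sketch.
// --------------------------------------------------------------------------
inline __m256 scale_by_imag_(float c, const __m256& vec) {
    // Broadcast the imaginary coefficient so every slot holds c, matching
    // the PRE condition imag_c = [c, c, ..., c, c].
    __m256 imag_c = _mm256_set1_ps(c);
    // (c*i) * (vr + vi*i) = -c*vi + c*vr*i, exactly the POST layout above.
    return avx_multiply_float_imag_(imag_c, vec);
}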