div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
    // Packed complex single-precision division: lhs / rhs, element-wise.
    // Both vectors interleave real/imaginary parts:
    //   [z0.real, z0.imag, z1.real, z1.imag, ...]
    //
    // For (a + bi) / (c + di) the quotient is
    //   ((a*c + b*d) + (b*c - a*d)i) / (c*c + d*d)

    // Duplicate rhs parts across each complex pair:
    // re_rhs = [c, c, ...], im_rhs = [d, d, ...]
    __m256 re_rhs = _mm256_moveldup_ps(rhs.value);
    __m256 im_rhs = _mm256_movehdup_ps(rhs.value);

    // Swap real/imag within each pair of lhs: [b, a, ...]
    __m256 lhs_swapped = _mm256_permute_ps(lhs.value, 0b10110001);

    // cross = [b*d, a*d, ...]
    __m256 cross = _mm256_mul_ps(lhs_swapped, im_rhs);

    // Numerator: even lanes a*c + b*d (real), odd lanes b*c - a*d (imag).
    // fmsubadd adds the third operand in even lanes and subtracts it in
    // odd lanes; the fallback builds the same result with addsub on the
    // negated cross product.
#ifdef __FMA__
    __m256 numerator = _mm256_fmsubadd_ps(lhs.value, re_rhs, cross);
#else
    __m256 prod      = _mm256_mul_ps(lhs.value, re_rhs);
    __m256 neg_cross = _mm256_sub_ps(_mm256_set1_ps(0.0), cross);
    __m256 numerator = _mm256_addsub_ps(prod, neg_cross);
#endif

    // Denominator: |rhs|^2 = c*c + d*d, replicated into both lanes of
    // each complex pair.
    __m256 im_sq = _mm256_mul_ps(im_rhs, im_rhs);
#ifdef __FMA__
    __m256 denominator = _mm256_fmadd_ps(re_rhs, re_rhs, im_sq);
#else
    __m256 denominator = _mm256_add_ps(_mm256_mul_ps(re_rhs, re_rhs), im_sq);
#endif

    return _mm256_div_ps(numerator, denominator);
}
// FileCheck test: verifies that clang lowers _mm256_fmsubadd_ps by
// negating the addend (an fsub of each lane from -0.0) and emitting the
// llvm.x86.fma.vfmaddsub intrinsic, i.e. fmsubadd(a,b,c) == fmaddsub(a,b,-c).
// NOTE(review): the CHECK lines below are FileCheck directives matched
// against the compiler's IR output — they must not be reworded.
__m256 test_mm256_fmsubadd_ps(__m256 a, __m256 b, __m256 c) {
  // CHECK-LABEL: test_mm256_fmsubadd_ps
  // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
  // CHECK: @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
  return _mm256_fmsubadd_ps(a, b, c);
}
// FileCheck test: verifies that _mm256_fmsubadd_ps maps directly onto
// the llvm.x86.fma.vfmsubadd intrinsic in this lowering.
// NOTE(review): the CHECK line is a FileCheck directive — do not reword.
__m256 test_mm256_fmsubadd_ps(__m256 a, __m256 b, __m256 c) {
  // CHECK: @llvm.x86.fma.vfmsubadd.ps.256
  return _mm256_fmsubadd_ps(a, b, c);
}
/* Compile/codegen smoke test: forwards its three vector arguments to the
 * _mm256_fmsubadd_ps intrinsic and returns the result unchanged. */
__m256 check_mm256_fmsubadd_ps (__m256 a, __m256 b, __m256 c)
{
  __m256 result = _mm256_fmsubadd_ps (a, b, c);
  return result;
}