div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
    //lhs = [x1.real, x1.imag, x2.real, x2.imag]
    //rhs = [y1.real, y1.imag, y2.real, y2.imag]

    //ymm0 = [y1.real, y1.real, y2.real, y2.real]
    __m256d ymm0 = _mm256_movedup_pd(rhs.value);

    //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
    __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);

    //ymm2 = [x1.imag, x1.real, x2.imag, x2.real]
    __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

    //ymm4 = [x1.imag * y1.imag, x1.real * y1.imag, x2.imag * y2.imag, x2.real * y2.imag]
    __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);

    //ymm5 = subadd(lhs * ymm0, ymm4)
    //     = [x.real * y.real + x.imag * y.imag, x.imag * y.real - x.real * y.imag] per complex
#ifdef __FMA__
    __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
#else
    //without FMA: negate ymm4, then addsub (even lanes subtract, odd lanes add)
    __m256d t1 = _mm256_mul_pd(lhs.value, ymm0);
    __m256d t2 = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
    __m256d ymm5 = _mm256_addsub_pd(t1, t2);
#endif

    //ymm3 = [y1.imag^2, y1.imag^2, y2.imag^2, y2.imag^2]
    __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);

    //ymm0 = ymm0 * ymm0 + ymm3 = [|y1|^2, |y1|^2, |y2|^2, |y2|^2]
#ifdef __FMA__
    ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
#else
    __m256d t3 = _mm256_mul_pd(ymm0, ymm0);
    ymm0 = _mm256_add_pd(t3, ymm3);
#endif

    //result = ymm5 / ymm0
    return _mm256_div_pd(ymm5, ymm0);
}
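For reference, a minimal standalone sketch of the same division checked against std::complex. It repeats the non-FMA intrinsic sequence on raw __m256d values so it does not depend on the avx_simd_complex_double<T> wrapper; the function name complex_div_pd and the test values are illustrative only, and it assumes compilation with AVX enabled (e.g. -mavx).

#include <immintrin.h>
#include <complex>
#include <cstdio>

// Same intrinsic sequence as div() above, but on raw __m256d values
// (non-FMA path only). Name and values are illustrative.
__m256d complex_div_pd(__m256d lhs, __m256d rhs) {
    __m256d ymm0 = _mm256_movedup_pd(rhs);                   // [y1.r, y1.r, y2.r, y2.r]
    __m256d ymm1 = _mm256_permute_pd(rhs, 0b1111);           // [y1.i, y1.i, y2.i, y2.i]
    __m256d ymm2 = _mm256_permute_pd(lhs, 0b0101);           // [x1.i, x1.r, x2.i, x2.r]
    __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);
    __m256d t1   = _mm256_mul_pd(lhs, ymm0);
    __m256d t2   = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
    __m256d num  = _mm256_addsub_pd(t1, t2);                 // x * conj(y), interleaved
    __m256d den  = _mm256_add_pd(_mm256_mul_pd(ymm0, ymm0),
                                 _mm256_mul_pd(ymm1, ymm1)); // |y|^2 in both lanes of each pair
    return _mm256_div_pd(num, den);
}

int main() {
    std::complex<double> x[2] = {{1.0, 2.0}, {-3.0, 0.5}};
    std::complex<double> y[2] = {{0.25, -1.0}, {4.0, 2.0}};
    double out[4];
    _mm256_storeu_pd(out, complex_div_pd(
        _mm256_loadu_pd(reinterpret_cast<const double*>(x)),
        _mm256_loadu_pd(reinterpret_cast<const double*>(y))));
    for (int i = 0; i < 2; ++i) {
        std::complex<double> e = x[i] / y[i];
        std::printf("got (%g, %g), expected (%g, %g)\n",
                    out[2 * i], out[2 * i + 1], e.real(), e.imag());
    }
}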
__m256d test_mm256_fmsubadd_pd(__m256d a, __m256d b, __m256d c) {
  // CHECK-LABEL: test_mm256_fmsubadd_pd
  // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
  // CHECK: @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
  return _mm256_fmsubadd_pd(a, b, c);
}
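The equivalence those CHECK lines encode, written out as intrinsics (a sketch, not part of the test file): _mm256_fmsubadd_pd(a, b, c) can be lowered as _mm256_fmaddsub_pd(a, b, -c), which is why the IR negates c and then calls the fmaddsub intrinsic. The function name is illustrative; build with FMA enabled (e.g. -mfma).

#include <immintrin.h>

// Illustrative only: mirrors the lowering asserted by the CHECK lines above.
__m256d fmsubadd_via_fmaddsub(__m256d a, __m256d b, __m256d c) {
    __m256d neg_c = _mm256_xor_pd(c, _mm256_set1_pd(-0.0)); // flip each lane's sign bit (the IR uses fsub -0.0, c)
    return _mm256_fmaddsub_pd(a, b, neg_c);                 // even lanes: a*b - (-c) = a*b + c
                                                            // odd lanes:  a*b + (-c) = a*b - c
}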
__m256d test_mm256_fmsubadd_pd(__m256d a, __m256d b, __m256d c) {
  // CHECK: @llvm.x86.fma.vfmsubadd.pd.256
  return _mm256_fmsubadd_pd(a, b, c);
}
__m256d __attribute__((__target__("fma")))
mm256_fmsubadd_pd_wrap(__m256d a, __m256d b, __m256d c) {
  return _mm256_fmsubadd_pd(a, b, c);
}
__m256d check_mm256_fmsubadd_pd (__m256d a, __m256d b, __m256d c) {
  return _mm256_fmsubadd_pd (a, b, c);
}
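Finally, a small runtime sanity check (not part of the test files above) for the documented semantics of _mm256_fmsubadd_pd: even lanes compute a*b + c, odd lanes a*b - c. The input values are arbitrary; it assumes an FMA-capable CPU and FMA enabled at compile time (e.g. -mfma).

#include <immintrin.h>
#include <cstdio>

int main() {
    double a[4] = {1.5, -2.0, 3.0, 0.25};
    double b[4] = {4.0, 0.5, -1.0, 8.0};
    double c[4] = {0.75, 2.5, -3.0, 1.0};
    double r[4];

    _mm256_storeu_pd(r, _mm256_fmsubadd_pd(_mm256_loadu_pd(a),
                                           _mm256_loadu_pd(b),
                                           _mm256_loadu_pd(c)));

    for (int i = 0; i < 4; ++i) {
        double expected = (i % 2 == 0) ? a[i] * b[i] + c[i]   // even lane: add c
                                       : a[i] * b[i] - c[i];  // odd lane: subtract c
        std::printf("lane %d: got %g, expected %g\n", i, r[i], expected);
    }
}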