示例#1
0
    mul(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img, ...]
        //rhs = [y1.real, y1.img, y2.real, y2.img, ...]

        //ymm1 = [y1.real, y1.real, y2.real, y2.real, ...]
        __m256 ymm1 = _mm256_moveldup_ps(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

        //ymm3 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256 ymm3 = _mm256_movehdup_ps(rhs.value);

        //ymm4 = ymm2 * ymm3
        __m256 ymm4 = _mm256_mul_ps(ymm2, ymm3);

        //result = [(lhs * ymm1) -+ ymm4];

#ifdef __FMA__
        return _mm256_fmaddsub_ps(lhs.value, ymm1, ymm4);
#elif defined(__FMA4__)
        return _mm256_maddsub_ps(lhs.value, ymm1, ymm4);
#else
        __m256 tmp = _mm256_mul_ps(lhs.value, ymm1);
        return _mm256_addsub_ps(tmp, ymm4);
#endif
    }
	/****************************************************************
	 * This technique for efficient SIMD complex-complex multiplication was found at
	 *			https://software.intel.com/file/1000
	*****************************************************************/
	inline __m256 avx_multiply_float_complex_(const __m256& vecA, const __m256& vecB) {
		__m256 vec1 = _mm256_moveldup_ps(vecB);
		__m256 vec2 = _mm256_movehdup_ps(vecB);
		vec1 = _mm256_mul_ps(vecA,vec1);
		vec2 = _mm256_mul_ps(vecA,vec2); 
		vec2 = _mm256_permute_ps(vec2,0xB1); 
		return _mm256_addsub_ps(vec1,vec2);
	}
示例#3
0
    div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img ...]
        //rhs = [y1.real, y1.img, y2.real, y2.img ...]

        //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
        __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

        //ymm4 = [x.img * y.img, x.real * y.img]
        __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4)

#ifdef __FMA__
        __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
        __m256 t1    = _mm256_mul_ps(lhs.value, ymm0);
        __m256 t2    = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
        __m256 ymm5  = _mm256_addsub_ps(t1, t2);
#endif

        //ymm3 = [y.imag^2, y.imag^2]
        __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3)

#ifdef __FMA__
        ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
        __m256 t3    = _mm256_mul_ps(ymm0, ymm0);
        ymm0         = _mm256_add_ps(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_ps(ymm5, ymm0);
    }
	// PRE: all vectors aligned, 
	//		imag_c = [i1,i1,...,i4,i4]
	//		vec = [v1r,v1i,...,v4r,v4i]
	//		component-wise multiplication
	// POST: returns [-i1*v1i,i1*v1r,...,-i4*v4i,i4*v4r]
	inline __m256 avx_multiply_float_imag_(const __m256& imag_c, const __m256& vec) {
		static const __m256 zero = _mm256_setzero_ps();
		__m256 vec1 = _mm256_mul_ps(imag_c,vec);
		vec1 = _mm256_permute_ps(vec1,0xB1);
		return _mm256_addsub_ps(zero,vec1);
	}