Exemple #1
0
    mul(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img, ...]
        //rhs = [y1.real, y1.img, y2.real, y2.img, ...]

        //ymm1 = [y1.real, y1.real, y2.real, y2.real, ...]
        __m256 ymm1 = _mm256_moveldup_ps(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

        //ymm3 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256 ymm3 = _mm256_movehdup_ps(rhs.value);

        //ymm4 = ymm2 * ymm3
        __m256 ymm4 = _mm256_mul_ps(ymm2, ymm3);

        //result = [(lhs * ymm1) -+ ymm4];

#ifdef __FMA__
        return _mm256_fmaddsub_ps(lhs.value, ymm1, ymm4);
#elif defined(__FMA4__)
        return _mm256_maddsub_ps(lhs.value, ymm1, ymm4);
#else
        __m256 tmp = _mm256_mul_ps(lhs.value, ymm1);
        return _mm256_addsub_ps(tmp, ymm4);
#endif
    }
	/****************************************************************
	 * This technique for efficient SIMD complex-complex multiplication was found at
	 *			https://software.intel.com/file/1000
	*****************************************************************/
	inline __m256 avx_multiply_float_complex_(const __m256& vecA, const __m256& vecB) {
		__m256 vec1 = _mm256_moveldup_ps(vecB);
		__m256 vec2 = _mm256_movehdup_ps(vecB);
		vec1 = _mm256_mul_ps(vecA,vec1);
		vec2 = _mm256_mul_ps(vecA,vec2); 
		vec2 = _mm256_permute_ps(vec2,0xB1); 
		return _mm256_addsub_ps(vec1,vec2);
	}
Exemple #3
0
/* AVX implementation for complex reciprocal */
inline __m256 srslte_mat_cf_recip_avx(__m256 a) {
  __m256 conj = _MM256_CONJ_PS(a);
  __m256 sqabs = _mm256_mul_ps(a, a);
  sqabs = _mm256_add_ps(_mm256_movehdup_ps(sqabs), _mm256_moveldup_ps(sqabs));

  __m256 recp = _mm256_rcp_ps(sqabs);

  return _mm256_mul_ps(recp, conj);
}
Exemple #4
0
    div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img ...]
        //rhs = [y1.real, y1.img, y2.real, y2.img ...]

        //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
        __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

        //ymm4 = [x.img * y.img, x.real * y.img]
        __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4)

#ifdef __FMA__
        __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
        __m256 t1    = _mm256_mul_ps(lhs.value, ymm0);
        __m256 t2    = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
        __m256 ymm5  = _mm256_addsub_ps(t1, t2);
#endif

        //ymm3 = [y.imag^2, y.imag^2]
        __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3)

#ifdef __FMA__
        ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
        __m256 t3    = _mm256_mul_ps(ymm0, ymm0);
        ymm0         = _mm256_add_ps(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_ps(ymm5, ymm0);
    }
Exemple #5
0
template<> INLINE avxb shuffle<0,0,2,2>(const avxb& b) {
  return _mm256_moveldup_ps(b);
}
Exemple #6
0
template<> INLINE const avxi shuffle<0, 0, 2, 2>( const avxi& b ) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b))); }