Example #1
0
    mul(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
        // Packed complex multiply: two double-precision complex values per
        // 256-bit vector, (a+bi)(c+di) = (ac - bd) + (ad + bc)i.
        // NOTE(review): the template header / return type line is above this
        // chunk and not visible here.
        //lhs = [x1.real, x1.img, x2.real, x2.img]
        //rhs = [y1.real, y1.img, y2.real, y2.img]

        //ymm1 = [y1.real, y1.real, y2.real, y2.real] (movedup duplicates even elements)
        __m256d ymm1 = _mm256_movedup_pd(rhs.value);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real] (swap the pair inside each 128-bit lane)
        __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

        //ymm3 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256d ymm3 = _mm256_permute_pd(rhs.value, 0b1111);

        //ymm4 = ymm2 * ymm3 = [x.img*y.img, x.real*y.img, ...]
        __m256d ymm4 = _mm256_mul_pd(ymm2, ymm3);

        //result = [(lhs * ymm1) -+ ymm4]: even elements subtract (real part
        //ac - bd), odd elements add (imaginary part ad + bc).

#ifdef __FMA__
        // FMA3: fused multiply then alternating sub/add in one instruction.
        return _mm256_fmaddsub_pd(lhs.value, ymm1, ymm4);
#elif defined(__FMA4__)
        // FMA4 spelling of the same operation.
        return _mm256_maddsub_pd(lhs.value, ymm1, ymm4);
#else
        // Portable fallback: separate multiply, then addsub.
        __m256d tmp = _mm256_mul_pd(lhs.value, ymm1);
        return _mm256_addsub_pd(tmp, ymm4);
#endif
    }
Example #2
0
    div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
        // Packed complex divide: (a+bi)/(c+di) =
        //   [(ac + bd) + (bc - ad)i] / (c^2 + d^2),
        // two double-precision complex values per 256-bit vector.
        // NOTE(review): the template header / return type line is above this
        // chunk and not visible here. Also note: unlike mul above, there is no
        // __FMA4__ branch here — presumably an oversight; confirm before adding.
        //lhs = [x1.real, x1.img, x2.real, x2.img]
        //rhs = [y1.real, y1.img, y2.real, y2.img]

        //ymm0 = [y1.real, y1.real, y2.real, y2.real]
        __m256d ymm0 = _mm256_movedup_pd(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

        //ymm4 = [x.img * y.img, x.real * y.img] per lane
        __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4): even elements add (ac + bd),
        //odd elements subtract (bc - ad) -> the two numerators.

#ifdef __FMA__
        __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
#else
        // Fallback: negate ymm4, then addsub (even: t1 - t2 = ac + bd,
        // odd: t1 + t2 = bc - ad) — same result as fmsubadd.
        __m256d t1   = _mm256_mul_pd(lhs.value, ymm0);
        __m256d t2   = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
        __m256d ymm5 = _mm256_addsub_pd(t1, t2);
#endif

        //ymm3 = [y.imag^2, y.imag^2]
        __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3) = c^2 + d^2, the common denominator

#ifdef __FMA__
        ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
#else
        __m256d t3   = _mm256_mul_pd(ymm0, ymm0);
        ymm0         = _mm256_add_pd(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_pd(ymm5, ymm0);
    }
Example #3
0
		inline float64x4_t mat4_mul_vec4(const float64x4_t ymm[4], const float64x4_t ymm_v)
		{
			// 4x4 double matrix * vector: result = x*ymm[0] + y*ymm[1] + z*ymm[2] + w*ymm[3],
			// so ymm[0..3] must hold the matrix columns (required by this
			// broadcast-and-accumulate formulation).
			//
			// Fixes vs. the original:
			// 1) _mm256_permute_pd shuffles within each 128-bit lane, so the old
			//    perm0/perm1 were [x x z z] / [y y w w] (not the commented
			//    [x x y y] / [z z w w]); the y and z broadcasts were swapped,
			//    multiplying columns 1 and 2 by the wrong components.
			// 2) &_mm256_extractf128_pd(...) took the address of an rvalue — an
			//    MSVC-only extension, ill-formed in standard C/C++. Replaced by
			//    register-only lane shuffles.

			// Duplicate the low / high 128-bit lane across the register.
			float64x4_t lane_xy = _mm256_permute2f128_pd(ymm_v, ymm_v, 0x00); // x y x y
			float64x4_t lane_zw = _mm256_permute2f128_pd(ymm_v, ymm_v, 0x11); // z w z w

			float64x4_t bcast0 = _mm256_permute_pd(lane_xy, 0x0); // x x x x
			float64x4_t bcast1 = _mm256_permute_pd(lane_xy, 0xF); // y y y y
			float64x4_t bcast2 = _mm256_permute_pd(lane_zw, 0x0); // z z z z
			float64x4_t bcast3 = _mm256_permute_pd(lane_zw, 0xF); // w w w w

			float64x4_t mul0 = _mm256_mul_pd(ymm[0], bcast0);
			float64x4_t mul1 = _mm256_mul_pd(ymm[1], bcast1);
			float64x4_t mul2 = _mm256_mul_pd(ymm[2], bcast2);
			float64x4_t mul3 = _mm256_mul_pd(ymm[3], bcast3);

			float64x4_t add0 = _mm256_add_pd(mul0, mul1);
			float64x4_t add1 = _mm256_add_pd(mul2, mul3);

			return _mm256_add_pd(add0, add1);
		}
Example #4
0
inline void rotate_left_wm1(F64vec4 *v0, const F64vec4 v1)
{
    // Shifts the 8-element window {*v0, v1} left by w-1 == 3 positions and
    // stores the first four results back into *v0:
    //   *v0 = [v0[3], v1[0], v1[1], v1[2]]
    // e.g. with the inputs below, *v0 becomes {4.0, 5.0, 6.0, 7.0}.
    // {1.0, 2.0, 3.0, 4.0};
    // {5.0, 6.0, 7.0, 8.0};

    const __m128d hiv0      = _mm256_extractf128_pd(*v0, 1); // {3.0, 4.0}
    const __m128d phiv0     = _mm_permute_pd(hiv0, 0x1); // {4.0, 3.0}
    const __m256d shufv1    = _mm256_permute_pd(v1, 0x1); // {6.0, 5.0, 8.0, 7.0};
    const __m128d shufv1_lo = _mm256_extractf128_pd(shufv1, 0); // {6.0, 5.0}
    const __m128d shufv1_hi = _mm256_extractf128_pd(shufv1, 1); // {8.0, 7.0}
    const __m128d v1_blend  = _mm_blend_pd(shufv1_lo, shufv1_hi, 0x2); // blend   {6.0, 7.0};
    const __m256d inserted  = _mm256_insertf128_pd(shufv1, v1_blend, 1); // insert  {6.0, 5.0, 6.0, 7.0};
    // Blend mask 0xE: element 0 from phiv0 (the old v0[3]), elements 1-3 from
    // inserted (v1[0], v1[1], v1[2]). The upper half of the cast is ignored.
    const __m256d blended   = _mm256_blend_pd(_mm256_castpd128_pd256(phiv0), inserted, 0xE);
    *v0                     = blended;
}
Example #5
0
 double zdotu_aos(
                const int    N,
                const double* dx,
                const int    ix,
                const double* dy,
                const int    iy,
		double*  res
                )
{
	__m256d ymm0;
	__m256d ymm1;
	__m256d ymm2;
	__m256d ymm3;
	__m256d ymm4 = _mm256_setzero_pd();
	__m256d ymm5 = _mm256_setzero_pd();
	//
	int ii = 0;
	//for(ii = 0; ii < N/2; ii++)
	do
	{
		//IACA_START;
		ymm0 = _mm256_loadu_pd(dx + 4*ii);	
		ymm1 = _mm256_loadu_pd(dy + 4*ii);	
		//
		ymm4 = _mm256_fmadd_pd(ymm1, ymm0, ymm4);
		ymm2 = _mm256_permute_pd(ymm1, 0x5);
		ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5);
		ii++;
		//
	} while (ii < N/2);
	//IACA_END
	double* re = (double*)&ymm4;
	double* im = (double*)&ymm5;
	res[0] = re[0] - re[1] + re[2] - re[3];
	res[1] = im[0] + im[1] + im[2] + im[3];
}
Example #6
0
int
main(void)
{
    /*
     * Demonstrates _mm256_permute_pd and _mm_permute_pd by printing the
     * vectors before and after the shuffle.
     *
     * Fixed: the original read vector elements via da.m256d_f64[i] — an
     * MSVC-specific union member that does not exist with GCC/Clang. Store
     * the vectors to plain double arrays with _mm256_storeu_pd/_mm_storeu_pd
     * instead (also avoids the signed/unsigned comparison in the loops).
     * The printed output is unchanged.
     */
    double buf4[4];
    double buf2[2];

    //_mm256_permute_pd
    __m256d da = _mm256_setr_pd(1, 2, 3, 4);

    _mm256_storeu_pd(buf4, da);
    printf("da: ");
    for (size_t i = 0; i < sizeof buf4 / sizeof buf4[0]; i++)
        printf("%5.1f  ", buf4[i]);
    printf("\n");

    // imm 0x02 = 0b0010, applied per 128-bit lane:
    // lane 0 -> [a0, a1], lane 1 -> [a2, a2]  =>  {1, 2, 3, 3}
    __m256d dc = _mm256_permute_pd(da, 0x02);

    _mm256_storeu_pd(buf4, dc);
    printf("dc: ");
    for (size_t i = 0; i < sizeof buf4 / sizeof buf4[0]; i++)
        printf("%5.1f  ", buf4[i]);
    printf("\n\n");


    //_mm_permute_pd
    __m128d fa = _mm_setr_pd(1, 2);

    _mm_storeu_pd(buf2, fa);
    printf("fa: ");
    for (size_t i = 0; i < sizeof buf2 / sizeof buf2[0]; i++)
        printf("%5.1f  ", buf2[i]);
    printf("\n");

    // imm 0x01 swaps the pair  =>  {2, 1}
    __m128d fc = _mm_permute_pd(fa, 0x01);

    _mm_storeu_pd(buf2, fc);
    printf("fc: ");
    for (size_t i = 0; i < sizeof buf2 / sizeof buf2[0]; i++)
        printf("%5.1f  ", buf2[i]);
    printf("\n");

    return 0;
}
__m256d test_mm256_permute_pd(__m256d a) {
  // Compiler codegen test (FileCheck): imm 5 = 0b0101 swaps the two doubles
  // inside each 128-bit lane, i.e. [a1, a0, a3, a2] — hence the expected
  // shufflevector mask below. Do not edit the CHECK line.
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 1, i32 0, i32 3, i32 2>
  return _mm256_permute_pd(a, 5);
}