/* Minimal SSE3 test: exercises _mm_addsub_pd and _mm_movedup_pd; the volatile
 * qualifiers keep the compiler from optimizing the intrinsic calls away. */
#include <pmmintrin.h>

int main(int, char**)
{
    volatile __m128d a = _mm_set1_pd(6.28);
    volatile __m128d b = _mm_set1_pd(3.14);
    volatile __m128d result = _mm_addsub_pd(a, b);
    result = _mm_movedup_pd(result);
    return 0;
}
static __inline __m128d ZMUL(__m128d a, __m128d b)
{
    __m128d ar, ai;

    ar = _mm_movedup_pd(a);        /* ar = [a.r, a.r]         */
    ar = _mm_mul_pd(ar, b);        /* ar = [a.r*b.r, a.r*b.i] */
    ai = _mm_unpackhi_pd(a, a);    /* ai = [a.i, a.i]         */
    b  = _mm_shuffle_pd(b, b, 1);  /* b  = [b.i, b.r]         */
    ai = _mm_mul_pd(ai, b);        /* ai = [a.i*b.i, a.i*b.r] */

    return _mm_addsub_pd(ar, ai);  /* [a.r*b.r - a.i*b.i, a.r*b.i + a.i*b.r] */
}
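The comment on the return line spells out the complex-product formula; a quick standalone check makes the lane layout concrete. This is a minimal sketch, assuming the ZMUL helper above is visible in the same translation unit and that the real part sits in the low lane; the driver and the expected values are illustrative, not part of the original source.

/* Usage sketch for ZMUL(): (1 + 2i) * (3 + 4i) = -5 + 10i */
#include <stdio.h>
#include <pmmintrin.h>

int main(void)
{
    __m128d a = _mm_set_pd(2.0, 1.0);  /* low lane = real = 1, high lane = imag = 2 */
    __m128d b = _mm_set_pd(4.0, 3.0);  /* 3 + 4i */
    double z[2];
    _mm_storeu_pd(z, ZMUL(a, b));
    printf("%g%+gi\n", z[0], z[1]);    /* prints -5+10i */
    return 0;
}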
#include <caml/mlvalues.h>
#include <caml/alloc.h>
#include <caml/memory.h>
#include <pmmintrin.h>

/* vab = [a; b] and vcd = [c; d] are OCaml float arrays holding (re, im). */
value complex_mul(value vab, value vcd)
{
    CAMLparam2(vab, vcd);
    CAMLlocal1(vz);
    vz = caml_alloc(2 * Double_wosize, Double_array_tag);

    __m128d ab, cd, dc, aa, bb, ac_ad, bd_bc;
    ab = _mm_loadu_pd((double const *) vab);    /* [a, b] */
    cd = _mm_loadu_pd((double const *) vcd);    /* [c, d] */
    aa = _mm_movedup_pd(ab);                    /* [a, a] */
    bb = _mm_unpackhi_pd(ab, ab);               /* [b, b] */
    dc = _mm_shuffle_pd(cd, cd, 1);             /* [d, c] */
    ac_ad = _mm_mul_pd(aa, cd);                 /* [a*c, a*d] */
    bd_bc = _mm_mul_pd(bb, dc);                 /* [b*d, b*c] */
    /* [a*c - b*d, a*d + b*c] */
    _mm_storeu_pd((double *) vz, _mm_addsub_pd(ac_ad, bd_bc));
    CAMLreturn(vz);
}
dcomplex zdotc_( int* n, dcomplex* x, int* inc_x, dcomplex* z, int* inc_z )
{
    dcomplex* restrict x1;
    dcomplex* restrict z1;
    int       i;
    v2df_t    rho1v;
    v2df_t    z11v, z12v;
    v2df_t    x1v, x1rv;
    dcomplex  rho;
    int       n1   = *n;
    int       incx = *inc_x;
    int       incz = *inc_z;

    x1 = x;
    z1 = z;

    rho1v.v = _mm_setzero_pd();

    {
        v2df_t bcac, adbd;

        for ( i = 0; i < n1; ++i )
        {
            z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
            z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );

            x1v.v  = _mm_load_pd( ( double* )x1 );
            x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );

            bcac.v = x1rv.v * z11v.v;
            adbd.v = x1v.v  * z12v.v;
            rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );

            x1 += incx;
            z1 += incz;
        }

        rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
        rho1v.d[1] = -rho1v.d[1];
    }

    rho.real = rho1v.d[0];
    rho.imag = rho1v.d[1];

    return rho;
}
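The vector loop accumulates, per element, [x.imag*z.real - x.real*z.imag, x.real*z.real + x.imag*z.imag] and the final swap-and-negate turns that into conj(x)·z. A scalar sketch of the same reduction, assuming dcomplex is a struct with double real/imag members as the field accesses above imply (this reference version is for comparison only and is not from the source):

/* Scalar reference: rho = sum_i conj(x[i]) * z[i]. */
dcomplex zdotc_ref( int n, const dcomplex *x, int incx,
                    const dcomplex *z, int incz )
{
    dcomplex rho = { 0.0, 0.0 };
    int i;
    for ( i = 0; i < n; ++i )
    {
        rho.real += x->real * z->real + x->imag * z->imag;
        rho.imag += x->real * z->imag - x->imag * z->real;
        x += incx;
        z += incz;
    }
    return rho;
}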
// from Intel's sample intrin_double_sample.c
void multiply_SSE3(double xr, double xi, double yr, double yi,
                   complex_num *z)
{
    __m128d num1, num2, num3;

    // Duplicates lower vector element into upper vector element.
    // num1: [x.real, x.real]
    num1 = _mm_loaddup_pd(&xr);

    // Move y elements into a vector
    // num2: [y.img, y.real]
    num2 = _mm_set_pd(yi, yr);

    // Multiplies vector elements
    // num3: [(x.real*y.img), (x.real*y.real)]
    num3 = _mm_mul_pd(num2, num1);

    // num1: [x.img, x.img]
    num1 = _mm_loaddup_pd(&xi);

    // Swaps the vector elements
    // num2: [y.real, y.img]
    num2 = _mm_shuffle_pd(num2, num2, 1);

    // num2: [(x.img*y.real), (x.img*y.img)]
    num2 = _mm_mul_pd(num2, num1);

    // Adds upper vector element while subtracting lower vector element
    // num3: [((x.real*y.img)+(x.img*y.real)),
    //        ((x.real*y.real)-(x.img*y.img))]
    num3 = _mm_addsub_pd(num3, num2);

    // Stores the elements of num3 into z
    _mm_storeu_pd((double *)z, num3);
}
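A small driver makes the calling convention concrete. The complex_num type is not shown in the excerpt; here it is assumed to be a plain struct of two doubles laid out real-first, which is what the final _mm_storeu_pd into (double *)z implies. The driver itself is hypothetical, not part of Intel's sample.

/* Hypothetical driver; complex_num assumed to be { double real, img; }. */
#include <stdio.h>
#include <pmmintrin.h>

typedef struct { double real, img; } complex_num;

void multiply_SSE3(double xr, double xi, double yr, double yi, complex_num *z);

int main(void)
{
    complex_num z;
    multiply_SSE3(1.0, 2.0, 3.0, 4.0, &z);   /* (1 + 2i) * (3 + 4i) */
    printf("%g %+gi\n", z.real, z.img);      /* expected: -5 +10i */
    return 0;
}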
__m128d test_mm_addsub_pd(__m128d A, __m128d B) {
  // CHECK-LABEL: test_mm_addsub_pd
  // CHECK: call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
  return _mm_addsub_pd(A, B);
}
__m128d test_mm_addsub_pd(__m128d A, __m128d B) {
  // CHECK-LABEL: test_mm_addsub_pd
  // CHECK: call <2 x double> @llvm.x86.sse3.addsub.pd
  // CHECK-ASM: addsubpd %xmm{{.*}}, %xmm{{.*}}
  return _mm_addsub_pd(A, B);
}
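The clang tests above only verify that the intrinsic lowers to the llvm.x86.sse3.addsub.pd intrinsic and the addsubpd instruction; the instruction's semantics (subtract in the low lane, add in the high lane) can be checked with a tiny program. This sketch is illustrative and is not part of the clang test suite.

#include <stdio.h>
#include <pmmintrin.h>

int main(void)
{
    __m128d a = _mm_set_pd(2.0, 1.0);     /* high = 2,  low = 1  */
    __m128d b = _mm_set_pd(20.0, 10.0);   /* high = 20, low = 10 */
    double r[2];
    _mm_storeu_pd(r, _mm_addsub_pd(a, b));
    printf("low = %g, high = %g\n", r[0], r[1]);  /* low = 1-10 = -9, high = 2+20 = 22 */
    return 0;
}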
/*
 * Subtract off x0 & x1 contribution to all remaining equations using a
 * rank-2 update with mu=2, nu=3, ku=2. This version is for 16 SSE regs.
 * nu is the # of RHS, ku is the number of equations solved, and mu is
 * unrolled only to enable vectorization & software pipelining of load/use.
 * Loop order is MKN, so that B is kept completely in registers, and
 * C and A are streamed in (and out, for C) from cache during the operation.
 */
ATL_SINLINE void ATL_rk2(ATL_CINT M, const TYPE *pA0, const TYPE *pA1,
                         const TYPE *pB0, const TYPE *pB1, TYPE *C,
                         ATL_CINT ldc0)
{
   ATL_CINT ldc = ldc0+ldc0;
   TYPE *pC0 = C, *pC1 = C+ldc, *pC2 = C+((ldc)<<1);
   ATL_INT i;
   ATL_CINT MM = (M&1) ? M-1 : M-2;
   register __m128d B00, B10, B01, B11, B02, B12;
   register __m128d C00, C01, C02, C10, C11, C12;
   register __m128d A, a;

   B00 = _mm_load_pd(pB0);
   B10 = _mm_load_pd(pB1);
   B01 = _mm_load_pd(pB0+2);
   B11 = _mm_load_pd(pB1+2);
   B02 = _mm_load_pd(pB0+4);
   B12 = _mm_load_pd(pB1+4);                             /* iB12, rB12 */
   C00 = _mm_load_pd(pC0);
   C01 = _mm_load_pd(pC1);
   C02 = _mm_load_pd(pC2);
   A = _mm_load_pd(pA0);                                 /* iA00, rA00 */
   for (i=0; i < MM; i += 2, pA0 += 4, pA1 += 4, pC0 += 4, pC1 += 4, pC2 += 4)
   {
      register __m128d b;
/*
 *    K=0, M=[0,1], apply real components of B0x
 */
      b = _mm_movedup_pd(B00);                           /* rB00, rB00 */
      b = _mm_mul_pd(b, A);                              /* iA00*rB00, rA00*rB00 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  /* rA00, iA00 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      C10 = _mm_load_pd(pC0+2);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
      A = _mm_load_pd(pA1);                              /* iA01, rA01 */
/*
 *    K=0, M=0, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE);   /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                              /* rA00*iB00, iA00*iB00 */
      C00 = _mm_addsub_pd(C00, b);
      C11 = _mm_load_pd(pC1+2);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
      C12 = _mm_load_pd(pC2+2);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
/*
 *    K=1, M=0, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);                           /* rB10, rB10 */
      b = _mm_mul_pd(b, A);                              /* iA01*rB10, rA01*rB10 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  /* rA01, iA01 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
      A = _mm_load_pd(pA0+2);                            /* iA10, rA10 */
/*
 *    K=1, M=0, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE);   /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                              /* rA01*iB10, iA01*iB10 */
      C00 = _mm_addsub_pd(C00, b);
      _mm_store_pd(pC0, C00);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
      _mm_store_pd(pC1, C01);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
      _mm_store_pd(pC2, C02);
/*
 *    K=0, M=1, apply real components of B0x
 */
      b = _mm_movedup_pd(B00);                           /* rB00, rB00 */
      b = _mm_mul_pd(b, A);                              /* iA10*rB00, rA10*rB00 */
      C10 = _mm_add_pd(C10, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  /* rA10, iA10 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C11 = _mm_add_pd(C11, b);
      C00 = _mm_load_pd(pC0+4);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C12 = _mm_add_pd(C12, b);
      A = _mm_load_pd(pA1+2);                            /* iA11, rA11 */
/*
 *    K=0, M=1, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE);   /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                              /* rA10*iB00, iA10*iB00 */
      C10 = _mm_addsub_pd(C10, b);
      C01 = _mm_load_pd(pC1+4);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C11 = _mm_addsub_pd(C11, b);
      C02 = _mm_load_pd(pC2+4);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C12 = _mm_addsub_pd(C12, b);
/*
 *    K=1, M=1, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);                           /* rB10, rB10 */
      b = _mm_mul_pd(b, A);                              /* iA11*rB10, rA11*rB10 */
      C10 = _mm_add_pd(C10, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  /* rA11, iA11 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C11 = _mm_add_pd(C11, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C12 = _mm_add_pd(C12, b);
      A = _mm_load_pd(pA0+4);                            /* iA20, rA20 */
/*
 *    K=1, M=1, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE);   /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                              /* rA11*iB10, iA11*iB10 */
      C10 = _mm_addsub_pd(C10, b);
      _mm_store_pd(pC0+2, C10);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C11 = _mm_addsub_pd(C11, b);
      _mm_store_pd(pC1+2, C11);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C12 = _mm_addsub_pd(C12, b);
      _mm_store_pd(pC2+2, C12);
   }
/*
 * Drain pipes
 */
   {
      register __m128d b;
/*
 *    K=0, M=[0,1], apply real components of B0x
 */
      b = _mm_movedup_pd(B00);                           /* rB00, rB00 */
      b = _mm_mul_pd(b, A);                              /* iA00*rB00, rA00*rB00 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  /* rA00, iA00 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
      A = _mm_load_pd(pA1);                              /* iA01, rA01 */
/*
 *    K=0, M=0, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE);   /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                              /* rA00*iB00, iA00*iB00 */
      C00 = _mm_addsub_pd(C00, b);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
/*
 *    K=1, M=0, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);                           /* rB10, rB10 */
      b = _mm_mul_pd(b, A);                              /* iA01*rB10, rA01*rB10 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  /* rA01, iA01 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
/*
 *    K=1, M=0, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE);   /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                              /* rA01*iB10, iA01*iB10 */
      C00 = _mm_addsub_pd(C00, b);
      _mm_store_pd(pC0, C00);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
      _mm_store_pd(pC1, C01);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
      _mm_store_pd(pC2, C02);
      if (!(M&1))
      {
         C10 = _mm_load_pd(pC0+2);
         C11 = _mm_load_pd(pC1+2);
         C12 = _mm_load_pd(pC2+2);
         A = _mm_load_pd(pA0+2);                         /* iA10, rA10 */
/*
 *       K=0, M=1, apply real components of B0x
 */
         b = _mm_movedup_pd(B00);                        /* rB00, rB00 */
         b = _mm_mul_pd(b, A);                           /* iA10*rB00, rA10*rB00 */
         C10 = _mm_add_pd(C10, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  /* rA10, iA10 */
         b = _mm_movedup_pd(B01);
         b = _mm_mul_pd(b, A);
         C11 = _mm_add_pd(C11, b);
         b = _mm_movedup_pd(B02);
         b = _mm_mul_pd(b, A);
         C12 = _mm_add_pd(C12, b);
         A = _mm_load_pd(pA1+2);                         /* iA11, rA11 */
/*
 *       K=0, M=1, apply imaginary components of B0x
 */
         b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE);   /* iB00, iB00 */
         b = _mm_mul_pd(b, a);                           /* rA10*iB00, iA10*iB00 */
         C10 = _mm_addsub_pd(C10, b);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
         b = _mm_mul_pd(b, a);
         C11 = _mm_addsub_pd(C11, b);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
         b = _mm_mul_pd(b, a);
         C12 = _mm_addsub_pd(C12, b);
/*
 *       K=1, M=1, apply real components of B1x
 */
         b = _mm_movedup_pd(B10);                        /* rB10, rB10 */
         b = _mm_mul_pd(b, A);                           /* iA11*rB10, rA11*rB10 */
         C10 = _mm_add_pd(C10, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  /* rA11, iA11 */
         b = _mm_movedup_pd(B11);
         b = _mm_mul_pd(b, A);
         C11 = _mm_add_pd(C11, b);
         b = _mm_movedup_pd(B12);
         b = _mm_mul_pd(b, A);
         C12 = _mm_add_pd(C12, b);
/*
 *       K=1, M=1, apply imaginary components of B1x
 */
         b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE);   /* iB10, iB10 */
         b = _mm_mul_pd(b, a);                           /* rA11*iB10, iA11*iB10 */
         C10 = _mm_addsub_pd(C10, b);
         _mm_store_pd(pC0+2, C10);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
         b = _mm_mul_pd(b, a);
         C11 = _mm_addsub_pd(C11, b);
         _mm_store_pd(pC1+2, C11);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
         b = _mm_mul_pd(b, a);
         C12 = _mm_addsub_pd(C12, b);
         _mm_store_pd(pC2+2, C12);
      }
   }
}
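For orientation, a scalar sketch of the rank-2 update performed above may help: each C(i,j) accumulates A0(i)*B0(j) + A1(i)*B1(j) in complex arithmetic, with nu=3 right-hand sides held in B00..B12. The types and layout below are assumptions for illustration (TYPE taken as double, complex values stored as adjacent real/imag pairs), and any sign flip implied by "subtract off" is presumed to be folded into B by the caller; this reference is not part of ATLAS.

/* Scalar reference for the rank-2 update: C(:,j) += A0(:)*B0(j) + A1(:)*B1(j). */
static void rk2_ref(int M, const double *pA0, const double *pA1,
                    const double *pB0, const double *pB1,
                    double *C, int ldc0)
{
    int i, j;
    for (j = 0; j < 3; j++)            /* nu = 3 right-hand sides */
    {
        double *pC = C + 2*ldc0*j;     /* ldc0 counted in complex elements */
        for (i = 0; i < M; i++)
        {
            double rA0 = pA0[2*i], iA0 = pA0[2*i+1];
            double rA1 = pA1[2*i], iA1 = pA1[2*i+1];
            double rB0 = pB0[2*j], iB0 = pB0[2*j+1];
            double rB1 = pB1[2*j], iB1 = pB1[2*j+1];
            pC[2*i]   += rA0*rB0 - iA0*iB0 + rA1*rB1 - iA1*iB1;  /* real part */
            pC[2*i+1] += rA0*iB0 + iA0*rB0 + rA1*iB1 + iA1*rB1;  /* imag part */
        }
    }
}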